In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

start = time.time()

# --- Load Data ---
stations_data = pd.read_csv('../data/processed_data/CREMP_Stations_2023.csv')
temperatures_data = pd.read_csv('../data/processed_data/CREMP_Temperatures_2023.csv')

# --- Downcast Numeric Columns ---
for df in [stations_data, temperatures_data]:
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, downcast='float')

# --- Convert all object columns to category ---
for df in [stations_data, temperatures_data]:
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')

# --- Impute Missing Values ---
imputer_numeric = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

for df in [stations_data, temperatures_data]:
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(include=['category']).columns
    df[numeric_cols] = imputer_numeric.fit_transform(df[numeric_cols])
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# --- Encode Categoricals Safely ---
label_encoder = LabelEncoder()
for df in [stations_data, temperatures_data]:
    cat_cols = df.select_dtypes(include=['category']).columns
    for col in cat_cols:
        df[col] = label_encoder.fit_transform(df[col].astype(str))

# --- Merge Datasets ---
merged_data = pd.merge(stations_data, temperatures_data, on='SiteID', how='inner')

# --- Optional: Sample for Speed ---
merged_data = merged_data.sample(frac=0.5, random_state=42)

# --- Drop Irrelevant or Problematic Columns ---
drop_cols = ['OID_', 'Site_name', 'StationID', 'latDD', 'lonDD', 
             'latDeg', 'latMin', 'lonDeg', 'lonMin', 'row_index']
X = merged_data.drop(columns=['TempC'] + drop_cols, errors='ignore')
y = merged_data['TempC']

# --- Ensure all features are numeric ---
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    print(f"Dropping non-numeric columns: {list(non_numeric_cols)}")
    X = X.drop(columns=non_numeric_cols)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Random Forest Model ---
rf_model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Results:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf)):.4f}")
print(f"R²: {r2_score(y_test, y_pred_rf):.4f}\n")

# --- XGBoost Model ---
xgb_model = XGBRegressor(n_estimators=50, learning_rate=0.1, max_depth=4,
                         subsample=0.8, colsample_bytree=0.8,
                         random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Results:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_xgb):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_xgb)):.4f}")
print(f"R²: {r2_score(y_test, y_pred_xgb):.4f}\n")

print(f"⏱ Total runtime: {time.time() - start:.2f} seconds")


Dropping non-numeric columns: ['Region', 'Site_Code', 'Site_name_x', 'Habitat', 'Subregion', 'Site_name_y']
Random Forest Results:
MAE: 0.1059
RMSE: 0.1460
R²: 0.9976

XGBoost Results:
MAE: 0.0717
RMSE: 0.1316
R²: 0.9980

⏱ Total runtime: 1451.83 seconds
