In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import shap

In [27]:
# Notebook-wide runtime configuration.
# Install a blanket "ignore" filter so no warning category is shown from here on.
warnings.simplefilter('ignore')
# Switch on scikit-learn's metadata-routing API for every estimator used below.
set_config(enable_metadata_routing=True)

In [28]:
# Read the pre-processed (v5) train and test tables; train is read first, then test.
train_df_final, test_df_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/processed/train_df_final_v5.csv',
        './input/processed/test_df_final_v5.csv',
    )
)

In [29]:
# Preview the first five training rows (rendered as the cell's output).
train_df_final.head(n=5)

Unnamed: 0,id,climatology_temp,target,elev,lat,lon,doy_sin,doy_cos,day_of_year,day_length,...,lat_weighted_doy,lat_season_residual,cos_lat_x_day,sin_lat_x_day,temp_anomaly_cos_doy,residual_anomaly,scaled_residual_anomaly,residual_percentile_x_scaled,dp_evening_gauss,column_energy_pca
0,0,-2.707143,-3.992857,115.62,37.9019,127.0607,0.017213,0.999852,1,9.426812,...,0.103841,36.902048,0.789064,0.614311,-2.875764,-0.169048,-0.426195,-0.345253,-2.114475,-22.454044
1,1,-3.646429,-1.653571,115.62,37.9019,127.0607,0.034422,0.999407,2,9.437325,...,0.207682,36.902493,1.578127,1.228623,-2.114817,1.530357,3.083018,2.643526,-2.072166,-23.980122
2,2,-2.694643,-0.005357,115.62,37.9019,127.0607,0.05162,0.998667,3,9.448699,...,0.311522,36.903233,2.367191,1.842934,-2.622689,0.068452,0.159551,0.130331,-2.328495,-25.886439
3,3,-2.501786,-0.898214,115.62,37.9019,127.0607,0.068802,0.99763,4,9.460926,...,0.415363,36.90427,3.156255,2.457245,-1.415685,1.082738,1.840773,1.543743,-0.929263,-16.235648
4,4,-2.625,-1.775,115.62,37.9019,127.0607,0.085965,0.996298,5,9.473997,...,0.519204,36.905602,3.945319,3.071557,-0.763829,1.858333,2.773454,2.36373,-2.580673,-28.857901


# Dataset Splitting

In [30]:
# Model inputs: every training column except the row identifier and the label.
non_feature_cols = {'id', 'target'}
features = [col for col in train_df_final.columns if col not in non_feature_cols]

# Plain NumPy arrays for the estimators.
X = train_df_final[features].to_numpy()
y = train_df_final['target'].to_numpy()
X_test = test_df_final[features].to_numpy()

# Hold out 20% of the training rows for validation (fixed seed => repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

# Standardisation statistics are learned from the training portion only and then
# re-used unchanged for the validation and test matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(matrix) for matrix in (X_train, X_val, X_test)
)

# Fit

In [31]:
# Hard-coded XGBoost hyper-parameters (presumably the result of an earlier
# hyper-parameter search that is not part of this section -- TODO confirm origin).
best_params_xgb = {
    'n_estimators': 6883,
    'learning_rate': 0.020419270915512017,
    'max_depth': 5,
    'subsample': 0.898780629687246,
    'colsample_bytree': 0.5642327039349159,
    'min_child_weight': 0.1765471216265702,
    'gamma': 0.0005651238065833414,
    'reg_alpha': 0.4660141792358717,
    'reg_lambda': 2.5927370390857117e-06,
}

# Level-0 (base) learners.
xgb_model = XGBRegressor(**best_params_xgb)
# lgb_model = LGBMRegressor(n_estimators=5000, learning_rate=0.01, max_depth=5, random_state=32)
ridge_model = Ridge(alpha=1.0)

# Reuse `ridge_model` instead of building a second, identical Ridge inline
# (the original left `ridge_model` unused). StackingRegressor fits clones of the
# listed estimators, so `xgb_model` / `ridge_model` themselves stay unfitted.
base_models = [
    ('xgb', xgb_model),
    ('ridge', ridge_model),
]

# Level-1 (meta) learner that combines the base-model predictions.
meta_model = SVR(kernel='rbf', C=10, epsilon=0.1)

stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,  # meta-model also sees the (scaled) input features
    cv=5,              # 5-fold cross-validated predictions feed the meta-model
    n_jobs=-1,
)

stacked_model.fit(X_train_scaled, y_train)

# Score the ensemble on the held-out validation split.
y_pred = stacked_model.predict(X_val_scaled)
r2 = r2_score(y_val, y_pred)
rmse = root_mean_squared_error(y_val, y_pred)
print(f"\n✅ Stacked R²: {r2:.4f} | RMSE: {rmse:.4f}")


✅ Stacked R²: 0.7934 | RMSE: 1.3485


# Residuals

In [32]:
# NOTE: this whole cell is commented out and therefore has no effect when run.
# It sketches an alternative pipeline: rank features by mean |SHAP| from an
# XGBoost fit, keep the top 50, refit XGBoost on them, then fit a Ridge model on
# the training residuals and add both predictions on the validation split.
# NOTE(review): as written it would fail if re-enabled -- X_train / X_val / X_test
# are NumPy arrays (built with `.values`), so indexing them with a list of column
# names (`X_train[top_50_features]`) is not valid; positional indices or
# DataFrames would be needed.
# NOTE(review): the residual Ridge is fitted on in-sample residuals of `xgb_top`;
# confirm this is intended before reviving the experiment.
# xgb = XGBRegressor(**best_params_xgb)
# xgb.fit(X_train, y_train)
#
# explainer = shap.Explainer(xgb, X_train)
# shap_values = explainer(X_train)
#
# mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
# shap_scores = pd.Series(mean_abs_shap, index=features)
# top_50_features = shap_scores.sort_values(ascending=False).head(50).index.tolist()
#
# X_train_top = X_train[top_50_features]
# X_val_top   = X_val[top_50_features]
# X_test_top  = X_test[top_50_features]
#
# xgb_top = XGBRegressor(**best_params_xgb)
# xgb_top.fit(X_train_top, y_train)
#
# residuals = y_train - xgb_top.predict(X_train_top)
# ridge_residual = Ridge(alpha=1.0)
# ridge_residual.fit(X_train_top, residuals)
#
# y_val_pred = xgb_top.predict(X_val_top) + ridge_residual.predict(X_val_top)
# r2 = r2_score(y_val, y_val_pred)
# rmse = root_mean_squared_error(y_val, y_val_pred)
# print(f"\n✅ Final Ensemble R²: {r2:.4f} | RMSE: {rmse:.4f}")

# Predict with Test Dataset

In [33]:
# NOTE: this cell is commented out, so no test predictions or submission file
# are produced by the notebook in its current state.
# NOTE(review): if re-enabled unchanged it relies on `ridge_residual`, which only
# exists in the (also disabled) residual experiment, where it is fitted on the
# residuals of `xgb_top` using the unscaled top-50 feature subset. Calling it on
# `X_test_scaled` (all features, standardised) would mismatch that input and
# would yield residual corrections only, not full target predictions.
# Presumably `stacked_model.predict(X_test_scaled)` is what should be submitted --
# TODO confirm.
# y_test_pred = ridge_residual.predict(X_test_scaled)
#
# submission = pd.DataFrame({
#     "id": test_df_final["id"],
#     "target": y_test_pred
# })
#
# now = datetime.now().strftime("%Y%m%d_%H%M%S")
#
# submission.to_csv(f'./output/submission_{now}.csv', index=False)