In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import shap

In [2]:
warnings.filterwarnings('ignore')
set_config(enable_metadata_routing=True)

In [3]:
train_df_final = pd.read_csv('./input/processed/train_df_final_v5.csv')
test_df_final = pd.read_csv('./input/processed/test_df_final_v5.csv')

In [4]:
train_df_final.head()

Unnamed: 0,id,climatology_temp,target,elev,lat,lon,doy_sin,doy_cos,day_of_year,day_length,...,lat_lon_pressure_combo,humid_cloud_stress_alt,cloud_pressure_ratio,dew_depression_chain,humidity_chain_avg,min_cloud_h_x_lat,cloud_amp_morning,dp_morning_x_climatology,afternoon_night_slope,humidity_gradient_evening_morning
0,0,-2.707143,-3.992857,115.62,37.9019,127.0607,0.017213,0.999852,1,9.426812,...,4970671.0,0.0,0.0,1525.255111,44.666667,2996.30312,317.333333,45.209286,-2.125507,-20.5
1,1,-3.646429,-1.653571,115.62,37.9019,127.0607,0.034422,0.999407,2,9.437325,...,4979019.0,0.0,0.0,1760.479259,41.611111,18096.754686,0.0,60.530714,-7.123462,-19.333333
2,2,-2.694643,-0.005357,115.62,37.9019,127.0607,0.05162,0.998667,3,9.448699,...,4984316.0,0.0,0.0,2136.333444,37.722222,18218.298354,0.0,42.260982,-8.811602,-30.333333
3,3,-2.501786,-0.898214,115.62,37.9019,127.0607,0.068802,0.99763,4,9.460926,...,4959113.0,13.090675,0.001621,244.496389,54.055556,2746.624353,76.888889,36.77625,-1.776812,10.666667
4,4,-2.625,-1.775,115.62,37.9019,127.0607,0.085965,0.996298,5,9.473997,...,4947154.0,0.0,0.0,2944.845037,31.944444,11554.745725,0.0,38.15,-22.541723,-20.5


# Dataset Splitting

In [5]:
scaler = StandardScaler()

features = [col for col in train_df_final.columns if col not in ['id', 'target']]

X = train_df_final[features].values
y = train_df_final['target'].values
X_test = test_df_final[features].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=32)

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Fit

In [6]:
best_params_xgb = {'n_estimators': 7686, 'learning_rate': 0.027367337212669172, 'max_depth': 5, 'subsample': 0.5959315110309751, 'colsample_bytree': 0.901155651474499, 'min_child_weight': 0.0072824350924899, 'gamma': 5.136565597520639e-08, 'reg_alpha': 0.00036520349232711184, 'reg_lambda': 4.6466639093441024}


meta_model = MLPRegressor(
    hidden_layer_sizes=(96, 48),
    activation='relu',
    alpha=1e-4,
    learning_rate='adaptive',
    learning_rate_init=0.005,
    early_stopping=True,
    max_iter=400,
    random_state=42
)

# Base models: deep + wide + smooth
base_models = [
    ('xgb', XGBRegressor(
        **best_params_xgb, n_jobs=-1, tree_method='hist', random_state=42)),
    ('lgb', LGBMRegressor(
        n_estimators=4000, learning_rate=0.008, max_depth=5,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42)),
    ('svr', SVR(C=15, epsilon=0.03, kernel='rbf')),
    ('ridge', RidgeCV(alphas=[0.1, 0.3, 1.0, 3.0]))
]

# Final stacker
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

# Fit and evaluate
stacked_model.fit(X_train_scaled, y_train)
y_pred = stacked_model.predict(X_val_scaled)

r2 = r2_score(y_val, y_pred)
rmse = root_mean_squared_error(y_val, y_pred)
print(f"\n✅ Stacked R²: {r2:.4f} | RMSE: {rmse:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29227
[LightGBM] [Info] Number of data points in the train set: 10505, number of used features: 129
[LightGBM] [Info] Start training from score 0.235319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if m
















✅ Stacked R²: 0.8293 | RMSE: 1.2256


# Residuals

In [7]:
# xgb = XGBRegressor(**best_params_xgb)
# xgb.fit(X_train, y_train)
#
# explainer = shap.Explainer(xgb, X_train)
# shap_values = explainer(X_train)
#
# mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
# shap_scores = pd.Series(mean_abs_shap, index=features)
# top_50_features = shap_scores.sort_values(ascending=False).head(50).index.tolist()
#
# X_train_top = X_train[top_50_features]
# X_val_top   = X_val[top_50_features]
# X_test_top  = X_test[top_50_features]
#
# xgb_top = XGBRegressor(**best_params_xgb)
# xgb_top.fit(X_train_top, y_train)
#
# residuals = y_train - xgb_top.predict(X_train_top)
# ridge_residual = Ridge(alpha=1.0)
# ridge_residual.fit(X_train_top, residuals)
#
# y_val_pred = xgb_top.predict(X_val_top) + ridge_residual.predict(X_val_top)
# r2 = r2_score(y_val, y_val_pred)
# rmse = root_mean_squared_error(y_val, y_val_pred)
# print(f"\n✅ Final Ensemble R²: {r2:.4f} | RMSE: {rmse:.4f}")

# Predict with Test Dataset

In [8]:
# y_test_pred = ridge_residual.predict(X_test_scaled)
#
# submission = pd.DataFrame({
#     "id": test_df_final["id"],
#     "target": y_test_pred
# })
#
# now = datetime.now().strftime("%Y%m%d_%H%M%S")
#
# submission.to_csv(f'./output/submission_{now}.csv', index=False)