## Setup

Load libraries:

In [1]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the project-local `snowmodels` package be imported
# in the next cell — confirm against the repository layout.
sys.path.append('..')

In [38]:
import torch
import pickle
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from category_encoders import TargetEncoder
from snowmodels.utils import compare_multiple_models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


# Choose the device string passed to XGBoost: "cuda" when torch reports a usable GPU, else "cpu".
if torch.cuda.is_available():
    xgb_device = "cuda"
else:
    xgb_device = "cpu"

print(f"XGBoost will run on {xgb_device}")

XGBoost will run on cuda


## Load Dataset

In [7]:
# Read the pickled train/test splits from disk.
# pickle can execute arbitrary code while loading, so only open files from a trusted source.
with open('../data/data_splits.pkl', 'rb') as splits_file:
    data_splits = pickle.load(splits_file)

In [8]:
# Features: the 'X_temp' entry holds the train + val instances, 'X_test' the held-out test set.
X_train_raw, X_test_raw = data_splits['X_temp'], data_splits['X_test']
# Targets matching the two feature sets above.
y_train, y_test = data_splits['y_temp'], data_splits['y_test']

In [10]:
# Numerical predictors for the two feature sets; the climate set is the main set plus
# the two 14-day lag columns.
main_numeric_cols = ['Elevation', 'Snow_Depth', 'DOWY']
climate_numeric_cols = main_numeric_cols + ['TAVG_lag_14d', 'PRECIPITATION_lag_14d']

# Target-encode the categorical 'Snow_Class' column.
encoder = TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)

# The encoder is fitted on the training split only; the test split is just transformed.
X_cat_train = encoder.fit_transform(X_train_raw['Snow_Class'], y_train)
X_cat_test = encoder.transform(X_test_raw['Snow_Class'])

# Combine the numerical features with the encoded category, column-wise.
X_train_main = pd.concat([X_train_raw[main_numeric_cols], X_cat_train], axis=1)
X_train_climate = pd.concat([X_train_raw[climate_numeric_cols], X_cat_train], axis=1)

X_test_main = pd.concat([X_test_raw[main_numeric_cols], X_cat_test], axis=1)
X_test_climate = pd.concat([X_test_raw[climate_numeric_cols], X_cat_test], axis=1)

### Extra Trees

In [23]:
# Extra Trees hyperparameters (the variable name indicates they come from an Optuna search).
ext_hyp_from_optuna = {
    'n_estimators': 120,
    'max_depth': None,
    'min_samples_split': 14,
    'min_samples_leaf': 1,
}

# Fixed seed for repeatable trees; n_jobs=-1 uses all available cores.
ext_trees_main = ExtraTreesRegressor(
    random_state=42,
    n_jobs=-1,
    **ext_hyp_from_optuna,
)

# Fit on the main feature set, then predict on the test set.
y_pred_ext_main = ext_trees_main.fit(X_train_main, y_train).predict(X_test_main)

### Random Forest

In [None]:
# NOTE(review): this unexecuted cell lists 'max_depth': 25, while the Random Forest cell that
# follows trains with 'max_depth': None — confirm which value is intended.
{'n_estimators': 438,
  'max_depth': 25,
  'min_samples_split': 9,
  'min_samples_leaf': 3}

In [29]:
# Random Forest hyperparameters (the variable name indicates they come from an Optuna search).
# NOTE(review): the unexecuted cell just above lists 'max_depth': 25 rather than None — confirm.
rf_hyp_from_optuna = {
    'n_estimators': 438,
    'max_depth': None,
    'min_samples_split': 9,
    'min_samples_leaf': 3,
}

# Fixed seed for repeatable trees; n_jobs=-1 uses all available cores.
rf_trees_main = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    **rf_hyp_from_optuna,
)

# Fit on the main feature set, then predict on the test set.
y_pred_rf_main = rf_trees_main.fit(X_train_main, y_train).predict(X_test_main)

### LightGBM

In [None]:
# LightGBM hyperparameters (the variable name indicates they come from an Optuna search).
params_from_optuna = {
    'objective': 'regression',
    'random_state': 42,
    'learning_rate': 0.13597606223994746,
    'num_leaves': 185,
    'min_child_weight': 20.535024680037342,
    'min_child_samples': 22,
    'max_bin': 764,
    'max_depth': 20,
    'bagging_fraction': 1,
}

# Training set built from the main feature frame and its targets.
train_data = lgb.Dataset(X_train_main, label=y_train)

# Train for a fixed number of boosting rounds.
lgbm_main = lgb.train(
    params=params_from_optuna,
    train_set=train_data,
    num_boost_round=1500,  # best iteration from Optuna
)

y_pred_lgbm_main = lgbm_main.predict(X_test_main)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1246
[LightGBM] [Info] Number of data points in the train set: 2178049, number of used features: 4
[LightGBM] [Info] Start training from score 0.302427


### XGBoost

In [35]:
# XGBoost documents gradient-based sampling as supported only with tree_method='hist' on a
# CUDA device. `xgb_device` can fall back to "cpu", so use the default uniform sampling there
# instead of passing a parameter combination the CPU path does not support.
xgb_sampling_method = 'gradient_based' if xgb_device == 'cuda' else 'uniform'

# XGBoost hyperparameters (the variable name indicates they come from an Optuna search).
xgb_params_from_optuna={
    'objective': 'reg:squarederror',
    'random_state':42,
    'learning_rate': 0.012571635462866944,
    'gamma': 1.8331574770840668e-06,
    'min_child_weight': 12.338531236617445,
    'max_bin': 3350,
    'max_depth': 17,
    'subsample': 1,  # ratio of rows sampled per round; 1 keeps every row
    'device': xgb_device,
    'tree_method': 'hist',
    'sampling_method': xgb_sampling_method
}

# Wrap the main feature frames in XGBoost's DMatrix container; only the training one carries labels.
dtrain = xgb.DMatrix(X_train_main, label=y_train)
dtest = xgb.DMatrix(X_test_main)

# Train for a fixed number of boosting rounds.
xgb_main = xgb.train(
    xgb_params_from_optuna,
    dtrain,
    num_boost_round=1498, # best iteration from Optuna
)

y_pred_xgb_main = xgb_main.predict(dtest)

## Model Comparison

* Density

In [37]:
# Gather every model's test-set density predictions, then append the true values as the last column.
density_columns = {
    'EXT_Density': y_pred_ext_main,
    'RF_Density': y_pred_rf_main,
    'LGBM_Density': y_pred_lgbm_main,
    'XGB_Density': y_pred_xgb_main,
}
density_columns['True_Density'] = y_test.values

main_model_results = pd.DataFrame(density_columns)

main_model_results.head()

Unnamed: 0,EXT_Density,RF_Density,LGBM_Density,XGB_Density,True_Density
0,0.205936,0.187287,0.20334,0.198362,0.190909
1,0.310802,0.298136,0.301686,0.309763,0.319048
2,0.229451,0.222444,0.227774,0.226534,0.209524
3,0.409145,0.407753,0.415537,0.410843,0.450649
4,0.270602,0.278175,0.259346,0.269569,0.273913


In [39]:
# Score each prediction column against the 'True_Density' column using the project helper from
# snowmodels.utils; the displayed table has RMSE, MBE and RSQ rows, one column per model.
compare_multiple_models(main_model_results, 'True_Density')

Unnamed: 0,EXT_Density,RF_Density,LGBM_Density,XGB_Density
RMSE,0.045778,0.043575,0.043445,0.042854
MBE,0.000134,0.000175,0.000104,0.00012
RSQ,0.752531,0.775775,0.777112,0.783138


* SWE

In [42]:
# SWE for each model is its predicted density multiplied by the test-set snow depth.
swe_columns = {
    f'{model}_SWE': main_model_results[f'{model}_Density'].values * X_test_main.Snow_Depth.values
    for model in ('EXT', 'RF', 'LGBM', 'XGB')
}
# The reference SWE comes straight from the raw test frame.
swe_columns['True_SWE'] = X_test_raw.SWE.values

main_model_swe_results = pd.DataFrame(swe_columns)

main_model_swe_results.head()

Unnamed: 0,EXT_SWE,RF_SWE,LGBM_SWE,XGB_SWE,True_SWE
0,17.261557,15.69842,17.043941,16.62668,16.002
1,16.578168,15.902558,16.091957,16.522734,17.018
2,24.477876,23.730374,24.298943,24.166656,22.352
3,80.020597,79.748333,81.270656,80.352758,88.138
4,15.808558,16.25098,15.150965,15.748223,16.002


In [43]:
# Score each SWE column against the 'True_SWE' column using the project helper from
# snowmodels.utils; the displayed table has RMSE, MBE and RSQ rows, one column per model.
compare_multiple_models(main_model_swe_results, 'True_SWE')

Unnamed: 0,EXT_SWE,RF_SWE,LGBM_SWE,XGB_SWE
RMSE,4.419633,4.141154,4.132439,4.065121
MBE,0.013464,0.01798,0.008473,0.012441
RSQ,0.975537,0.978523,0.978613,0.979304


* XGBoost for the win: it has the lowest RMSE and the highest R² of the four models for both density and SWE.

In [20]:
from sklearn.metrics import root_mean_squared_error as rmse, r2_score

In [24]:
# Test-set RMSE and R² for the Extra Trees density predictions.
ext_rmse = rmse(y_true=y_test, y_pred=y_pred_ext_main)
ext_r2 = r2_score(y_true=y_test, y_pred=y_pred_ext_main)
print(f"RMSE: {ext_rmse}\nR²: {ext_r2}")

RMSE: 0.04577819968806074
R²: 0.7525312813368727


In [30]:
# Test-set RMSE and R² for the Random Forest density predictions.
rf_rmse = rmse(y_true=y_test, y_pred=y_pred_rf_main)
rf_r2 = r2_score(y_true=y_test, y_pred=y_pred_rf_main)
print(f"RMSE: {rf_rmse}\nR²: {rf_r2}")

RMSE: 0.043575322947233416
R²: 0.775774954883345


In [34]:
# Test-set RMSE and R² for the LightGBM density predictions.
lgbm_rmse = rmse(y_true=y_test, y_pred=y_pred_lgbm_main)
lgbm_r2 = r2_score(y_true=y_test, y_pred=y_pred_lgbm_main)
print(f"RMSE: {lgbm_rmse}\nR²: {lgbm_r2}")

RMSE: 0.043445239409770284
R²: 0.7771116954106951


In [36]:
# Test-set RMSE and R² for the XGBoost density predictions.
xgb_rmse = rmse(y_true=y_test, y_pred=y_pred_xgb_main)
xgb_r2 = r2_score(y_true=y_test, y_pred=y_pred_xgb_main)
print(f"RMSE: {xgb_rmse}\nR²: {xgb_r2}")

RMSE: 0.04285386266912323
R²: 0.7831383098463478
