## Setup

Load libraries:

In [85]:
import os
import torch
import pickle
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from category_encoders import TargetEncoder
from snowmodels.utils import compare_multiple_models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


# Pick the device XGBoost trains on: the GPU when PyTorch reports that CUDA
# is available, the CPU otherwise.
if torch.cuda.is_available():
    xgb_device = "cuda"
else:
    xgb_device = "cpu"

print(f"XGBoost will run on {xgb_device}")

XGBoost will run on cuda


## Load Dataset

In [7]:
# Read the pre-computed data splits from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load split files that were produced by this project.
with open('../data/data_splits.pkl', 'rb') as splits_file:
    data_splits = pickle.load(splits_file)

In [8]:
# The 'temp' split holds the train + validation instances; it is used as the
# training set here, and the 'test' split is held out for evaluation.
X_train_raw, y_train = data_splits['X_temp'], data_splits['y_temp']
X_test_raw, y_test = data_splits['X_test'], data_splits['y_test']

In [None]:
# Target-encode the categorical snow class. The encoder is fitted on the
# training rows only and then applied unchanged to the test rows.
encoder = TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)

X_cat_train = encoder.fit_transform(X_train_raw['Snow_Class'], y_train)
X_cat_test = encoder.transform(X_test_raw['Snow_Class'])

# Baseline feature matrices: numerical predictors first, encoded snow class last.
numeric_cols = ['Elevation', 'Snow_Depth', 'DOWY']
X_train_main = pd.concat([X_train_raw[numeric_cols], X_cat_train], axis=1)
X_test_main = pd.concat([X_test_raw[numeric_cols], X_cat_test], axis=1)

In [63]:
# Column groups for the climate-enhanced models, declared once so the train
# and test pipelines cannot drift apart.
CLIMATE_LAG_FEATURES = ['TAVG_lag_14d', 'PRECIPITATION_lag_14d']
CLIMATE_NUMERIC_FEATURES = ['Elevation', 'Snow_Depth', 'DOWY'] + CLIMATE_LAG_FEATURES


def _build_climate_frame(X_raw, y):
    """Return the rows of ``X_raw`` that can be used by the climate models.

    Keeps the numeric climate features plus ``Snow_Class`` and ``SWE``,
    attaches ``y`` as a ``Snow_Density`` column (aligned on the index), and
    drops every row where either 14-day lag feature is missing. The original
    index is preserved.
    """
    return (
        X_raw
        .filter(items=CLIMATE_NUMERIC_FEATURES + ['Snow_Class', 'SWE'])
        .assign(Snow_Density=y)
        .dropna(subset=CLIMATE_LAG_FEATURES)
    )


climate_train = _build_climate_frame(X_train_raw, y_train)
climate_test = _build_climate_frame(X_test_raw, y_test)

# Targets restricted to the rows that survived the dropna above.
y_train_climate = climate_train.Snow_Density
y_test_climate = climate_test.Snow_Density

# A separate encoder is fitted because the climate training rows are a subset
# of the baseline training rows.
encoder_climate = TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)

X_cat_train_climate = encoder_climate.fit_transform(climate_train['Snow_Class'], y_train_climate)
X_cat_test_climate = encoder_climate.transform(climate_test['Snow_Class'])

# Climate feature matrices: numeric features first, encoded snow class last.
X_train_climate = pd.concat([climate_train[CLIMATE_NUMERIC_FEATURES], X_cat_train_climate], axis=1)
X_test_climate = pd.concat([climate_test[CLIMATE_NUMERIC_FEATURES], X_cat_test_climate], axis=1)

### Extra Trees

In [23]:
# Extra-Trees hyper-parameters (per the variable name, taken from an Optuna study).
ext_hyp_from_optuna = dict(
    n_estimators=120,
    max_depth=None,
    min_samples_split=14,
    min_samples_leaf=1,
)

# Fixed seed for reproducibility; n_jobs=-1 lets scikit-learn use all cores.
ext_trees_main = ExtraTreesRegressor(random_state=42, n_jobs=-1, **ext_hyp_from_optuna)
ext_trees_main.fit(X_train_main, y_train)

# Baseline density predictions on the held-out test set.
y_pred_ext_main = ext_trees_main.predict(X_test_main)

### Random Forest

In [29]:
# Random-Forest hyper-parameters (per the variable name, taken from an Optuna study).
rf_hyp_from_optuna = dict(
    n_estimators=438,
    max_depth=None,
    min_samples_split=9,
    min_samples_leaf=3,
)

# Fixed seed for reproducibility; n_jobs=-1 lets scikit-learn use all cores.
rf_trees_main = RandomForestRegressor(random_state=42, n_jobs=-1, **rf_hyp_from_optuna)
rf_trees_main.fit(X_train_main, y_train)

# Baseline density predictions on the held-out test set.
y_pred_rf_main = rf_trees_main.predict(X_test_main)

### LightGBM

In [None]:
# LightGBM booster parameters for the baseline (non-climate) feature set.
params_from_optuna = dict(
    objective='regression',
    random_state=42,
    learning_rate=0.13597606223994746,
    num_leaves=185,
    min_child_weight=20.535024680037342,
    min_child_samples=22,
    max_bin=764,
    max_depth=20,
    bagging_fraction=1,
)

train_data = lgb.Dataset(X_train_main, label=y_train)

# Train for a fixed number of rounds; no validation set or early stopping is used.
lgbm_main = lgb.train(
    params=params_from_optuna,
    train_set=train_data,
    num_boost_round=1500,  # best iteration from Optuna
)

# Baseline density predictions on the held-out test set.
y_pred_lgbm_main = lgbm_main.predict(X_test_main)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1246
[LightGBM] [Info] Number of data points in the train set: 2178049, number of used features: 4
[LightGBM] [Info] Start training from score 0.302427


### XGBoost

In [35]:
# XGBoost booster parameters for the baseline (non-climate) feature set.
xgb_params_from_optuna={
    'objective': 'reg:squarederror',
    'random_state':42,
    'learning_rate': 0.012571635462866944,
    'gamma': 1.8331574770840668e-06,
    'min_child_weight': 12.338531236617445,
    'max_bin': 3350,
    'max_depth': 17,
    'subsample': 1,
    'device': xgb_device,
    'tree_method': 'hist',
    # The XGBoost docs list gradient-based sampling as supported only with
    # tree_method='hist' on a CUDA device, so fall back to uniform sampling
    # when running on the CPU. With subsample=1 no rows are dropped either way.
    'sampling_method': 'gradient_based' if xgb_device == 'cuda' else 'uniform'
}

# Only the training matrix carries labels; the test matrix is used for prediction.
dtrain = xgb.DMatrix(X_train_main, label=y_train)
dtest = xgb.DMatrix(X_test_main)

# Train for a fixed number of rounds; no evaluation set or early stopping is used.
xgb_main = xgb.train(
    xgb_params_from_optuna,
    dtrain,
    num_boost_round=1498, # best iteration from Optuna
)

# Baseline density predictions on the held-out test set.
y_pred_xgb_main = xgb_main.predict(dtest)

## Model Comparison

* Density

In [37]:
# One column of baseline density predictions per model, plus the ground truth.
# y_test.values discards the original index, so the frame gets a fresh
# positional index (0..n-1).
main_model_results = pd.DataFrame(
    dict(
        EXT_Density=y_pred_ext_main,
        RF_Density=y_pred_rf_main,
        LGBM_Density=y_pred_lgbm_main,
        XGB_Density=y_pred_xgb_main,
        True_Density=y_test.values,
    )
)

main_model_results.head()

Unnamed: 0,EXT_Density,RF_Density,LGBM_Density,XGB_Density,True_Density
0,0.205936,0.187287,0.20334,0.198362,0.190909
1,0.310802,0.298136,0.301686,0.309763,0.319048
2,0.229451,0.222444,0.227774,0.226534,0.209524
3,0.409145,0.407753,0.415537,0.410843,0.450649
4,0.270602,0.278175,0.259346,0.269569,0.273913


In [39]:
# Score every baseline prediction column against the 'True_Density' column;
# the table rendered by this cell reports RMSE, MBE and RSQ per model.
compare_multiple_models(main_model_results, 'True_Density')

Unnamed: 0,EXT_Density,RF_Density,LGBM_Density,XGB_Density
RMSE,0.045778,0.043575,0.043445,0.042854
MBE,0.000134,0.000175,0.000104,0.00012
RSQ,0.752531,0.775775,0.777112,0.783138


* SWE (computed as predicted density × snow depth)

In [42]:
# Convert each model's predicted density to SWE by multiplying it with the
# snow depth of the same test row. Plain arrays are used throughout, so the
# rows are matched by position.
snow_depth_test = X_test_main['Snow_Depth'].values

main_model_swe_results = pd.DataFrame({
    'EXT_SWE': main_model_results['EXT_Density'].values * snow_depth_test,
    'RF_SWE': main_model_results['RF_Density'].values * snow_depth_test,
    'LGBM_SWE': main_model_results['LGBM_Density'].values * snow_depth_test,
    'XGB_SWE': main_model_results['XGB_Density'].values * snow_depth_test,
    'True_SWE': X_test_raw['SWE'].values,
})

main_model_swe_results.head()

Unnamed: 0,EXT_SWE,RF_SWE,LGBM_SWE,XGB_SWE,True_SWE
0,17.261557,15.69842,17.043941,16.62668,16.002
1,16.578168,15.902558,16.091957,16.522734,17.018
2,24.477876,23.730374,24.298943,24.166656,22.352
3,80.020597,79.748333,81.270656,80.352758,88.138
4,15.808558,16.25098,15.150965,15.748223,16.002


In [43]:
# Score every baseline SWE column against the 'True_SWE' column; the table
# rendered by this cell reports RMSE, MBE and RSQ per model.
compare_multiple_models(main_model_swe_results, 'True_SWE')

Unnamed: 0,EXT_SWE,RF_SWE,LGBM_SWE,XGB_SWE
RMSE,4.419633,4.141154,4.132439,4.065121
MBE,0.013464,0.01798,0.008473,0.012441
RSQ,0.975537,0.978523,0.978613,0.979304


* XGBoost for the win: it has the lowest RMSE and the highest R² for both density and SWE, although its margin over LightGBM and Random Forest is small.

## Adding Climatology

### Extra Trees

In [67]:
# NOTE(review): unlike the other models in this notebook, no tuned
# hyper-parameters are passed here, so scikit-learn's defaults are used.
# Confirm this is intentional.
ext_trees_climate = ExtraTreesRegressor(n_jobs=-1, random_state=42)
ext_trees_climate.fit(X_train_climate, y_train_climate)

# Climate-enhanced density predictions on the held-out test set.
y_pred_ext_climate = ext_trees_climate.predict(X_test_climate)

### Random Forest

In [69]:
# Random-Forest hyper-parameters for the climate feature set
# (per the variable name, taken from an Optuna study).
rf_hyp_from_optuna_climate = dict(
    n_estimators=412,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

# Fixed seed for reproducibility; n_jobs=-1 lets scikit-learn use all cores.
rf_trees_climate = RandomForestRegressor(random_state=42, n_jobs=-1, **rf_hyp_from_optuna_climate)
rf_trees_climate.fit(X_train_climate, y_train_climate)

# Climate-enhanced density predictions on the held-out test set.
y_pred_rf_climate = rf_trees_climate.predict(X_test_climate)

### LightGBM

In [72]:
# LightGBM booster parameters for the climate-enhanced feature set.
params_from_optuna_climate = dict(
    objective='regression',
    random_state=42,
    learning_rate=0.28206964529038514,
    num_leaves=166,
    min_child_weight=29.631721026875745,
    min_child_samples=25,
    max_bin=3128,
    max_depth=15,
    bagging_fraction=1,
)

train_data_climate = lgb.Dataset(X_train_climate, label=y_train_climate)

# Train for a fixed number of rounds; no validation set or early stopping is used.
# NOTE(review): the round count equals the baseline LightGBM model's; confirm
# it was re-tuned for the climate feature set.
lgbm_climate = lgb.train(
    params=params_from_optuna_climate,
    train_set=train_data_climate,
    num_boost_round=1500,  # best iteration from Optuna
)

# Climate-enhanced density predictions on the held-out test set.
y_pred_lgbm_climate = lgbm_climate.predict(X_test_climate)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5377
[LightGBM] [Info] Number of data points in the train set: 2123709, number of used features: 6
[LightGBM] [Info] Start training from score 0.302762


### XGBoost

In [74]:
# XGBoost booster parameters for the climate-enhanced feature set.
xgb_params_from_optun_climate={
    'objective': 'reg:squarederror',
    'random_state':42,
    'learning_rate': 0.08039528136679829,
    'gamma': 7.210110116555584e-07,
    'min_child_weight': 2.8200619931422226,
    'max_bin': 5817,
    'max_depth': 18,
    'subsample': 1,
    'device': xgb_device,
    'tree_method': 'hist',
    # The XGBoost docs list gradient-based sampling as supported only with
    # tree_method='hist' on a CUDA device, so fall back to uniform sampling
    # when running on the CPU. With subsample=1 no rows are dropped either way.
    'sampling_method': 'gradient_based' if xgb_device == 'cuda' else 'uniform'
}

# Only the training matrix carries labels; the test matrix is used for prediction.
dtrain_climate = xgb.DMatrix(X_train_climate, label=y_train_climate)
dtest_climate = xgb.DMatrix(X_test_climate)

# Train for a fixed number of rounds; no evaluation set or early stopping is used.
# NOTE(review): the round count equals the baseline XGBoost model's; confirm
# it was re-tuned for the climate feature set.
xgb_climate = xgb.train(
    xgb_params_from_optun_climate,
    dtrain_climate,
    num_boost_round=1498, # best iteration from Optuna
)

# Climate-enhanced density predictions on the held-out test set.
y_pred_xgb_climate = xgb_climate.predict(dtest_climate)

## Model Comparison (Climate-Enhanced Models)

In [77]:
# One column of climate-enhanced density predictions per model, plus the
# ground truth. The frame reuses climate_test's index, so each row stays
# traceable to its original test instance.
climate_density_results = pd.DataFrame(
    dict(
        EXT_Density_climate=y_pred_ext_climate,
        RF_Density_climate=y_pred_rf_climate,
        LGBM_Density_climate=y_pred_lgbm_climate,
        XGB_Density_climate=y_pred_xgb_climate,
        True_Density=y_test_climate.values,
    ),
    index=climate_test.index,
)

climate_density_results.head()

Unnamed: 0,EXT_Density_climate,RF_Density_climate,LGBM_Density_climate,XGB_Density_climate,True_Density
3999061,0.211203,0.206897,0.204054,0.1883,0.190909
6799768,0.286014,0.285066,0.304897,0.298061,0.319048
6324785,0.221815,0.223069,0.231225,0.225484,0.209524
3223533,0.443653,0.446124,0.416292,0.452195,0.450649
5933277,0.267886,0.272453,0.265592,0.278727,0.273913


In [80]:
# Convert each model's predicted density to SWE by multiplying it with the
# snow depth of the same test row. Plain arrays are multiplied by position,
# then the frame is re-indexed with climate_test's index.
snow_depth_climate = X_test_climate['Snow_Depth'].values

climate_swe_results = pd.DataFrame(
    {
        'True_SWE': climate_test['SWE'].values,
        'EXT_SWE_climate': climate_density_results['EXT_Density_climate'].values * snow_depth_climate,
        'RF_SWE_climate': climate_density_results['RF_Density_climate'].values * snow_depth_climate,
        'LGBM_SWE_climate': climate_density_results['LGBM_Density_climate'].values * snow_depth_climate,
        'XGB_SWE_climate': climate_density_results['XGB_Density_climate'].values * snow_depth_climate,
    },
    index=climate_test.index,
)

climate_swe_results.head()

Unnamed: 0,True_SWE,EXT_SWE_climate,RF_SWE_climate,LGBM_SWE_climate,XGB_SWE_climate
3999061,16.002,17.703037,17.342088,17.103823,15.7833
6799768,17.018,15.255989,15.205407,16.26322,15.898551
6324785,22.352,23.663232,23.796979,24.667089,24.054614
3223533,88.138,86.769575,87.252916,81.418468,88.440389
5933277,16.002,15.649891,15.916689,15.515871,16.283207


In [81]:
# Score every climate-enhanced density column against 'True_Density'; the
# table rendered by this cell reports RMSE, MBE and RSQ per model.
compare_multiple_models(climate_density_results, 'True_Density')

Unnamed: 0,EXT_Density_climate,RF_Density_climate,LGBM_Density_climate,XGB_Density_climate
RMSE,0.037842,0.036007,0.037948,0.032268
MBE,0.000386,0.000349,0.000109,0.000144
RSQ,0.830659,0.846687,0.829713,0.876874


In [84]:
# Score every climate-enhanced SWE column against 'True_SWE'; the table
# rendered by this cell reports RMSE, MBE and RSQ per model.
compare_multiple_models(climate_swe_results, 'True_SWE')

Unnamed: 0,EXT_SWE_climate,RF_SWE_climate,LGBM_SWE_climate,XGB_SWE_climate
RMSE,3.429781,3.252548,3.463989,2.830543
MBE,0.024725,0.018747,0.006377,0.00446
RSQ,0.985151,0.986646,0.984854,0.989887


### XGBoost Wins!

We'll save the weights of both XGBoost models (baseline and climate-enhanced) and their target encoders for later use.

In [86]:
# Make sure the folder that receives the model weights and encoders exists.
weights_dir = '../ML_weights'

# The message is informational only; creation below is safe either way.
if os.path.isdir(weights_dir):
    print('Folder already exists')

# exist_ok=True avoids the check-then-create race of testing os.path.exists
# first, and raises FileExistsError if the path is taken by a regular file
# instead of silently reporting that the folder exists.
os.makedirs(weights_dir, exist_ok=True)

In [87]:
# Persist both XGBoost boosters to the weights folder.
for booster, weights_path in (
    (xgb_main, "../ML_weights/main_density_model.ubj"),        # Non climate
    (xgb_climate, "../ML_weights/climate_density_model.ubj"),  # Climate Enhanced
):
    booster.save_model(weights_path)

In [92]:
# Save the encoders to a file
with open('../ML_weights/encoder_climate.pkl', 'wb') as f:
    pickle.dump(encoder_climate, f)

with open('../ML_weights/encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

### Store Predictions

In [90]:
# to_parquet does not create missing parent directories, so make sure the
# output folders exist before writing.
for results_dir in ('../results/density', '../results/swe'):
    os.makedirs(results_dir, exist_ok=True)

# Store the baseline predictions only: the ground-truth column is dropped and
# the positional index is not written.
main_model_results.drop('True_Density', axis=1).to_parquet('../results/density/ML_models_predictions_baseline.parquet', index=False, compression='gzip')
main_model_swe_results.drop('True_SWE', axis=1).to_parquet('../results/swe/ML_models_predictions_baseline.parquet', index=False, compression='gzip')

In [91]:
# to_parquet does not create missing parent directories, so make sure the
# output folders exist before writing.
for results_dir in ('../results/density', '../results/swe'):
    os.makedirs(results_dir, exist_ok=True)

# Store the climate-enhanced predictions only (ground-truth column dropped).
# The index is written because it identifies which test rows survived the
# missing-value filter applied to the climate features.
climate_density_results.drop('True_Density', axis=1).to_parquet('../results/density/ML_models_predictions_climate.parquet', index=True, compression='gzip')
climate_swe_results.drop('True_SWE', axis=1).to_parquet('../results/swe/ML_models_predictions_climate.parquet', index=True, compression='gzip')

# The End!