## Setup

Load libraries

In [1]:
import sys

# Put the parent directory on the import path; presumably this is where the
# local `snowmodels` package imported in the next cell lives — TODO confirm.
sys.path.append('..')

In [9]:
import pickle
import optuna
import pandas as pd
import category_encoders as ce
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from snowmodels.utils import DefaultTuner, ComprehensiveOptimizer
from sklearn.metrics import root_mean_squared_error as rmse, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor


# Ask scikit-learn transformers to return pandas DataFrames from transform().
set_config(transform_output="pandas")
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
# Read the pickled data splits from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('../data/data_splits.pkl', mode='rb') as splits_file:
    data_splits = pickle.load(splits_file)

## Default Hyperparameters

We'll start by running all four models using their default hyperparameters. This is essential to see whether tuning yields any performance gains.

In [4]:
# Pull the training and validation features/targets out of the loaded splits.
X_train, X_val, y_train, y_val = (
    data_splits[split_name]
    for split_name in ('X_train', 'X_val', 'y_train', 'y_val')
)

In [7]:
# Run the baseline models with their default parameters so the tuned models
# further down have a reference point to beat.
default_tuner = DefaultTuner(X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val)
baseline_results = default_tuner.run_default_models()


Running baseline models with default parameters...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 738
[LightGBM] [Info] Number of data points in the train set: 1905792, number of used features: 9
[LightGBM] [Info] Start training from score 0.302447
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	train's rmse: 0.0483899	valid's rmse: 0.0488552

Results with onehot encoder:
LightGBM - R²: 0.7199, RMSE: 0.0489, Best iteration: 1000
XGBoost - R²: 0.7362, RMSE: 0.0474, Best iteration: 999
ExtraTrees - R²: 0.6734, RMSE: 0.0528
RandomForest - R²: 0.7430, RMSE: 0.0468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004291 seconds.
You can set `force_row_wise=true` to remove the over

## Naive Optimization

The purpose of this optimization is to get an idea of how many trees I can add to Random Forest and Extra Trees, using their respective default configurations, before running out of memory. This will let me know the number of trees to set in Optuna.

In [13]:
# One-hot encode the categorical snow class; categories not seen during fit
# are encoded as all zeros at transform time (handle_unknown='ignore').
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat_train = encoder.fit_transform(X_train[['Snow_Class']], y_train)
X_cat_val = encoder.transform(X_val[['Snow_Class']])

# Place the raw numeric columns side by side with the encoded columns.
numeric_columns = ['Elevation', 'Snow_Depth', 'DOWY']
X_train_naive = pd.concat([X_train[numeric_columns], X_cat_train], axis=1)
X_val_naive = pd.concat([X_val[numeric_columns], X_cat_val], axis=1)

In [14]:
# Sweep the number of trees for a default-configured Random Forest and report
# the validation RMSE at each step.
#
# The forest is grown incrementally: with warm_start=True each fit() call keeps
# the trees that already exist and only trains the additional ones needed to
# reach the new n_estimators. This avoids refitting the whole ensemble from
# scratch at every step (2750 trees in total) and trains 500 trees overall.
# NOTE(review): with a fixed integer random_state scikit-learn advances the RNG
# past the existing trees, so the scores should match a from-scratch fit —
# verify against the previously recorded RMSEs.
reg = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    warm_start=True,
)

for n_estimator in range(50, 501, 50):

    # Raise the target ensemble size; fit() then adds only the missing trees.
    reg.set_params(n_estimators=n_estimator)
    reg.fit(X_train_naive, y_train)

    # Evaluate the model on the validation set
    y_pred = reg.predict(X_val_naive)
    metric = rmse(y_true=y_val, y_pred=y_pred)

    print(f"n_estimators={n_estimator}, RMSE={metric}")

n_estimators=50, RMSE=0.046943563335131715
n_estimators=100, RMSE=0.04680121866936983
n_estimators=150, RMSE=0.046758008528995476
n_estimators=200, RMSE=0.0467264596041954
n_estimators=250, RMSE=0.046704439602444255
n_estimators=300, RMSE=0.04669342015651978
n_estimators=350, RMSE=0.046689157467007525
n_estimators=400, RMSE=0.046681818054988125
n_estimators=450, RMSE=0.046674048829216046
n_estimators=500, RMSE=0.04666916706196425


In [16]:
# Sweep the number of trees for a default-configured Extra Trees model and
# report the validation RMSE at each step.
#
# The ensemble is grown incrementally: with warm_start=True each fit() call
# keeps the trees that already exist and only trains the additional ones needed
# to reach the new n_estimators. This avoids refitting the whole ensemble from
# scratch at every step (2750 trees in total) and trains 500 trees overall.
# NOTE(review): with a fixed integer random_state scikit-learn advances the RNG
# past the existing trees, so the scores should match a from-scratch fit —
# verify against the previously recorded RMSEs.
reg = ExtraTreesRegressor(
    random_state=42,
    n_jobs=-1,
    warm_start=True,
)

for n_estimator in range(50, 501, 50):

    # Raise the target ensemble size; fit() then adds only the missing trees.
    reg.set_params(n_estimators=n_estimator)
    reg.fit(X_train_naive, y_train)

    # Evaluate the model on the validation set
    y_pred = reg.predict(X_val_naive)
    metric = rmse(y_true=y_val, y_pred=y_pred)

    print(f"n_estimators={n_estimator}, RMSE={metric}")

n_estimators=50, RMSE=0.052902247257859095
n_estimators=100, RMSE=0.052759299444451756
n_estimators=150, RMSE=0.052684150637674676
n_estimators=200, RMSE=0.0526639749637763
n_estimators=250, RMSE=0.052642302071688206
n_estimators=300, RMSE=0.052634427853188545
n_estimators=350, RMSE=0.05263895213312667
n_estimators=400, RMSE=0.052631978998939524
n_estimators=450, RMSE=0.05263304578955727
n_estimators=500, RMSE=0.052635069851071296


## Comprehensive Tuning with Optuna

* Extra Trees

In [None]:
# Tune Extra Trees with Optuna, 100 trials per search.
optimizer = ComprehensiveOptimizer(
    model_name='extra_trees',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)
results = optimizer.optimize(n_trials=100)


Starting optimization for extra_trees
Number of trials: 100
Storage: sqlite:///optuna.db

Optimizing extra_trees with onehot encoder...



Optimizing extra_trees with catboost encoder...

Optimizing extra_trees with target encoder...

Optimization Summary
Best configuration: extra_trees_target
Best RMSE: 0.0473
Best R² score: 0.7378


In [7]:
# Inspect the stored entry for the configuration the optimizer reported as
# best (Extra Trees with the target encoder).
results['extra_trees_target']

{'best_params': {'n_estimators': 120,
  'max_depth': 25,
  'min_samples_split': 14,
  'min_samples_leaf': 1},
 'best_rmse': 0.047268989809986284,
 'best_r2': 0.7378081744757925,
 'study': <optuna.study.study.Study at 0x7108fd3e0d90>}

Based on Optuna (see the dashboard), `max_depth` is the most important hyperparameter. Using this optimal configuration, let's see if increasing `max_depth` gives a better validation RMSE.

In [5]:
# Target-encode the categorical snow class, fitting on the training targets.
encoder = ce.TargetEncoder(cols=['Snow_Class'], smoothing=10, min_samples_leaf=20)
X_cat_train = encoder.fit_transform(X_train['Snow_Class'], y_train)
X_cat_val = encoder.transform(X_val['Snow_Class'])

# Combine with numerical features
numeric_columns = ['Elevation', 'Snow_Depth', 'DOWY']
X_train_depth = pd.concat([X_train[numeric_columns], X_cat_train], axis=1)
X_val_depth = pd.concat([X_val[numeric_columns], X_cat_val], axis=1)

In [7]:
# Refit Extra Trees with the best parameters reported by the Optuna search,
# except that the depth cap is removed.
model_ext_depth = ExtraTreesRegressor(
    n_estimators=120,
    max_depth=None,  # no depth limit: trees grow unrestricted
    min_samples_split=14,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

model_ext_depth.fit(X_train_depth, y_train)

In [12]:
# Score the unrestricted-depth model on the validation split.
pred_depth = model_ext_depth.predict(X_val_depth)
rmse_depth = rmse(y_true=y_val, y_pred=pred_depth)
r2_depth = r2_score(y_true=y_val, y_pred=pred_depth)
print(f"RMSE: {rmse_depth}\nR²: {r2_depth}")

RMSE: 0.046216106775682594
R²: 0.749358361434133


* Random Forest

In [None]:
# Tune Random Forest with Optuna, 100 trials per search.
optimizer = ComprehensiveOptimizer(
    model_name='random_forest',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)
results = optimizer.optimize(n_trials=100)