## Setup

Load libraries

In [2]:
import pickle
import optuna
import warnings
import pandas as pd
import lightgbm as lgb
import category_encoders as ce
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error as rmse, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from snowmodels.utils import DefaultTuner, ComprehensiveOptimizer, ClimateOptimizer


set_config(transform_output="pandas")
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
with open('../data/data_splits.pkl', 'rb') as f:
    data_splits = pickle.load(f)

## Default Hyperparameters

We'll start by running all four models using their default hyperparamaters. This is essential to see if there is any performance gains from tuning.

In [4]:
X_train = data_splits['X_train']
X_val = data_splits['X_val']
y_train = data_splits['y_train']
y_val = data_splits['y_val']

In [7]:
default_tuner = DefaultTuner(
        X_train=X_train,
        X_val=X_val,
        y_train=y_train,
        y_val=y_val
    )

baseline_results = default_tuner.run_default_models()


Running baseline models with default parameters...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 738
[LightGBM] [Info] Number of data points in the train set: 1905792, number of used features: 9
[LightGBM] [Info] Start training from score 0.302447
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	train's rmse: 0.0483899	valid's rmse: 0.0488552

Results with onehot encoder:
LightGBM - R²: 0.7199, RMSE: 0.0489, Best iteration: 1000
XGBoost - R²: 0.7362, RMSE: 0.0474, Best iteration: 999
ExtraTrees - R²: 0.6734, RMSE: 0.0528
RandomForest - R²: 0.7430, RMSE: 0.0468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004291 seconds.
You can set `force_row_wise=true` to remove the over

## Naive Optimization

The purpose of this optimization is to have an idea of how far I can go with tree addition to Random Forest and Extra Trees before running out of memory using their respective default configurations. This will allow me know the number of trees to set in optuna.

In [13]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_cat_train = encoder.fit_transform(X_train[['Snow_Class']],y_train)
X_cat_val = encoder.transform(X_val[['Snow_Class']])


X_train_naive = pd.concat([X_train[['Elevation', 'Snow_Depth', 'DOWY']], X_cat_train], axis=1)
X_val_naive = pd.concat([X_val[['Elevation', 'Snow_Depth', 'DOWY']], X_cat_val], axis=1)

In [14]:
for n_estimator in range(50, 501, 50):

    reg = RandomForestRegressor(
        n_estimators=n_estimator,
        random_state=42,
        n_jobs=-1,
    )

    reg.fit(X_train_naive, y_train)

    # Evaluate the model on the validation set
    y_pred = reg.predict(X_val_naive)
    metric = rmse(y_true=y_val, y_pred=y_pred)

    print(f"n_estimators={n_estimator}, RMSE={metric}")

n_estimators=50, RMSE=0.046943563335131715
n_estimators=100, RMSE=0.04680121866936983
n_estimators=150, RMSE=0.046758008528995476
n_estimators=200, RMSE=0.0467264596041954
n_estimators=250, RMSE=0.046704439602444255
n_estimators=300, RMSE=0.04669342015651978
n_estimators=350, RMSE=0.046689157467007525
n_estimators=400, RMSE=0.046681818054988125
n_estimators=450, RMSE=0.046674048829216046
n_estimators=500, RMSE=0.04666916706196425


In [16]:
for n_estimator in range(50, 501, 50):

    reg = ExtraTreesRegressor(
        n_estimators=n_estimator,
        random_state=42,
        n_jobs=-1,
    )

    reg.fit(X_train_naive, y_train)

    # Evaluate the model on the validation set
    y_pred = reg.predict(X_val_naive)
    metric = rmse(y_true=y_val, y_pred=y_pred)

    print(f"n_estimators={n_estimator}, RMSE={metric}")

n_estimators=50, RMSE=0.052902247257859095
n_estimators=100, RMSE=0.052759299444451756
n_estimators=150, RMSE=0.052684150637674676
n_estimators=200, RMSE=0.0526639749637763
n_estimators=250, RMSE=0.052642302071688206
n_estimators=300, RMSE=0.052634427853188545
n_estimators=350, RMSE=0.05263895213312667
n_estimators=400, RMSE=0.052631978998939524
n_estimators=450, RMSE=0.05263304578955727
n_estimators=500, RMSE=0.052635069851071296


## Comprehensive Tuning with Optuna

* Extra Trees

In [None]:
optimizer = ComprehensiveOptimizer(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    model_name='extra_trees'  
)

results = optimizer.optimize(n_trials=100)


Starting optimization for extra_trees
Number of trials: 100
Storage: sqlite:///optuna.db

Optimizing extra_trees with onehot encoder...



Optimizing extra_trees with catboost encoder...

Optimizing extra_trees with target encoder...

Optimization Summary
Best configuration: extra_trees_target
Best RMSE: 0.0473
Best R² score: 0.7378


In [7]:
results['extra_trees_target']

{'best_params': {'n_estimators': 120,
  'max_depth': 25,
  'min_samples_split': 14,
  'min_samples_leaf': 1},
 'best_rmse': 0.047268989809986284,
 'best_r2': 0.7378081744757925,
 'study': <optuna.study.study.Study at 0x7108fd3e0d90>}

Based on Optuna (see the dashboard), `max_depth` is the most important hyperparameter. Using this optimal configuration, let's see if increasing `max_depth` give a beter validation RMSE.

In [5]:
encoder = ce.TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)

X_cat_train = encoder.fit_transform(X_train['Snow_Class'], y_train)
X_cat_val = encoder.transform(X_val['Snow_Class'])

# Combine with numerical features
X_train_depth = pd.concat([X_train[['Elevation', 'Snow_Depth', 'DOWY']], X_cat_train], axis=1)
X_val_depth = pd.concat([X_val[['Elevation', 'Snow_Depth', 'DOWY']], X_cat_val], axis=1)

In [7]:
model_ext_depth = ExtraTreesRegressor(
    random_state=42, n_estimators=120, 
    min_samples_leaf=1, min_samples_split=14, 
    max_depth=None, # tree grows unrestricted
    n_jobs=-1
)

model_ext_depth.fit(X_train_depth, y_train)

In [12]:
pred_depth = model_ext_depth.predict(X_val_depth)
print(f"RMSE: {rmse(y_true=y_val, y_pred=pred_depth)}\nR²: {r2_score(y_true=y_val, y_pred=pred_depth)}")

RMSE: 0.046216106775682594
R²: 0.749358361434133


* Random Forest

In [5]:
optimizer = ComprehensiveOptimizer(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    model_name='random_forest' 
)

results = optimizer.optimize(n_trials=100)


Starting optimization for random_forest
Number of trials: 100
Storage: sqlite:///optuna.db

Optimizing random_forest with onehot encoder...

Optimizing random_forest with catboost encoder...

Optimizing random_forest with target encoder...

Optimization Summary
Best configuration: random_forest_target
Best RMSE: 0.0440
Best R² score: 0.7730


In [7]:
results['random_forest_target']

{'best_params': {'n_estimators': 438,
  'max_depth': 25,
  'min_samples_split': 9,
  'min_samples_leaf': 3},
 'best_rmse': 0.043987044176718317,
 'best_r2': 0.7729528451480241,
 'study': <optuna.study.study.Study at 0x791f3d57db50>}

Same behavior was observed for RF - `max_depth` is the most important hyperparameter.

In [9]:
model_ext_depth = RandomForestRegressor(
    random_state=42, n_estimators=438, 
    min_samples_leaf=3, min_samples_split=9, 
    max_depth=None, # tree grows unrestricted
    n_jobs=-1
)

model_ext_depth.fit(X_train_depth, y_train)

In [10]:
pred_depth = model_ext_depth.predict(X_val_depth)
print(f"RMSE: {rmse(y_true=y_val, y_pred=pred_depth)}\nR²: {r2_score(y_true=y_val, y_pred=pred_depth)}")

RMSE: 0.04380915614554091
R²: 0.7747855349131157


* LightGBM

In [5]:
optimizer = ComprehensiveOptimizer(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    model_name='lightgbm' 
)

results = optimizer.optimize(n_trials=100)


Starting optimization for lightgbm
Number of trials: 100
Storage: sqlite:///optuna.db

Optimizing lightgbm with onehot encoder...


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 roundsTraining until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds

Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validatio


Optimizing lightgbm with catboost encoder...


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validatio


Optimizing lightgbm with target encoder...


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validatio


Optimization Summary
Best configuration: lightgbm_target
Best RMSE: 0.0435
Best R² score: 0.7776


In [None]:
results['lightgbm_target']['best_params']

{'learning_rate': 0.13597606223994746,
 'num_leaves': 185,
 'min_child_weight': 20.535024680037342,
 'min_child_samples': 22,
 'max_bin': 764,
 'max_depth': 20}

* XGBoost

In [5]:
optimizer = ComprehensiveOptimizer(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    model_name='xgboost' 
)

results = optimizer.optimize(n_trials=100)


Starting optimization for xgboost
Number of trials: 100
Storage: sqlite:///optuna.db

Optimizing xgboost with onehot encoder...



Optimizing xgboost with catboost encoder...

Optimizing xgboost with target encoder...

Optimization Summary
Best configuration: xgboost_target
Best RMSE: 0.0430
Best R² score: 0.7829


XGBoost slightly outperformed other regressors and target encoding yielded better results.

In [None]:
results['xgboost_target']['best_params']

{'gamma': 1.8331574770840668e-06,
 'learning_rate': 0.012571635462866944,
 'max_bin': 3350,
 'max_depth': 17,
 'min_child_weight': 12.338531236617445}

## Adding Climatology

Which lag to include?

In [13]:
(
    X_train
    .assign(Snow_Density=y_train)
    .filter(items=['PRECIPITATION_lag_7d', 'PRECIPITATION_lag_14d', 'TAVG_lag_7d', 'TAVG_lag_14d', 'Snow_Density'])
    .corr()
    .round(3)
)

Unnamed: 0,PRECIPITATION_lag_7d,PRECIPITATION_lag_14d,TAVG_lag_7d,TAVG_lag_14d,Snow_Density
PRECIPITATION_lag_7d,1.0,0.835,0.009,0.054,0.015
PRECIPITATION_lag_14d,0.835,1.0,0.033,0.056,0.084
TAVG_lag_7d,0.009,0.033,1.0,0.921,0.606
TAVG_lag_14d,0.054,0.056,0.921,1.0,0.596
Snow_Density,0.015,0.084,0.606,0.596,1.0


Precipitation at lag 14 has a higher correlation with $\rho_s$ compared to lag 7. In terms of temperature, they are identical. Let's check the performance on a tuning set. Extra Trees is faster to train, so, we'll experiment with Extra Trees.

In [21]:
train_data_clim = (
    X_train
    .filter(items=['Elevation', 'Snow_Depth', 'DOWY', 'TAVG_lag_7d', 'TAVG_lag_14d', 'PRECIPITATION_lag_7d', 'PRECIPITATION_lag_14d', 'Snow_Class'])
    .assign(Snow_Density=y_train)
)

val_data_clim = (
    X_val
    .filter(items=['Elevation', 'Snow_Depth', 'DOWY', 'TAVG_lag_7d', 'TAVG_lag_14d', 'PRECIPITATION_lag_7d', 'PRECIPITATION_lag_14d', 'Snow_Class'])
    .assign(Snow_Density=y_val)
)

val_data_clim.head()

Unnamed: 0,Elevation,Snow_Depth,DOWY,TAVG_lag_7d,TAVG_lag_14d,PRECIPITATION_lag_7d,PRECIPITATION_lag_14d,Snow_Class,Snow_Density
6661564,2072.64,129.54,189,-16.626984,-17.051587,0.399143,0.290286,Alpine,0.380392
541222,3341.8272,144.78,143,-22.539683,-22.876984,0.580571,0.616857,Tundra,0.273684
4377228,2809.9512,137.16,161,-19.595238,-20.142857,0.508,0.399143,Alpine,0.261111
3609603,1569.72,30.48,87,-20.309524,-20.039683,0.580571,0.453571,Alpine,0.341667
5943233,1688.592,127.0,195,-17.277778,-17.222222,0.435429,0.362857,Alpine,0.412


In [33]:
train_data_clim_lag7 = train_data_clim.drop(columns=['PRECIPITATION_lag_14d', 'TAVG_lag_14d']).dropna()
train_data_clim_lag14 = train_data_clim.drop(columns=['PRECIPITATION_lag_7d', 'TAVG_lag_7d']).dropna()

val_data_clim_lag7 = val_data_clim.drop(columns=['PRECIPITATION_lag_14d', 'TAVG_lag_14d']).dropna()
val_data_clim_lag14 = val_data_clim.drop(columns=['PRECIPITATION_lag_7d', 'TAVG_lag_7d']).dropna()

encoder7d = ce.TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)
encoder14d = ce.TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)


X_cat_train7d = encoder7d.fit_transform(train_data_clim_lag7['Snow_Class'], train_data_clim_lag7['Snow_Density'])
X_cat_val7d = encoder7d.transform(val_data_clim_lag7['Snow_Class'])


X_cat_train14d = encoder14d.fit_transform(train_data_clim_lag14['Snow_Class'], train_data_clim_lag14['Snow_Density'])
X_cat_val14d = encoder14d.transform(val_data_clim_lag14['Snow_Class'])


# Combine with numerical features
X_train_7d = pd.concat([train_data_clim_lag7[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_7d', 'TAVG_lag_7d']], X_cat_train7d], axis=1)
X_val_7d = pd.concat([val_data_clim_lag7[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_7d', 'TAVG_lag_7d']], X_cat_val7d], axis=1)

X_train_14d = pd.concat([train_data_clim_lag14[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_14d', 'TAVG_lag_14d']], X_cat_train14d], axis=1)
X_val_14d = pd.concat([val_data_clim_lag14[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_14d', 'TAVG_lag_14d']], X_cat_val14d], axis=1)

In [37]:
model_ext_7d = ExtraTreesRegressor(
    random_state=42, n_estimators=120, 
    min_samples_leaf=1, min_samples_split=14, 
    max_depth=None, # tree grows unrestricted
    n_jobs=-1
)

model_ext_7d.fit(X_train_7d, train_data_clim_lag7['Snow_Density'])

In [38]:
pred_7 = model_ext_7d.predict(X_val_7d)
print(f"RMSE: {rmse(y_true=val_data_clim_lag7['Snow_Density'], y_pred=pred_7)}\nR²: {r2_score(y_true=val_data_clim_lag7['Snow_Density'], y_pred=pred_7)}")

RMSE: 0.04337517076647857
R²: 0.778793470554547


In [39]:
model_ext_14d = ExtraTreesRegressor(
    random_state=42, n_estimators=120, 
    min_samples_leaf=1, min_samples_split=14, 
    max_depth=None, # tree grows unrestricted
    n_jobs=-1
)

model_ext_14d.fit(X_train_14d, train_data_clim_lag14['Snow_Density'])

In [40]:
pred_14 = model_ext_14d.predict(X_val_14d)
print(f"RMSE: {rmse(y_true=val_data_clim_lag14['Snow_Density'], y_pred=pred_14)}\nR²: {r2_score(y_true=val_data_clim_lag14['Snow_Density'], y_pred=pred_14)}")

RMSE: 0.04192081734541113
R²: 0.7933922268812132


 Lag 14 is better!. We'll tune using Lag 14!

In [5]:
train_data_clim_14days = (
    X_train
    .filter(items=['Elevation', 'Snow_Depth', 'DOWY', 'TAVG_lag_14d', 'PRECIPITATION_lag_14d', 'Snow_Class'])
    .assign(Snow_Density=y_train)
    .dropna(subset = ['TAVG_lag_14d', 'PRECIPITATION_lag_14d'])
)

val_data_clim_14days = (
    X_val
    .filter(items=['Elevation', 'Snow_Depth', 'DOWY', 'TAVG_lag_14d', 'PRECIPITATION_lag_14d', 'Snow_Class'])
    .assign(Snow_Density=y_val)
    .dropna(subset = ['TAVG_lag_14d', 'PRECIPITATION_lag_14d'])
)

* Extra Trees

In [6]:
optimizer = ClimateOptimizer(
    X_train=train_data_clim_14days,
    X_val=val_data_clim_14days,
    y_train=train_data_clim_14days.Snow_Density,
    y_val=val_data_clim_14days.Snow_Density,
    model_name='extra_trees'
)

optimizer.optimize(n_trials=100)


Starting climate model optimization for extra_trees
Number of trials: 100
Storage: sqlite:///optuna.db

Climate Model Optimization Summary
Best RMSE: 0.0422
Best R² score: 0.7901

Best parameters:
  n_estimators: 383
  max_depth: 25
  min_samples_split: 5
  min_samples_leaf: 1


{'best_params': {'n_estimators': 383,
  'max_depth': 25,
  'min_samples_split': 5,
  'min_samples_leaf': 1},
 'best_rmse': 0.04224962596576943,
 'best_r2': 0.7901384334138297,
 'study': <optuna.study.study.Study at 0x76695c523f10>}

* Default Tuner ExtraTrees

In [7]:
encoder14d = ce.TargetEncoder(cols=['Snow_Class'], min_samples_leaf=20, smoothing=10)

X_cat_train14d = encoder14d.fit_transform(train_data_clim_14days['Snow_Class'], train_data_clim_14days['Snow_Density'])
X_cat_val14d = encoder14d.transform(val_data_clim_14days['Snow_Class'])

X_train_14d = pd.concat([train_data_clim_14days[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_14d', 'TAVG_lag_14d']], X_cat_train14d], axis=1)
X_val_14d = pd.concat([val_data_clim_14days[['Elevation', 'Snow_Depth', 'DOWY', 'PRECIPITATION_lag_14d', 'TAVG_lag_14d']], X_cat_val14d], axis=1)

In [None]:
model_ext_default = ExtraTreesRegressor(random_state=42, n_jobs=-1)

model_ext_default.fit(X_train_14d, train_data_clim_14days['Snow_Density'])

In [9]:
pred_ext_def = model_ext_default.predict(X_val_14d)
print(f"RMSE: {rmse(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_ext_def)}\nR²: {r2_score(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_ext_def)}")

RMSE: 0.038773468469387715
R²: 0.8232511982090762


* Default + Optuna (i.e., setting max depth to None based on Optuna's importance)

In [14]:
model_ext_default_opt = ExtraTreesRegressor(random_state=42, n_estimators=383, min_samples_split=5, min_samples_leaf=1, n_jobs=-1)

model_ext_default_opt.fit(X_train_14d, train_data_clim_14days['Snow_Density'])

In [15]:
pred_ext_def = model_ext_default_opt.predict(X_val_14d)
print(f"RMSE: {rmse(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_ext_def)}\nR²: {r2_score(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_ext_def)}")

RMSE: 0.03924376820303675
R²: 0.8189374730635413


 Default wins!

* Random Forest

In [6]:
optimizer = ClimateOptimizer(
    X_train=train_data_clim_14days,
    X_val=val_data_clim_14days,
    y_train=train_data_clim_14days.Snow_Density,
    y_val=val_data_clim_14days.Snow_Density,
    model_name='random_forest'
)

optimizer.optimize(n_trials=100)


Starting climate model optimization for random_forest
Number of trials: 100
Storage: sqlite:///optuna.db

Climate Model Optimization Summary
Best RMSE: 0.0391
Best R² score: 0.8203

Best parameters:
  n_estimators: 412
  max_depth: 25
  min_samples_split: 2
  min_samples_leaf: 1


{'best_params': {'n_estimators': 412,
  'max_depth': 25,
  'min_samples_split': 2,
  'min_samples_leaf': 1},
 'best_rmse': 0.03909290573790727,
 'best_r2': 0.8203268929210055,
 'study': <optuna.study.study.Study at 0x7b8d6c1fb9d0>}

* Default Tuner Random Forests

In [8]:
model_rf_default = RandomForestRegressor(random_state=42, n_jobs=-1)

model_rf_default.fit(X_train_14d, train_data_clim_14days['Snow_Density'])

In [9]:
pred_rf_def = model_rf_default.predict(X_val_14d)
print(f"RMSE: {rmse(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_rf_def)}\nR²: {r2_score(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_rf_def)}")

RMSE: 0.03724839291144764
R²: 0.836881862862678


* Default + Optuna (i.e., setting max depth to None based on Optuna's importance)

In [10]:
model_rf_default_opt = RandomForestRegressor(random_state=42, n_estimators=412, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)

model_rf_default_opt.fit(X_train_14d, train_data_clim_14days['Snow_Density'])

In [11]:
pred_rf_def = model_rf_default_opt.predict(X_val_14d)
print(f"RMSE: {rmse(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_rf_def)}\nR²: {r2_score(y_true=val_data_clim_14days['Snow_Density'], y_pred=pred_rf_def)}")

RMSE: 0.03706275895226336
R²: 0.8385036677094896


* LightGBM

In [6]:
optimizer = ClimateOptimizer(
    X_train=train_data_clim_14days,
    X_val=val_data_clim_14days,
    y_train=train_data_clim_14days.Snow_Density,
    y_val=val_data_clim_14days.Snow_Density,
    model_name='lightgbm'
)

optimizer.optimize(n_trials=100)


Starting climate model optimization for lightgbm
Number of trials: 100
Storage: sqlite:///optuna.db


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Training until validatio


Climate Model Optimization Summary
Best RMSE: 0.0383
Best R² score: 0.8278
Best iteration: 1500

Best parameters:
  max_depth: 15
  learning_rate: 0.28206964529038514
  num_leaves: 166
  min_child_weight: 29.631721026875745
  min_child_samples: 25
  max_bin: 3128


{'best_params': {'max_depth': 15,
  'learning_rate': 0.28206964529038514,
  'num_leaves': 166,
  'min_child_weight': 29.631721026875745,
  'min_child_samples': 25,
  'max_bin': 3128},
 'best_rmse': 0.03826655563297045,
 'best_r2': 0.8278425109103131,
 'study': <optuna.study.study.Study at 0x752251d7f850>,
 'best_iteration': 1500}

* XGBoost

In [6]:
optimizer = ClimateOptimizer(
    X_train=train_data_clim_14days,
    X_val=val_data_clim_14days,
    y_train=train_data_clim_14days.Snow_Density,
    y_val=val_data_clim_14days.Snow_Density,
    model_name='xgboost'
)

optimizer.optimize(n_trials=100)


Starting climate model optimization for xgboost
Number of trials: 100
Storage: sqlite:///optuna.db

Climate Model Optimization Summary
Best RMSE: 0.0332
Best R² score: 0.8707
Best iteration: 1498

Best parameters:
  max_depth: 18
  learning_rate: 0.08039528136679829
  min_child_weight: 2.8200619931422226
  max_bin: 5817
  gamma: 7.210110116555584e-07


{'best_params': {'max_depth': 18,
  'learning_rate': 0.08039528136679829,
  'min_child_weight': 2.8200619931422226,
  'max_bin': 5817,
  'gamma': 7.210110116555584e-07},
 'best_rmse': 0.03316345521393791,
 'best_r2': 0.8706975607008156,
 'study': <optuna.study.study.Study at 0x73eea6727e50>,
 'best_iteration': 1498}