In [1]:
# NOTE(review): `!pip` runs whichever pip the shell finds, which may not belong to
# the running kernel; `%pip install optuna` targets the kernel's environment.
# The version is unpinned — consider pinning it for reproducibility.
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import numpy as np
import pandas as pd
import xgboost

In [27]:
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_squared_error
# Leave-one-group-out splitter shared by every trial; `objective` passes
# `year_factor` as the groups when calling `logo.split`.
logo = LeaveOneGroupOut()

In [35]:
import optuna

In [24]:
# Feature table (kept as a DataFrame so folds can be selected with `.iloc`).
X = pd.read_csv('./processed_data/X.csv')
# Targets and group labels are collapsed to plain NumPy arrays so they can be
# indexed directly with the fold index arrays.
y = np.squeeze(pd.read_csv('./processed_data/y.csv').to_numpy())
year_factor = np.squeeze(pd.read_csv('./processed_data/year_factor.csv').to_numpy())

In [91]:
def objective(trial):
    """Optuna objective: mean leave-one-group-out score of an XGBoost regressor.

    One hyperparameter configuration is sampled from ``trial`` and then fitted
    once per fold produced by the notebook-level ``logo`` splitter, using the
    notebook-level ``X``, ``y`` and ``year_factor`` (as the group labels).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean over all folds of ``XGBRegressor.score`` (R^2) on the held-out
        fold. Higher is better, so the study should maximize it.
    """
    # The suggestion order is kept fixed so the sampler sees the parameters in
    # the same sequence on every trial.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 2),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        'gamma': trial.suggest_float('gamma', 0, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0, 100),
        'subsample': trial.suggest_float('subsample', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-6, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 1, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0, 2),
    }

    fold_scores = []
    for train_index, valid_index in logo.split(X, y, year_factor):
        X_train, y_train = X.iloc[train_index], y[train_index]
        X_valid, y_valid = X.iloc[valid_index], y[valid_index]

        # Fixed random_state keeps the row/column subsampling repeatable.
        model = xgboost.XGBRegressor(**params, random_state=700)

        # NOTE(review): the held-out fold doubles as the early-stopping set, so
        # the fold score may be optimistically biased.
        # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on
        # the constructor rather than on `fit` — confirm against the installed
        # version before upgrading.
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  early_stopping_rounds=10,
                  verbose=0)

        fold_scores.append(model.score(X_valid, y_valid))

    return np.mean(fold_scores)

In [92]:
# `objective` returns a mean R^2, hence direction='maximize'.
# The sampler is seeded so the sequence of suggested configurations is
# repeatable across kernel restarts (TPE is also Optuna's default sampler).
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=700))

[32m[I 2022-01-25 15:48:50,226][0m A new study created in memory with name: no-name-e3b500a6-b539-4782-8b81-24c26077bb09[0m


In [93]:
# Run the search: it ends after 100 trials or once the 5400 timeout elapses
# (Optuna documents `timeout` in seconds, i.e. 90 minutes), whichever is first.
study.optimize(objective, n_trials=100, timeout=5400)

[32m[I 2022-01-25 15:48:54,560][0m Trial 0 finished with value: -1.530671365567067 and parameters: {'n_estimators': 20, 'max_depth': 15, 'max_delta_step': 0, 'learning_rate': 0.006788983980943314, 'gamma': 31.060515621671225, 'min_child_weight': 45.49909425604896, 'subsample': 0.24004085871430125, 'colsample_bytree': 0.03811800292752776, 'colsample_bylevel': 0.82394372595653, 'colsample_bynode': 0.4397186368296735, 'reg_alpha': 1.4339047662415767e-05, 'reg_lambda': 2.6102692864246574e-06, 'scale_pos_weight': 0.3712054393670601}. Best is trial 0 with value: -1.530671365567067.[0m
[32m[I 2022-01-25 15:48:55,850][0m Trial 1 finished with value: -2.0031623500066873 and parameters: {'n_estimators': 25, 'max_depth': 7, 'max_delta_step': 2, 'learning_rate': 0.00016156473184367236, 'gamma': 14.456204463106836, 'min_child_weight': 88.01426633907226, 'subsample': 0.6593477819926805, 'colsample_bytree': 0.10517179028428969, 'colsample_bylevel': 0.6614908766382505, 'colsample_bynode': 0.31244

KeyboardInterrupt: 

In [83]:
# One row per trial, with the bookkeeping columns removed so that only the
# objective value, the sampled parameters and the trial state remain.
df = study.trials_dataframe().drop(
    ['number', 'datetime_start', 'datetime_complete', 'duration'],
    axis='columns',
)
df

Unnamed: 0,value,params_colsample_bylevel,params_colsample_bynode,params_colsample_bytree,params_gamma,params_learning_rate,params_max_delta_step,params_max_depth,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_scale_pos_weight,params_subsample,state
0,104.459135,0.93586,0.09864,0.377982,23.858322,0.003134,2,18,13.982098,30,0.271786,0.501009,1.15078,0.116786,COMPLETE
1,103.754082,0.058646,0.385842,0.384966,68.0843,0.007663,1,14,98.158616,138,0.005274,3.5e-05,0.517886,0.381295,COMPLETE
2,103.918561,0.393856,0.176184,0.448321,64.064623,0.000172,0,13,63.144352,62,0.002184,0.089606,1.737881,0.130752,COMPLETE
3,97.897081,0.81051,0.973947,0.968225,11.214054,0.004709,2,17,65.203749,893,0.000201,0.001697,1.025317,0.425041,COMPLETE
4,102.952372,0.983119,0.168047,0.16409,93.850167,0.001483,2,17,37.600636,691,0.000392,0.000456,0.759146,0.013755,COMPLETE
5,104.033263,0.214622,0.942234,0.389488,44.904963,0.000669,2,4,16.182684,533,1.2e-05,0.139531,0.366087,0.384604,COMPLETE
6,90.485934,0.865399,0.121509,0.840085,23.100786,0.001843,0,17,59.439288,134,2e-06,2e-06,1.270288,0.193466,COMPLETE
7,86.782144,0.356532,0.035264,0.547122,21.50711,0.000819,0,3,33.829321,401,0.00028,0.742642,0.8797,0.996737,COMPLETE
8,104.606517,0.782011,0.439609,0.402621,73.61321,0.000418,1,2,39.40325,16,0.050336,7e-06,0.186831,0.555574,COMPLETE
9,104.596373,0.681472,0.263088,0.011475,5.921433,0.000109,1,13,59.84759,176,0.003526,0.460847,1.679323,0.049155,COMPLETE


In [85]:
# Correlate numeric columns only: `df` still carries the string-valued `state`
# column, which pandas >= 2.0 no longer drops silently inside `corr()`.
# Selecting by dtype works on both old and new pandas versions.
df.select_dtypes(include='number').corr()['value'].sort_values()

params_subsample           -0.437734
params_learning_rate       -0.324363
params_n_estimators        -0.270546
params_colsample_bynode    -0.179022
params_scale_pos_weight    -0.053319
params_colsample_bylevel   -0.039196
params_gamma                0.021523
params_reg_lambda           0.144390
params_colsample_bytree     0.178293
params_reg_alpha            0.233974
params_min_child_weight     0.237860
params_max_depth            0.482211
params_max_delta_step       0.568163
value                       1.000000
Name: value, dtype: float64

In [86]:
# Hyperparameters of the best trial recorded by the study.
study.best_params

{'n_estimators': 585,
 'max_depth': 3,
 'max_delta_step': 0,
 'learning_rate': 0.009220710129045235,
 'gamma': 55.10759374423698,
 'min_child_weight': 28.15446498358142,
 'subsample': 0.6120336294212528,
 'colsample_bytree': 0.19464631486746037,
 'colsample_bylevel': 0.6222000833075367,
 'colsample_bynode': 0.7403968045484066,
 'reg_alpha': 1.2614523422422438e-06,
 'reg_lambda': 0.00020829275549139398,
 'scale_pos_weight': 1.074168742869225}