In [1]:
%matplotlib inline


# 03 Gradient Boosting regression


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

#from scipy.stats import uniform, randint
import pandas as pd
#import seaborn as sns

pd.set_option('display.max_columns', 30)
pd.set_option('display.precision', 3)


## Time-independent regression with `df_X_pca` and `df_y`




In [None]:
df_X_pca = pd.read_pickle('df_X_pca.pkl')
X_cols = df_X_pca.columns

df_y = pd.read_pickle('df_y.pkl')
y_cols = df_y.columns

X = df_X_pca.to_numpy()
y = df_y.to_numpy()

print('X.shape = ', X.shape, ', y.shape = ', y.shape)

X.shape =  (1229, 50) , y.shape =  (1229, 6)


### Train-val-test split

Even when attempting time-independent regression, we cannot shuffle the data set before making train and test plit. If the model should be able to predict output variables ahead in time, it has to work well on a chronologically split data set.

For XGBoost we use a sperarate (random) part of the training set as evaluation set.


In [None]:
split_test = int(len(X)*0.8)
X_train, X_test = X[:split_test,:], X[split_test+1:,:]
y_train, y_test = y[:split_test,:], y[split_test+1:,:]

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2
print('X_train:', X_train.shape, 'X_val:', X_val.shape, 'X_test:', X_test.shape)
print('y_train:', y_train.shape, 'y_val:', y_val.shape, 'y_test:', y_test.shape)

Note = 'PCA50. Random train/val but end as test.'

X_train: (737, 50) X_val: (246, 50) X_test: (245, 50)
y_train: (737, 6) y_val: (246, 6) y_test: (245, 6)


In [None]:
import xgboost as xgb

params = {
    "objective": "reg:squarederror",
    "random_state": 42,
    "subsample": 0.5, # subsample ratio of columns when constructing each tree.
    'colsample_bytree': 0.5,
    #'alpha': 0.5757147587224918,
    #'lambda': 0.6751025760853637,
    #'gamma': 0.004714915573102214,
    #'learning_rate': 0.18832358744106376,
    #'max_depth': 5,
    #'n_estimators': 269
    "alpha": 0, # L1 regularization term on weights. Increasing this value will make model more conservative.
    "lambda": 1, # L2 regularization term on weights. Increasing this value will make model more conservative.
    "gamma": 0.01, # Minimum loss reduction required to make a further partition on a leaf node of the tree. Higher -> more conservative. [0,inf]
    "learning_rate": 0.04, # default 0.3 
    "max_depth": 5, # default 3
    "n_estimators": 400, # default 100
    #"verbose_eval": True# Chose wether to print output during training (1) or no printing (0)
}


##Optimal parameter set so far:
#

### Fit regression model


In [None]:
# Prepare dataframe of rmse results and parameter values, or load existing dataframe from file if `fit_file`exists

fit_file = '04_fitting_results.pkl' # NB: Update name according to notebook number

if os.path.exists(fit_file):
    fit_results = pd.read_pickle(fit_file)
else:
    res_cols = ['Note']
    for col in df_y.columns:
        res_cols.append(col+' RMSE')
        res_cols.append(col+' R2')
    res_cols.append('R2 mean')
    for key in params.keys():
        res_cols.append(key)
    res_cols.append('Training time')
        
    fit_results = pd.DataFrame(columns=res_cols)


start_time = datetime.now()


# ----------------- Calculate root mean squared error of predictions and -------------------------
# ----------------- store results in dataframe together with parameter values used -------------------------
res_labels = ['Note']
res_values = [Note]
r2_values = []

regs = []
ests = []
for i in range(y_train.shape[1]):
    reg = xgb.XGBRegressor(**params) 
    reg.set_params(early_stopping_rounds=20)
    history = reg.fit(X_train, y_train[:,i], eval_set=[(X_val, y_val[:,i])]) # Fit model for label i
    regs.append(reg)

    y_test_estimated = reg.predict(X_test) # Predict label values on test set
    ests.append(y_test_estimated)
    rmse = sqrt(mse(y_test[:,i], y_test_estimated))
    res_values.append(rmse)
    label = y_cols[i]
    res_labels.append(label+' RMSE')
    
    res_labels.append(label+' R2')
    r2 = reg.score(X_test, y_test[:,i])
    res_values.append(r2)
    r2_values.append(r2)

res_labels.append('R2 mean')
res_values.append(np.mean(r2_values))

for key in params.keys():
    res_values.append(params[key])
    res_labels.append(key)

training_time = (datetime.now() - start_time).total_seconds()
res_values.append(training_time)
res_labels.append('Training time')

df_tmp = pd.DataFrame([res_values], columns=res_labels, index=[datetime.now().date()])
fit_results = fit_results.append(df_tmp)
#fit_results = fit_results.sort_values('R2', ascending=False)
fit_results.to_pickle(fit_file)
fit_results

[0]	validation_0-rmse:1.11876
[1]	validation_0-rmse:1.11254
[2]	validation_0-rmse:1.10540
[3]	validation_0-rmse:1.09954
[4]	validation_0-rmse:1.09235
[5]	validation_0-rmse:1.08651
[6]	validation_0-rmse:1.07919
[7]	validation_0-rmse:1.07536
[8]	validation_0-rmse:1.07102
[9]	validation_0-rmse:1.06352
[10]	validation_0-rmse:1.05701
[11]	validation_0-rmse:1.05451
[12]	validation_0-rmse:1.05418
[13]	validation_0-rmse:1.05297
[14]	validation_0-rmse:1.04930
[15]	validation_0-rmse:1.04560
[16]	validation_0-rmse:1.04383
[17]	validation_0-rmse:1.04124
[18]	validation_0-rmse:1.03879
[19]	validation_0-rmse:1.03567
[20]	validation_0-rmse:1.03300
[21]	validation_0-rmse:1.03181
[22]	validation_0-rmse:1.03168
[23]	validation_0-rmse:1.03053
[24]	validation_0-rmse:1.02825
[25]	validation_0-rmse:1.02645
[26]	validation_0-rmse:1.02419
[27]	validation_0-rmse:1.02408
[28]	validation_0-rmse:1.02307
[29]	validation_0-rmse:1.02153
[30]	validation_0-rmse:1.01879
[31]	validation_0-rmse:1.01840
[32]	validation_0-

  fit_results = fit_results.append(df_tmp)


Unnamed: 0,Note,y1 RMSE,y1 R2,y2 RMSE,y2 R2,y3 RMSE,y3 R2,y4 RMSE,y4 R2,y5 RMSE,y5 R2,y6 RMSE,y6 R2,R2 mean,objective,random_state,subsample,colsample_bytree,alpha,lambda,gamma,learning_rate,max_depth,n_estimators,Training time
2023-02-16,PCA50. Random train/val but end as test.,0.722021,-0.754995,0.178265,-0.037989,0.110367,-1.170827,1.061566,0.359362,0.734174,0.209147,0.811528,0.017129,-0.229695,reg:squarederror,42,0.5,0.5,0,1,0.01,0.04,5,400,14.868702


## Time dependent regression with `df_X_lagged` and `df_y_lagged`

The reason for not using `df_y` here is that `df_X_lagged` contains fewer rows and we need a coherent set of labels



In [None]:
df_X_lagged = pd.read_pickle('df_X_lagged_full.pkl')
X_cols = df_X_pca.columns

df_y_lagged = pd.read_pickle('df_y_lagged_full.pkl')
y_cols = df_y.columns

X = df_X_lagged.to_numpy()
y = df_y_lagged.to_numpy()

print('X.shape = ', X.shape, ', y.shape = ', y.shape)

X.shape =  (1226, 330) , y.shape =  (1226, 6)


### Train-val-test split


In [None]:
split_test = int(len(X)*0.8)
X_train, X_test = X[:split_test,:], X[split_test+1:,:]
y_train, y_test = y[:split_test,:], y[split_test+1:,:]

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2
print('X_train:', X_train.shape, 'X_val:', X_val.shape, 'X_test:', X_test.shape)
print('y_train:', y_train.shape, 'y_val:', y_val.shape, 'y_test:', y_test.shape)

Note = 'df_X_lagged_full. Random train/val but end as test.'

X_train: (735, 330) X_val: (245, 330) X_test: (245, 330)
y_train: (735, 6) y_val: (245, 6) y_test: (245, 6)


### Fit regression model


In [None]:
# Prepare dataframe of rmse results and parameter values, or load existing dataframe from file if `fit_file`exists

fit_file = '04_fitting_results.pkl' # NB: Update name according to notebook number

if os.path.exists(fit_file):
    fit_results = pd.read_pickle(fit_file)
else:
    res_cols = ['Note']
    for col in df_y_lagged.columns:
        res_cols.append(col+' RMSE')
        res_cols.append(col+' R2')
    res_cols.append('R2 mean')
    for key in params.keys():
        res_cols.append(key)
    res_cols.append('Training time')
        
    fit_results = pd.DataFrame(columns=res_cols)


start_time = datetime.now()


# ----------------- Calculate root mean squared error of predictions and -------------------------
# ----------------- store results in dataframe together with parameter values used -------------------------
res_labels = ['Note']
res_values = [Note]
r2_values = []

regs = []
ests = []
for i in range(y_train.shape[1]):
    reg = xgb.XGBRegressor(**params) 
    reg.set_params(early_stopping_rounds=20)
    history = reg.fit(X_train, y_train[:,i], eval_set=[(X_val, y_val[:,i])]) # Fit model for label i
    regs.append(reg)

    y_test_estimated = reg.predict(X_test) # Predict label values on test set
    ests.append(y_test_estimated)
    rmse = sqrt(mse(y_test[:,i], y_test_estimated))
    res_values.append(rmse)
    label = y_cols[i]
    res_labels.append(label+' RMSE')
    
    res_labels.append(label+' R2')
    r2 = reg.score(X_test, y_test[:,i])
    res_values.append(r2)
    r2_values.append(r2)

res_labels.append('R2 mean')
res_values.append(np.mean(r2_values))

for key in params.keys():
    res_values.append(params[key])
    res_labels.append(key)

training_time = (datetime.now() - start_time).total_seconds()
res_values.append(training_time)
res_labels.append('Training time')

df_tmp = pd.DataFrame([res_values], columns=res_labels, index=[datetime.now().date()])
fit_results = fit_results.append(df_tmp)
#fit_results = fit_results.sort_values('R2', ascending=False)
fit_results.to_pickle(fit_file)
fit_results

[0]	validation_0-rmse:1.12593
[1]	validation_0-rmse:1.11610
[2]	validation_0-rmse:1.10523
[3]	validation_0-rmse:1.09329
[4]	validation_0-rmse:1.07974
[5]	validation_0-rmse:1.07122
[6]	validation_0-rmse:1.06128
[7]	validation_0-rmse:1.05259
[8]	validation_0-rmse:1.04782
[9]	validation_0-rmse:1.03679
[10]	validation_0-rmse:1.02684
[11]	validation_0-rmse:1.01939
[12]	validation_0-rmse:1.01284
[13]	validation_0-rmse:1.00741
[14]	validation_0-rmse:1.00196
[15]	validation_0-rmse:0.99755
[16]	validation_0-rmse:0.99131
[17]	validation_0-rmse:0.98561
[18]	validation_0-rmse:0.98472
[19]	validation_0-rmse:0.97794
[20]	validation_0-rmse:0.97346
[21]	validation_0-rmse:0.96815
[22]	validation_0-rmse:0.96459
[23]	validation_0-rmse:0.96000
[24]	validation_0-rmse:0.95738
[25]	validation_0-rmse:0.95648
[26]	validation_0-rmse:0.95231
[27]	validation_0-rmse:0.94643
[28]	validation_0-rmse:0.94305
[29]	validation_0-rmse:0.94131
[30]	validation_0-rmse:0.94040
[31]	validation_0-rmse:0.93695
[32]	validation_0-

  fit_results = fit_results.append(df_tmp)


Unnamed: 0,Note,y1 RMSE,y1 R2,y2 RMSE,y2 R2,y3 RMSE,y3 R2,y4 RMSE,y4 R2,y5 RMSE,y5 R2,y6 RMSE,y6 R2,R2 mean,objective,random_state,subsample,colsample_bytree,alpha,lambda,gamma,learning_rate,max_depth,n_estimators,Training time
2023-02-16,PCA50. Random train/val but end as test.,0.722,-0.755,0.178,-0.038,0.11,-1.171,1.062,0.359,0.734,0.209,0.812,0.017,-0.23,reg:squarederror,42,0.5,0.5,0,1,0.01,0.04,5,400,14.869
2023-02-16,PCA50. Random train/val but end as test.,0.464,-0.079,0.156,0.163,0.084,-0.195,1.002,0.39,0.591,0.455,0.671,0.314,0.175,reg:squarederror,42,0.5,0.5,0,1,0.01,0.04,5,400,27.898
2023-02-16,df_X_lagged_full. Random train/val but end as ...,0.533,0.042,0.159,0.175,0.076,-0.042,1.081,0.335,0.635,0.408,0.696,0.277,0.199,reg:squarederror,42,0.5,0.5,0,1,0.01,0.04,5,400,33.143
