In [1]:
import pandas as pd
import dateutil
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
date_parser = lambda x: dateutil.parser.parse(x, ignoretz=True)

In [3]:
def RF_model():
    param_grid = [{'n_estimators': [5, 10, 20, 40], 'max_features': [2, 4, 6, 8]},
                  {'bootstrap': [False], 'n_estimators': [3, 10], 
                   'max_features': [2, 3, 4]},]
    forest_reg = RandomForestRegressor()
    grid_search = RandomizedSearchCV(forest_reg, param_grid, cv=5, 
                               scoring='neg_mean_squared_error',
                               return_train_score=True, n_iter=10)
    return grid_search

In [4]:
# Load data
cs_file = '../data/SMB_input_2011_2015.csv'

df = pd.read_csv(cs_file,
   delimiter='\t', index_col=['Date/Time'],
    parse_dates=['Date/Time'], na_values='NAN',date_parser=date_parser)

In [5]:
loso = LeaveOneGroupOut()
groups = df['Event'].values

In [6]:
X = (df.drop(['Diff', 'Event'], axis=1)).to_numpy()
y = (df['Diff'].copy()).to_numpy()

In [7]:
# Leave-One-Stake-Out
i = 1
test_rmse = []
for train_index, test_index in loso.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, '../RF/LOSO/CV/stake_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1

In [8]:
print(test_rmse)

[0.16554450027262407, 0.10715125115928417, 0.10269218844201494, 0.14540688556988635, 0.12179469267972416, 0.13962514490500483, 0.12870825076812084, 0.10309892768059661, 0.10761392509800954, 0.08597131062162541, 0.13950297711876974]


In [9]:
df['Year'] = df.index.year

In [10]:
loyo = LeaveOneGroupOut()
groups = df['Year'].values

In [11]:
X = (df.drop(['Diff', 'Event', 'Year'], axis=1)).to_numpy()
y = (df['Diff'].copy()).to_numpy()

In [12]:
# Leave-One-Year-Out
i = 1
test_rmse = []
for train_index, test_index in loso.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, '../RF/LOYO/CV/year_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1

In [13]:
print(test_rmse)

[1.2051418081610885, 0.30236042923416195, 0.19120358662551687, 0.16061654173554932, 0.14560726409469707]


In [14]:
X = (df.drop(['Diff', 'Event', 'Year'], axis=1)).to_numpy()
y = (df['Diff'].copy()).to_numpy()

In [15]:
groups = df['Event'].values
years = df.drop_duplicates(subset=['Year'])['Year'].values

In [16]:
i = 1 
test_rmse = []
for year, (train_index, test_index) in zip(years, loso.split(X, y, groups)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, '../RF/LOYSO/CV/year_stake_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1    

In [17]:
print(test_rmse)

[0.1646137670661002, 0.0970421944310824, 0.0999209094450701, 0.1381629682620096, 0.12268881934289115]
