In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [42]:
def RF_model():
    param_grid = [{'n_estimators': [5, 10, 20, 40], 'max_features': [2, 4, 6, 8]},
                  {'bootstrap': [False], 'n_estimators': [3, 10], 
                   'max_features': [2, 3, 4]},]
    forest_reg = RandomForestRegressor()
    grid_search = RandomizedSearchCV(forest_reg, param_grid, cv=3, 
                               scoring='neg_mean_squared_error',
                               return_train_score=True, n_iter=50)
    return grid_search

In [60]:
# Load data
cs_file = '../data/SMB_input_four_ERA5.csv'

df = pd.read_csv(cs_file,
   delimiter='\t', index_col=['Date'],
    parse_dates=['Date'], na_values='NAN')
labels = df.drop_duplicates(subset=['Stake'])

In [53]:
df['Stake'].count()

1312

In [12]:
loso = LeaveOneGroupOut()
groups = df['Stake'].values

In [43]:
X = (df.drop(['SMB', 'Stake'], axis=1)).to_numpy()
y = (df['SMB'].copy()).to_numpy()

In [26]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [38]:
#model = RF_model().fit(X_train, y_train)

In [18]:
# Leave-One-Stake-Out
i = 1
test_rmse = []
for train_index, test_index in loso.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, '../RF/LOSO_ERA5/CV/stake_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1

In [41]:
#results = ['mean_test_score', 'mean_train_score']
#pd.DataFrame(model.cv_results_)[results].plot()


In [51]:
test_rmse = pd.DataFrame(test_rmse, columns={'RMSE'}, index=labels.Stake.values)
print(test_rmse)

              RMSE
PG01      0.999659
PG02      1.215284
PG03      0.605289
PG04      0.372329
PG05      0.198193
PG06      0.157690
PG07      0.302564
PG08      0.140409
PG09      0.134835
PG11      0.982713
PG12      1.136573
PG13      0.822319
PG14      0.417264
PG15      0.192951
PG16_AWS  0.195542
PG16      0.186094
PG17      0.146144
PG18      0.119726
PG19      0.195991


In [9]:
df['Year'] = df.index.year

In [10]:
loyo = LeaveOneGroupOut()
groups = df['Year'].values

In [11]:
X = (df.drop(['Diff', 'Event', 'Year'], axis=1)).to_numpy()
y = (df['Diff'].copy()).to_numpy()

In [12]:
# Leave-One-Year-Out
i = 1
test_rmse = []
for train_index, test_index in loso.split(X, y, groups):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, 'RF/LOYO_ERA5/CV/year_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1

In [13]:
print(test_rmse)

[1.4102063112459033, 0.35497295244806604, 0.18113917301345947, 0.1757775724900486, 0.2141380499625807, 0.23527462880969222]


In [14]:
X = (df.drop(['Diff', 'Event', 'Year'], axis=1)).to_numpy()
y = (df['Diff'].copy()).to_numpy()

In [15]:
groups = df['Event'].values
years = df.drop_duplicates(subset=['Year'])['Year'].values

In [16]:
i = 1 
test_rmse = []
for year, (train_index, test_index) in zip(years, loso.split(X, y, groups)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RF_model().fit(X_train, y_train)
    joblib.dump(model, '../RF/LOYSO_ERA5/CV/year_stake_'+str(i)+'_model.h5')
    grid_ytest_p = model.predict(X_test)
    grid_test_mse = mean_squared_error(y_test, grid_ytest_p)
    grid_test_rmse = np.sqrt(grid_test_mse)
    test_rmse.append(grid_test_rmse)
    i= i+1    

In [17]:
print(test_rmse)

[0.1442193036000612, 0.10786149959628753, 0.10557686807243281, 0.15108059605885032, 0.125036124497064, 0.19394623220978563]
