# 1. Import

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

# graph
import pathlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from datetime import datetime


# model
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

#  for multicolinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# 2. Import csv

## 2.1 All year data

In [2]:
string_cols = ['Month', 'Time']

dict_dtypes = {x : 'str'  for x in string_cols}

# 1. all year data
all_year = pd.read_csv('./Data/weather_st2', index_col = 0, dtype = dict_dtypes)
j9 = pd.read_csv('./Data/weather_j9', index_col = 0, dtype = dict_dtypes)

# 2. score data
scores_df = pd.read_csv('./Data/score', index_col = 0)

# 3. Random Forest

In [3]:
X = all_year.drop(labels = ['CHWTON'], axis =1)
Y = all_year['CHWTON']

In [4]:
from sklearn.model_selection import train_test_split

# 1. Train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 20)

# 2. Train
from sklearn.ensemble import RandomForestRegressor
model_RF = RandomForestRegressor(n_estimators = 100, random_state = 20)
model_RF.fit(X_train, Y_train)


RandomForestRegressor(random_state=20)

## 3.1 Tuning using RandomSearchCV

First we need to set the parameter for our RF regressor

In [5]:
from sklearn.model_selection import RandomizedSearchCV

# 1. Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 300, num = 6)]

# 2. Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 15, num = 4)]
max_depth.append(None)

# 3. Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# 4. Minimum number of samples required at each leaf node
min_samples_leaf = [ 2, 5, 10]

# 5. Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [50, 100, 150, 200, 250, 300], 'max_depth': [2, 6, 10, 15, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [2, 5, 10]}


In [6]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 20, cv = 5,
                               verbose = 2,
                               scoring ='r2',
                               random_state = 20,
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, Y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.8min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'max_depth': [2, 6, 10, 15, None],
                                        'min_samples_leaf': [2, 5, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300]},
                   random_state=20, scoring='r2', verbose=2)

In [7]:
# print winning set of hyperparameters
from pprint import pprint
pprint(rf_random.best_estimator_.get_params())
pprint(rf_random.best_score_)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 15,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
0.9325812455674513


In [8]:
rf_best = rf_random.best_estimator_

In [9]:
# # June 9th prediction
# # 1. get X Y
X_j9 = j9.drop(labels = ['CHWTON'], axis =1)
Y_j9 = j9['CHWTON']

# 2. Ypred
Y_pred_random_j9 = rf_best.predict(X_j9)

# 3. Score
R2_j9_random = rf_best.score(X_j9, Y_j9)
RMSE_j9_random = np.sqrt(metrics.mean_squared_error(Y_j9, Y_pred_random_j9))

# 4. append to score df
score_J9_random = [R2_j9_random, RMSE_j9_random]
scores_df['RF_J9_AZ_random'] = score_J9_random
scores_df

Unnamed: 0,score_type,RF_allyear,RF_J9_AZ,RF_J9_micro,RF_J9_AZ_random
0,R2,0.937572,0.876395,0.854341,0.890315
1,RMSE,0.896675,0.491681,0.533744,0.463167


# 4. XG Boost

In [10]:
# Prepare data
all_year_dum = pd.read_csv('./Data/weather_st', index_col = 0, dtype = dict_dtypes)
all_year_dum = all_year_dum.drop(labels = ['Hour_num', 'Month_num','Minute_num'], axis = 1)

# 1. dummify dates and time
all_year_dum = pd.get_dummies(all_year_dum, prefix=None, prefix_sep='_')

# 2. get j9 data
j9_dum = all_year_dum.iloc[15150:15246]

# 3. concat j9_dum with j9 to remove non 0 minute
j9_dum =  pd.concat([j9_dum, j9.drop(labels = ['Air Temp', 'Rel Humid', 'KW'], axis = 1)], axis = 1, join = "inner")

# 3. drop June 9th data on original data
all_year_dum = all_year_dum.drop(all_year_dum.index[15150:15246])

In [11]:
# 4. Get X and Y
X_dum = all_year_dum.drop(labels = ['CHWTON'], axis =1)

# 5. Train test split
X_train_dum, X_test_dum, Y_train, Y_test = train_test_split(X_dum, Y, test_size=0.2, random_state = 20)


## 4.2 Randomized Search

In [12]:
from xgboost import XGBRegressor
import time

In [13]:
# A parameter grid for XGBoost
params = {
    'n_estimators':[50, 100 , 250],
    'min_child_weight':[4,5], 
    'gamma':[i/10.0 for i in range(3,6)],  
    'subsample':[i/10.0 for i in range(6,11)],
    'colsample_bytree':[i/10.0 for i in range(6,11)], 
    'max_depth': [2,3,4,6,7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'eta': [i/10.0 for i in range(3,6)],
}

reg = XGBRegressor()

n_iter_search = 20
xgb_random = RandomizedSearchCV(reg,
                                param_distributions = params,
                                n_iter = n_iter_search,
                                cv = 5,
                                verbose = 2,
                                random_state = 20,
                                scoring ='r2',
                                n_jobs = -1)

start = time.time()
xgb_random.fit(X_train_dum, Y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))



Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.5min finished


RandomizedSearchCV took 398.09 seconds for 20 candidates parameter settings.


In [14]:
# 1. print winning set of hyperparameters
pprint(xgb_random.best_estimator_.get_params())
pprint(xgb_random.best_score_)

# 2. save model
xgb_best = xgb_random.best_estimator_

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.6,
 'eta': 0.3,
 'eval_metric': 'rmse',
 'gamma': 0.4,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_delta_step': 0,
 'max_depth': 4,
 'min_child_weight': 5,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 250,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'objective': 'reg:squarederror',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1.0,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}
0.9061622427610543


In [15]:
# June 9th prediction
# 1. get X
X_j9_dum = j9_dum.drop(labels = ['CHWTON', 'Month', 'Time'], axis = 1)

# 2. Ypred
Y_pred_xgb_j9 = xgb_best.predict(X_j9_dum)

# 3. Score
R2_j9_xgb = xgb_best.score(X_j9_dum, Y_j9)
RMSE_j9_xgb = np.sqrt(metrics.mean_squared_error(Y_j9, Y_pred_xgb_j9))

# 4. append to score df
score_J9_xgb = [R2_j9_xgb, RMSE_j9_xgb]
scores_df['XGB_J9_AZ'] = score_J9_xgb
scores_df

Unnamed: 0,score_type,RF_allyear,RF_J9_AZ,RF_J9_micro,RF_J9_AZ_random,XGB_J9_AZ
0,R2,0.937572,0.876395,0.854341,0.890315,0.82928
1,RMSE,0.896675,0.491681,0.533744,0.463167,0.577838
