# 1. Import

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle

# graph
import pathlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from datetime import datetime


# model
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

#  for multicolinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor 

# 2. Import Files

In [3]:
# 1. to make sure month and time column is read as string
string_cols = ['Month', 'Time']
dict_dtypes = {x : 'str'  for x in string_cols}

# 2. get all year data
all_year = pd.read_csv('./Data/station.csv', index_col = 0, dtype = dict_dtypes)
weather_j9 = pd.read_csv('./Data/station_j9.csv', index_col = 0, dtype = dict_dtypes)
envi_j9 = pd.read_csv('./Data/envimet_j9.csv', index_col = 0, dtype = dict_dtypes)

# 3. get score data
# scores_df = pd.read_csv('./Data/score', index_col = 0)

# 3. XG Boost

In [18]:
all_year.drop(columns = ['Month', 'Time'], inplace = True)

In [19]:
Y = all_year['CHWTON']
X = all_year.drop(labels = ['CHWTON'], axis = 1)

In [26]:
all_year

Unnamed: 0,Month_num,Hour_num,Minute_num,Air Temp,Rel Humid,KW,CHWTON
01/01/2018 00:00,1,0,0,11.0,40.24,138.33,1.09
01/01/2018 00:15,1,0,15,11.0,40.24,90.14,0.99
01/01/2018 00:30,1,0,30,10.0,43.01,80.49,1.01
01/01/2018 00:45,1,0,45,9.0,46.00,84.88,0.90
01/01/2018 01:00,1,1,0,9.0,46.00,85.83,0.87
...,...,...,...,...,...,...,...
12/31/2018 22:00,12,22,0,6.0,93.30,124.28,0.00
12/31/2018 22:15,12,22,15,5.0,100.00,127.61,0.00
12/31/2018 22:30,12,22,30,5.0,100.00,130.48,0.00
12/31/2018 22:45,12,22,45,6.0,93.30,133.01,0.00


## 4.2 Randomized Search

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import time

In [22]:
# Train test split
# 1. Split into train test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

In [23]:
# A parameter grid for XGBoost
params = {
    'n_estimators':[50, 100 , 250],
    'min_child_weight':[4,5], 
    'gamma':[i/10.0 for i in range(3,6)],  
    'subsample':[i/10.0 for i in range(6,11)],
    'colsample_bytree':[i/10.0 for i in range(6,11)], 
    'max_depth': [2,3,4,6,7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'eta': [i/10.0 for i in range(3,6)],
}

reg = XGBRegressor()

n_iter_search = 20
xgb_random = RandomizedSearchCV(reg,
                                param_distributions = params,
                                n_iter = n_iter_search,
                                cv = 5,
                                verbose = 2,
                                random_state = 20,
                                scoring ='r2',
                                n_jobs = -1)

start = time.time()
xgb_random.fit(X_train, Y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))



Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished


RandomizedSearchCV took 188.19 seconds for 20 candidates parameter settings.


In [25]:
# 1. print winning set of hyperparameters
print(xgb_random.best_estimator_.get_params())
print(xgb_random.best_score_)

# 2. save model
xgb_best = xgb_random.best_estimator_

{'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1.0, 'gamma': 0.3, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': '', 'learning_rate': 0.300000012, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 4, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 250, 'n_jobs': 0, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 0.9, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'eval_metric': 'rmse', 'eta': 0.3}
0.9205243432644133


In [None]:
# June 9th prediction
# 1. get X
X_j9_dum = j9_dum.drop(labels = ['CHWTON', 'Month', 'Time'], axis = 1)

# 2. Ypred
Y_pred_xgb_j9 = xgb_best.predict(X_j9_dum)

# 3. Score
R2_j9_xgb = xgb_best.score(X_j9_dum, Y_j9)
RMSE_j9_xgb = np.sqrt(metrics.mean_squared_error(Y_j9, Y_pred_xgb_j9))

# 4. append to score df
score_J9_xgb = [R2_j9_xgb, RMSE_j9_xgb]
scores_df['XGB_J9_AZ'] = score_J9_xgb
scores_df