# 1. Import

In [1]:
# standard library
import pathlib
import pickle
from datetime import datetime
from pprint import pprint

# data handling and graphs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import cm

# model
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 2. Import Files

In [None]:
# Columns that are parsed as strings; the mapping is handed to `read_csv`
# through its `dtype` argument.
string_cols = ['Month', 'Time']
dict_dtypes = dict.fromkeys(string_cols, 'str')

# 1. all year data plus the June 9th (j9) frames
all_year = pd.read_csv('./Data//weather_station_numstr', dtype = dict_dtypes, index_col = 0)
weather_j9 = pd.read_csv('./Data/weather_j9', dtype = dict_dtypes, index_col = 0)
# NOTE(review): `envi_j9` is read from the same path as `weather_j9`, so both
# frames hold the same data - possibly a copy-paste slip; confirm the intended file.
envi_j9 = pd.read_csv('./Data/weather_j9', dtype = dict_dtypes, index_col = 0)

# 2. score data (first CSV column is used as the index)
scores_df = pd.read_csv('./Data/score', index_col = 0)

# 3. XGBoost

In [None]:
# Prepare data
# 1. load the frame, drop the numeric date/time columns, then dummify what
#    remains (get_dummies defaults: column name as prefix, '_' as separator)
all_year_dum = (
    pd.read_csv('./Data/weather_st', index_col = 0, dtype = dict_dtypes)
    .drop(columns = ['Hour_num', 'Month_num', 'Minute_num'])
    .pipe(pd.get_dummies)
)

# 2. get j9 (June 9th) data; the same positional row range is used again in
#    step 4, so it is named once here
j9_rows = slice(15150, 15246)
j9_dum = all_year_dum.iloc[j9_rows]

# 3. concat j9_dum with j9 to remove non 0 minute: the inner join keeps only
#    index labels present in both frames
# NOTE(review): `j9` is not defined in any cell visible here (only `weather_j9`
# and `envi_j9` are loaded) - confirm which frame is meant before a fresh run.
j9_extra = j9.drop(columns = ['Air Temp', 'Rel Humid', 'KW'])
j9_dum = pd.concat([j9_dum, j9_extra], axis = 1, join = "inner")

# 4. drop June 9th data on original data
all_year_dum = all_year_dum.drop(all_year_dum.index[j9_rows])

In [None]:
# 4. Get X and Y
# Features: every column except the target `CHWTON`.
X_dum = all_year_dum.drop(columns = ['CHWTON'])
# Target: the column removed from the features above. `Y` was previously used
# without being assigned in this notebook, which fails on a fresh kernel.
Y = all_year_dum['CHWTON']

# 5. Train test split (80/20, fixed seed so the split is reproducible)
X_train_dum, X_test_dum, Y_train, Y_test = train_test_split(
    X_dum, Y, test_size = 0.2, random_state = 20
)


## 4.2 Randomized Search

In [None]:
from xgboost import XGBRegressor
import time

In [None]:
# A parameter grid for XGBoost; RandomizedSearchCV samples `n_iter_search`
# combinations from these lists.
# NOTE(review): the grid mixes 'gbtree' and 'gblinear' boosters; tree-specific
# settings (max_depth, gamma, ...) presumably have no effect for 'gblinear'
# candidates - confirm this is intended.
params = {
    'n_estimators': [50, 100, 250],
    'min_child_weight': [4, 5],
    'gamma': [0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4, 6, 7],
    'objective': ['reg:squarederror', 'reg:tweedie'],
    'booster': ['gbtree', 'gblinear'],
    'eval_metric': ['rmse'],
    'eta': [0.3, 0.4, 0.5],
}

reg = XGBRegressor()

# 20 sampled candidates, each scored by R2 with 5-fold cross-validation;
# the fixed random_state makes the sampled candidates repeatable.
n_iter_search = 20
xgb_random = RandomizedSearchCV(
    reg,
    param_distributions = params,
    n_iter = n_iter_search,
    cv = 5,
    verbose = 2,
    random_state = 20,
    scoring = 'r2',
    n_jobs = -1,
)

# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during the search.
start = time.perf_counter()
xgb_random.fit(X_train_dum, Y_train)
elapsed = time.perf_counter() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed, n_iter_search))



In [None]:
# 1. keep the best estimator found by the randomized search
xgb_best = xgb_random.best_estimator_

# 2. print winning set of hyperparameters and the search's best score
#    (scoring = 'r2' in the search, so this is an R2 value)
pprint(xgb_best.get_params())
pprint(xgb_random.best_score_)

In [None]:
# June 9th prediction
# 1. get X and Y
X_j9_dum = j9_dum.drop(columns = ['CHWTON', 'Month', 'Time'])
# `Y_j9` was used below without being assigned anywhere in this notebook; the
# target is the `CHWTON` column removed from the features above.
# NOTE(review): assumes `j9_dum` holds a single `CHWTON` column - confirm the
# frame concatenated into it does not carry one as well.
Y_j9 = j9_dum['CHWTON']

# 2. Ypred
Y_pred_xgb_j9 = xgb_best.predict(X_j9_dum)

# 3. Score: R2 from the estimator's own `score`, RMSE from the predictions
R2_j9_xgb = xgb_best.score(X_j9_dum, Y_j9)
RMSE_j9_xgb = np.sqrt(metrics.mean_squared_error(Y_j9, Y_pred_xgb_j9))

# 4. append to score df as a new column, in the order [R2, RMSE]
# NOTE(review): this requires `scores_df` to have exactly two rows - confirm.
score_J9_xgb = [R2_j9_xgb, RMSE_j9_xgb]
scores_df['XGB_J9_AZ'] = score_J9_xgb
scores_df