<!-- Data: Combined with added features, Shades and Radiation. <br>
Split: Train, Validate, Test -->

# 1. Import Library - Load Data - Functions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# parameters search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# models
from sklearn.ensemble import RandomForestRegressor

# explain
from sklearn.tree import export_graphviz

from datetime import datetime
import os
import pathlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.pyplot import figure

In [2]:
## FUNCTIONS USED IN NOTEBOOK ##

def undummify(df, prefix_sep="_"):
    """Collapse one-hot (dummy) columns back into single categorical columns.

    Every column whose name contains ``prefix_sep`` is treated as a dummy
    column named ``<prefix><prefix_sep><category>``. For each prefix, the
    category of the column holding the row-wise maximum is returned in a
    column named after the prefix. Columns without the separator are passed
    through unchanged.

    Parameters:
        df (pd.DataFrame): Frame containing dummy columns and/or plain columns.
        prefix_sep (str): Separator between the prefix and the category.

    Returns:
        pd.DataFrame: One column per prefix / plain column, in order of first
        appearance in ``df``.
    """
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            # Select only columns that really start with "<prefix><sep>".
            # A substring match (df.filter(like=col)) would also pick up
            # unrelated columns whose name merely contains the prefix.
            dummy_prefix = col + prefix_sep
            dummy_cols = [c for c in df.columns if c.startswith(dummy_prefix)]
            undummified = (
                df[dummy_cols]
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [None]:
# Input CSV locations, relative to the notebook. These replace an earlier
# loader that pointed at a user-specific absolute path.
filepath_summer = "../Data/microclimate_model/Combined/summer_limited2.csv"
filepath_june9_trees = "../Data/microclimate_model/Combined/June_9_trees.csv"
filepath_bldg1 = "../Data/microclimate_model/Combined/all_buildings_limited.csv"

# Read the summer data and discard the 'Unnamed: 0' column (presumably a saved
# row index) together with the raw 'CHWTON' column, then parse the timestamps.
Bldg = pd.read_csv(filepath_summer).drop(columns=['Unnamed: 0', 'CHWTON'])
Bldg['Date_Time'] = pd.to_datetime(Bldg['Date_Time'])

In [None]:
# >>>>>> not needed
# Tree_X = pd.read_csv("/Users/alialyakoob/Dropbox/Mac/Desktop/ASU/RA/Data/Model_Data/microclimate_model/Combined/Trees_Scenario/June_9_trees.csv")
# Tree_X = Tree_X.drop(columns = ['Unnamed: 0','CHWTON','CHWTON/SQFT' ])
# Tree_X['Date_Time'] = pd.to_datetime(Tree_X['Date_Time'])
# Tree_X.set_index(Tree_X['Date_Time'], inplace=True)

# 2. Data Visualization

In [None]:
# Add CHWTON/SQM: convert the per-square-foot load to a per-square-metre load
# (1 sq ft = 0.092903 sq m).
SQM_PER_SQFT = 0.092903
Bldg['CHWTON/SQM'] = Bldg['CHWTON/SQFT'] / SQM_PER_SQFT


In [None]:
Bldg.info()
Bldg.describe()

In [None]:
Bldg.columns

In [None]:
sns.displot(Bldg, x="CHWTON/SQM", hue="bldgname", kind="kde", fill=True, height=10, aspect=2)

In [None]:
plt.figure(figsize=(10, 10))
Bldg[['Air Temp', 'Abs Hum']].boxplot()

Set Date_Time as index

In [None]:
# Index the frame by timestamp; passing the Series (not the column name)
# keeps 'Date_Time' available as a regular column as well.
Bldg = Bldg.set_index(Bldg['Date_Time'])

In [None]:
# Rows belonging to the three buildings of interest.
b = Bldg[Bldg['bldgname'].isin(['ISTB 4', 'Psychology', 'Psychology North'])]

In [None]:
b[b['bldgname'] == 'ISTB 4']

Visualize Time Series By Day

In [None]:
# One row of panels per calendar day present in the data:
#   left  - each building's CHWTON/SQM divided by its own daily maximum
#   right - air temperature for each building
b = Bldg

# Size the grid from the data instead of hard-coding the number of days, so
# the loop cannot index past the last row of axes.
n_days = len(b[['Month', 'Day']].drop_duplicates())
fig, ax = plt.subplots(max(n_days, 1), 2, figsize=(20, 8 * max(n_days, 1)), squeeze=False)

counter = -1
for i in b['Month'].unique():
    m = b[(b['Month'] == i)]
    for z in m['Day'].unique():
        d = m[(m['Day'] == z)]
        counter += 1
        day_title = str(i) + '/' + str(z)
        for un in d['bldgname'].unique():
            bl = d[(d['bldgname'] == un)]
            # `un` is the building name itself; using it avoids the positional
            # `series[0]` lookup, which is deprecated on a datetime index.
            ax[counter, 0].plot((bl['CHWTON/SQM'] / bl['CHWTON/SQM'].max()), label=un)
            ax[counter, 1].plot(bl['Air Temp'], label=un)
        for panel in ax[counter]:
            panel.set_title(day_title, fontsize=15)
            panel.legend()

Clean Data based on Visualization

In [None]:
# Remove the time windows and buildings excluded after inspecting the daily
# plots. Boolean masks are used instead of label-based `.loc[...]`/`.drop(...)`
# so the cell does not raise KeyError when a timestamp is absent (for example
# when the cell is run a second time) and so a timestamp matching a single row
# is handled the same way as one matching several rows.
bad_windows = [
    ('2018-06-27 05:00:00', '2018-06-27 12:30:00'),
    ('2018-08-01 05:00:00', '2018-08-01 20:45:00'),
    ('2018-08-03 05:00:00', '2018-08-03 20:45:00'),
]
bad_stamps = pd.DatetimeIndex([pd.Timestamp('2018-06-27 20:00:00')])
for window_start, window_end in bad_windows:
    bad_stamps = bad_stamps.union(pd.date_range(window_start, window_end, freq='15min'))

excluded_buildings = ['Biodesign C', 'Noble Library', 'Bulldog Hall']

Bldg = Bldg[~Bldg.index.isin(bad_stamps)]
Bldg = Bldg[~Bldg['bldgname'].isin(excluded_buildings)]

In [None]:
# Same per-day panels as before cleaning:
#   left  - each building's CHWTON/SQM divided by its own daily maximum
#   right - air temperature for each building
b = Bldg

# Size the grid from the data instead of hard-coding the number of days, so
# the loop cannot index past the last row of axes.
n_days = len(b[['Month', 'Day']].drop_duplicates())
fig, ax = plt.subplots(max(n_days, 1), 2, figsize=(20, 10 * max(n_days, 1)), squeeze=False)

counter = -1
for i in b['Month'].unique():
    m = b[(b['Month'] == i)]
    for z in m['Day'].unique():
        d = m[(m['Day'] == z)]
        counter += 1
        day_title = str(i) + '/' + str(z)
        for un in d['bldgname'].unique():
            bl = d[(d['bldgname'] == un)]
            # `un` is the building name itself; using it avoids the positional
            # `series[0]` lookup, which is deprecated on a datetime index.
            ax[counter, 0].plot((bl['CHWTON/SQM'] / bl['CHWTON/SQM'].max()), label=un)
            ax[counter, 1].plot(bl['Air Temp'], label=un)
        for panel in ax[counter]:
            panel.set_title(day_title, fontsize=15)
            panel.legend()

In [None]:
# One panel per building, one CHWTON/SQFT-vs-Hour curve per day.
# `b` is the frame assigned in the preceding plotting cell.
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

# The third panel previously plotted 'Psychology' a second time; it now shows
# 'Psychology North', the third building used throughout the notebook.
panel_buildings = ['ISTB 4', 'Psychology', 'Psychology North']

for i in b['Month'].unique():
    m = b[(b['Month'] == i)]
    for z in m['Day'].unique():
        d = m[(m['Day'] == z)]
        # .iloc[0] instead of [0]: positional access through [] is deprecated
        # on a datetime-indexed Series.
        day_label = str(d['Month'].iloc[0]) + '/' + str(d['Day'].iloc[0])
        for panel, bldg_name in zip(axes, panel_buildings):
            day_rows = d[d['bldgname'] == bldg_name]
            if day_rows.empty:
                # Nothing recorded for this building on this day.
                continue
            day_rows.plot(y='CHWTON/SQFT', x='Hour', ax=panel, label=day_label);

for panel, bldg_name in zip(axes, panel_buildings):
    panel.set_title(bldg_name)


In [None]:
# Pairwise correlation of the numeric columns. Non-numeric columns (building
# name, timestamp) are excluded explicitly because DataFrame.corr() no longer
# drops them silently in pandas >= 2.0 and raises instead.
corrMatrix = Bldg.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15,15))
sns.heatmap(corrMatrix, cmap='RdYlGn', annot = True, linewidths = 1)
plt.show()

In [None]:
# ## CHOOSE Y TO PLOT ##
# y = 'HTmmBTU'
# df = J9.copy()
# df = undummify(df)
# for i in df['bldgname'].unique():
#     b = df[df['bldgname'] == i]
#     c = undummify(Scen1)
#     c = (c[c['bldgname'] == i])
#     ax = b.plot(y = y, label = "Baseline", 
#                 ylabel = y, title = i)
#     c.plot(y = y, label = "Scenario",
#          ylabel = y, ax=ax)

# 3. Random Forests Model

In [None]:
### PREPROCESSING DATA FOR MODEL ###
# Work on a copy of the cleaned data without the columns that are not used
# as model inputs.
MD = Bldg.copy().drop(columns=['Rel Hum', 'Minute', 'CHWTON/SQFT'])

# Keep only ISTB 4, Psychology and Psychology North.
model_buildings = ['ISTB 4', 'Psychology', 'Psychology North']
MD = MD[MD['bldgname'].isin(model_buildings)]

# One-hot encode the building name (creates the 'bldgname_*' columns).
MD = pd.get_dummies(MD)

# Hold out June 9 as the test day, then remove it from the modelling data.
J9 = MD.loc['2018-06-09']
june9_rows = MD[(MD['Day'] == 9) & (MD['Month'] == 6)]
MD = MD.drop(june9_rows.index)

# Sanity check: this should display no rows.
MD.loc['2018-06-09']

In [None]:
J9

In [None]:
# Split the held-out June 9 data by building using the one-hot columns.
ISTB4 = J9.loc[J9['bldgname_ISTB 4'] == 1]
Psyc = J9.loc[J9['bldgname_Psychology'] == 1]
Psyc_N = J9.loc[J9['bldgname_Psychology North'] == 1]

In [None]:
# June 9 absolute humidity, one line per building on a shared axis.
yplot = 'Abs Hum'
ylab = 'Abs Hum'
title = 'Absolute Humidity - June 9, 2018'
fig, ax = plt.subplots()
for frame, bldg_label in ((Psyc, 'Psychology'), (Psyc_N, 'Psychology North'), (ISTB4, 'ISTB 4')):
    frame.plot(x='Date_Time', y=yplot, label=bldg_label,
               xlabel='Time', ylabel=ylab, title=title, ax=ax)

In [None]:
# June 9 air temperature, one line per building on a shared axis.
yplot = 'Air Temp'
ylab = 'Air Temp'
title = 'Air Temperature - June 9, 2018'
fig, ax = plt.subplots()
for frame, bldg_label in ((Psyc, 'Psychology'), (Psyc_N, 'Psychology North'), (ISTB4, 'ISTB 4')):
    frame.plot(x='Date_Time', y=yplot, label=bldg_label,
               xlabel='Time', ylabel=ylab, title=title, ax=ax)

In [None]:
# June 9 'DSW South' values, one line per building on a shared axis.
yplot = 'DSW South'
ylab = 'DSW South'
title = 'Direct Shortwave (South) - June 9, 2018'
fig, ax = plt.subplots()
for frame, bldg_label in ((Psyc, 'Psychology'), (Psyc_N, 'Psychology North'), (ISTB4, 'ISTB 4')):
    frame.plot(x='Date_Time', y=yplot, label=bldg_label,
               xlabel='Time', ylabel=ylab, title=title, ax=ax)

In [None]:
# June 9 'Shade South' values, one line per building on a shared axis.
yplot = 'Shade South'
ylab = 'Shade South'
title = 'Shade (South) - June 9, 2018'
fig, ax = plt.subplots()
for frame, bldg_label in ((Psyc, 'Psychology'), (Psyc_N, 'Psychology North'), (ISTB4, 'ISTB 4')):
    frame.plot(x='Date_Time', y=yplot, label=bldg_label,
               xlabel='Time', ylabel=ylab, title=title, ax=ax)

In [None]:
J9

In [None]:
# export MD as csv
# dropcols = ['Date_Time', 'Month', 'Hour', 'Day']
# MD_export =  MD.drop(columns=dropcols)
# train_filepath ="././Data/microclimate_model/Combined/three_bldgs_dropped.csv"
# MD_export.to_csv(train_filepath)

# # export J9 as csv
# test_filepath ="././Data/microclimate_model/Combined/three_bldgs_J9_dropped.csv"
# J9_export = J9.drop(columns=dropcols)
# J9_export.to_csv(test_filepath)

## 3.1 RandomForests no Tuning

In [None]:
## TRAIN-VALIDATE SPLIT --> FIT RF MODEL --> GET SCORE ##

# Target is the per-square-metre load; the target itself and the
# date/time bookkeeping columns are excluded from the feature matrix.
dropcols = ['CHWTON/SQM', 'Date_Time', 'Month', 'Hour', 'Day']
y = MD['CHWTON/SQM']
X = MD.drop(columns=dropcols)

# Random 70/30 train/validation split with a fixed seed.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.3, random_state=20
)

# Untuned baseline forest: 100 trees, fixed seed, out-of-bag scoring enabled.
RF_base = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)
RF_base.fit(X_train, y_train)

In [None]:
## FEATURE IMPORTANCE ##
# Importances of the baseline forest, largest first.
base_importance = pd.DataFrame({
    "Features": RF_base.feature_names_in_,
    "Feature Importance": RF_base.feature_importances_,
})
base_importance.sort_values(by='Feature Importance', ascending=False)

In [None]:
# Print the hyper-parameters the baseline forest was built with.
for param_name in ('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf',
                   'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes',
                   'min_impurity_decrease', 'random_state', 'ccp_alpha'):
    print(param_name + ': ', getattr(RF_base, param_name))

In [None]:
## TEST AND VALIDATION SCORES ##

# Validation score (R^2 from the regressor's .score) on the random hold-out split
scoreValidate = RF_base.score(X_validate, y_validate)

# Test score on the whole held-out day (June 9, all buildings)
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']
Y_pred = RF_base.predict(X_test)
scoreTest = RF_base.score(X_test, Y_test)

# Test score per building. The per-building features/targets are passed to
# .score() directly rather than assigned to X_test, so X_test keeps holding
# the full June 9 feature matrix that matches Y_test after this cell.
J9B = []
ScoresT = []
for i in ['ISTB 4', 'Psychology', 'Psychology North']:
    b = J9[J9['bldgname_' + i] == 1]
    J9B.append(b)
    ScoresT.append(RF_base.score(b.drop(columns=dropcols), b['CHWTON/SQM']))

AllBuildingsTest = pd.DataFrame({'Building':['ISTB 4', 'Psychology', 'Psychology North'], 
                                 '[Test Scores (June 9) Individual]': ScoresT,
                                 '[Test Score (June 9) All]' : scoreTest,
                                 '[Validation Score All]': scoreValidate})
AllBuildingsTest

In [None]:
### JUNE 9 SENSITIVITY ###
# Build a perturbed copy of the June 9 data and predict it with the baseline
# forest.

## AIR TEMP AND ABS HUM ##
AirTdelta = -0.5
AbsHdelta = 0
Scen1 = J9.copy()
Scen1['Air Temp'] += AirTdelta
Scen1['Abs Hum'] += AbsHdelta

## SHADING ##
shadred = 0

# SINGLE FACADE: shift the shade value and cap it at 1.
f = 'Shade South'
Scen1[f] = (Scen1[f] + shadred).clip(upper=1)

# ALL FACADES (disabled):
# for facade in ['Shade East', 'Shade West', 'Shade North', 'Shade South']:
#     Scen1[facade] = (Scen1[facade] + shadred).clip(upper=1)

X_scentest = Scen1.drop(columns=dropcols)
Y_scenpred = RF_base.predict(X_scentest)

In [None]:
## PREDICTION PLOTS ##
# Baseline forest: predicted vs. actual CHWTON/SQM on June 9, one figure per
# building, with the relative difference of the daily means in the title.
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']
Y_pred = RF_base.predict(X_test)

df = X_test.copy()
for col_name, values in (('Predicted', Y_pred), ('Actual', Y_test), ('Scenario', Y_scenpred)):
    df[col_name] = values
df = undummify(df)

for i in df['bldgname'].unique():
    b = df[df['bldgname'] == i]
    actual_mean = b['Actual'].mean()
    rel_diff = ((b['Predicted'].mean() - actual_mean) / actual_mean) * 100
    l = str(round(rel_diff, 2)) + " %"
    ax = b.plot(y='Predicted', label="Predicted", ylabel='CHWTON/SQM',
                title=i + "    [Avg Diff:  " + l + ']')
    b.plot(y='Actual', label='Actual', ax=ax)

In [None]:
## SCENARIO PLOTS ##
# Baseline forest: scenario prediction vs. unperturbed prediction on June 9,
# one figure per building, with the relative difference of the daily means in
# the title.
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']
Y_pred = RF_base.predict(X_test)

df = X_test.copy()
for col_name, values in (('Predicted', Y_pred), ('Actual', Y_test), ('Scenario', Y_scenpred)):
    df[col_name] = values
df = undummify(df)

for i in df['bldgname'].unique():
    b = df[df['bldgname'] == i]
    predicted_mean = b['Predicted'].mean()
    rel_diff = ((b['Scenario'].mean() - predicted_mean) / predicted_mean) * 100
    l = str(round(rel_diff, 2)) + " %"
    ax = b.plot(y='Predicted', label="Predicted", ylabel='CHWTON/SQM',
                title=i + "    [Avg Diff:  " + l + ']')
    b.plot(y='Scenario', label='Scenario', ax=ax)

## 3.2 Random Forests Tuned

In [None]:
# Hyper-parameter distributions for the randomized search.
param_test = {'n_estimators': [100,200,300,400],
              'criterion': ['squared_error', 'absolute_error', 'poisson'],
              'max_depth': [None, 100,200,300],
              # 1.0 (all features) replaces 'auto', which meant the same thing for
              # regressors and was removed in scikit-learn 1.3.
              # NOTE(review): the trailing integer 1 means "one feature per split",
              # not "100 % of the features" - confirm that is intended.
              'max_features': [1.0, 'log2', 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
             }

# Seed the estimator as well as the search so repeated runs give the same result.
rf = RandomForestRegressor(random_state=42)
rfgs = RandomizedSearchCV(
    estimator = rf, param_distributions=param_test,
    cv=3,
    refit=True,
    random_state=42,
    verbose=True)
rfgs.fit(X_train, y_train)

In [None]:
rfgs.best_params_

In [None]:
# Tuned forest with hard-coded hyper-parameters (presumably copied from an
# earlier run of the randomized search - verify against rfgs.best_params_).
tuned_params = {
    'n_estimators': 400,
    'max_features': 0.6,
    'random_state': 42,
    'max_depth': 300,
    'criterion': 'absolute_error',
}
RF_tuned = RandomForestRegressor(**tuned_params)
RF_tuned.fit(X_train, y_train)

In [None]:
# save the best model
import pickle
filename = 'rf_ali.sav'
# Context managers close the file handles; the bare open() calls left them
# open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(RF_tuned, model_file)

# try load (round-trip check of the file just written)
loaded_model_name = "rf_ali"
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
## FEATURE IMPORTANCE ##
# Importances of the tuned forest, largest first.
tuned_importance = pd.DataFrame({
    "Features": RF_tuned.feature_names_in_,
    "Feature Importance": RF_tuned.feature_importances_,
})
tuned_importance.sort_values(by='Feature Importance', ascending=False)

In [None]:
# Print the hyper-parameters the tuned forest was built with.
for param_name in ('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf',
                   'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes',
                   'min_impurity_decrease', 'random_state', 'ccp_alpha'):
    print(param_name + ': ', getattr(RF_tuned, param_name))

In [None]:
## TEST AND VALIDATION SCORES ##

# Validation score (R^2 from the regressor's .score) on the random hold-out split
scoreValidate = RF_tuned.score(X_validate, y_validate)

# Test score on the whole held-out day (June 9, all buildings)
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']
Y_pred = RF_tuned.predict(X_test)
scoreTest = RF_tuned.score(X_test, Y_test)

# Test score per building. The per-building features/targets are passed to
# .score() directly rather than assigned to X_test, so X_test keeps holding
# the full June 9 feature matrix that matches Y_test after this cell.
J9B = []
ScoresT = []
for i in ['ISTB 4', 'Psychology', 'Psychology North']:
    b = J9[J9['bldgname_' + i] == 1]
    J9B.append(b)
    ScoresT.append(RF_tuned.score(b.drop(columns=dropcols), b['CHWTON/SQM']))

AllBuildingsTest = pd.DataFrame({'Building':['ISTB 4', 'Psychology', 'Psychology North'], 
                                 '[Test Scores (June 9) Individual]': ScoresT,
                                 '[Test Score (June 9) All]' : scoreTest,
                                 '[Validation Score All]': scoreValidate})
AllBuildingsTest

# 4.0 Scenario Setting

In [None]:
### JUNE 9 SENSITIVITY ###
# Build a perturbed copy of the June 9 data and predict it with the tuned
# forest. Only the air-temperature shift is active in this cell.

## AIR TEMP AND ABS HUM ##
AirTdelta = -0.5
AbsHdelta = -0.5  # defined but not applied here
Scen1 = J9.copy()
Scen1['Air Temp'] += AirTdelta
# Scen1['Abs Hum'] += AbsHdelta

## SHADING ##
shadred = 0.2  # defined but not applied here

# SINGLE FACADE (disabled):
# f = 'Shade East'
# Scen1[f] = (Scen1[f] + shadred).clip(upper=1)

# ALL FACADES (disabled):
# for facade in ['Shade East', 'Shade West', 'Shade North', 'Shade South']:
#     Scen1[facade] = (Scen1[facade] + shadred).clip(upper=1)

X_scentest = Scen1.drop(columns=dropcols)
Y_scenpred = RF_tuned.predict(X_scentest)

# 4.1 Tuned Plot: Prediction VS Actual

In [None]:
## PREDICTION PLOTS ##
# Tuned forest: predicted vs. actual CHWTON/SQM on June 9, one figure per
# building; each building's frame is also collected in `blist`.
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']
Y_pred = RF_tuned.predict(X_test)

df = X_test.copy()
for col_name, values in (('Predicted', Y_pred), ('Actual', Y_test)):
    df[col_name] = values
df = undummify(df)

blist = []
for i in df['bldgname'].unique():
    b = df[df['bldgname'] == i]
    blist.append(b)
    actual_mean = b['Actual'].mean()
    rel_diff = ((b['Predicted'].mean() - actual_mean) / actual_mean) * 100
    l = str(round(rel_diff, 2)) + " %"
    ax = b.plot(y='Predicted', label="Predicted", ylabel='CHWTON/SQM',
                title=i + "    [Avg Diff:  " + l + ']')
    b.plot(y='Actual', label='Actual', ax=ax)

In [None]:
# b = blist[2]

# l = str(round(((b['Predicted'].mean() - 
#                 b['Actual'].mean())/b['Actual'].mean())*100,2)) + " %"
# ax = b.plot(y = 'Predicted', label = "Predicted", 
#             ylabel = 'CHWTON/SQM', title = b['bldgname'][0] + "    [Avg Diff:  " + l + ']')
# b.plot(y = 'Actual', label = 'Actual', ax=ax)

In [None]:
# X_test['Shade North'].plot()

In [None]:
# X_scentest['Shade North'].plot()

# 4.2 Scenario Plot

## 4.2.1 RF

In [None]:
## SCENARIO PLOTS RF ##
# Tuned forest: scenario prediction vs. unperturbed prediction on June 9,
# one figure per building.
X_test = J9.drop(columns=dropcols)
Y_test = J9['CHWTON/SQM']


Y_pred = RF_tuned.predict(X_test)

df = X_test.copy()
df['Predicted'] = Y_pred
df['Scenario'] = Y_scenpred
df = undummify(df)
cols_to_keep = ["bldgname", "Scenario", "Predicted"]
# `columns=` keyword instead of the positional axis argument `1`, which
# DataFrame.drop no longer accepts in pandas >= 2.0.
df = df.drop(columns=df.columns.difference(cols_to_keep))

for i in df['bldgname'].unique():
    b = df[df['bldgname'] == i]
    l = str(round(((b['Scenario'].mean() - 
                    b['Predicted'].mean())/b['Predicted'].mean())*100,2)) + " %"
    ax = b.plot(y = 'Predicted', label = "Predicted", 
                ylabel = 'CHWTON/SQM', title = i + "    [Avg Diff:  " + l + ']')
    b.plot(y = 'Scenario', label = 'Scenario', ax=ax)


## 4.2.2 CB

In [None]:
# Load the previously saved model used in the "CB" section.
MODEL_PATH = 'cb_best.sav'
cb_name = "cb_best"
# NOTE(review): unpickling executes code stored in the file - only load model
# files from a trusted source.
# The context manager closes the file handle after loading.
with open(MODEL_PATH, 'rb') as model_file:
    cb_best = pickle.load(model_file)

In [None]:
### JUNE 9 SENSITIVITY ###
# Build a perturbed copy of the June 9 data and predict it with cb_best.
# Only the west-facade shading change is active in this cell.

## AIR TEMP AND ABS HUM ##
AirTdelta = -0.5
AbsHdelta = -0.5
Scen1 = J9.copy()
# Scen1['Air Temp'] = Scen1['Air Temp'] + AirTdelta
# Scen1['Abs Hum'] = Scen1['Abs Hum'] + AbsHdelta

## SHADING ##
shadred = 0.2

# SINGLE FACADE #
f = 'Shade West'
Scen1[f] = Scen1[f] + shadred
Scen1.loc[Scen1[f] > 1, f] = 1  # cap the shade value at 1

# ALL FACADES #
# shadcol = ['Shade East', 'Shade West', 'Shade North', 'Shade South']
# for i in shadcol: 
#     Scen1[i] = Scen1[i] + shadred
#     Scen1.loc[Scen1[i] > 1, i] = 1


X_scentest = Scen1.drop(columns=dropcols)
# The baseline CB prediction in the plotting cell removes "bldgname_ISTB 4"
# before calling cb_best.predict; the scenario input must use the same feature
# set, otherwise baseline and scenario are predicted from different columns.
# NOTE(review): confirm against the features cb_best was trained on.
X_scentest_cb = X_scentest.drop(columns=["bldgname_ISTB 4"])
Y_scenpred = cb_best.predict(X_scentest_cb)

In [None]:
## SCENARIO PLOTS CB##
# Assemble actual, baseline-predicted and scenario-predicted values per
# building for the CB model.
X_test = J9.drop(columns=dropcols)
# cb_best is fed the features without the "bldgname_ISTB 4" dummy column.
X_test_cb = X_test.drop(columns = ["bldgname_ISTB 4"])
Y_test = J9['CHWTON/SQM']


Y_pred = cb_best.predict(X_test_cb)

df = X_test.copy()
df["Actual"] = Y_test
df['Predicted'] = Y_pred
df['Scenario'] = Y_scenpred
df = undummify(df)
cols_to_keep = ["bldgname", "Scenario", "Predicted", "Actual"]
# `columns=` keyword instead of the positional axis argument `1`, which
# DataFrame.drop no longer accepts in pandas >= 2.0.
df = df.drop(columns=df.columns.difference(cols_to_keep))
df


In [None]:
# CB model: scenario prediction vs. unperturbed prediction, one figure per
# building. Each building's frame is appended to the existing `blist`.
for i in df['bldgname'].unique():
    b = df[df['bldgname'] == i]
    blist.append(b)
    predicted_mean = b['Predicted'].mean()
    rel_diff = ((b['Scenario'].mean() - predicted_mean) / predicted_mean) * 100
    l = str(round(rel_diff, 2)) + " %"
    ax = b.plot(y='Predicted', label="Predicted", ylabel='CHWTON/SQM',
                title=i + "    [Avg Diff:  " + l + ']')
    b.plot(y='Scenario', label='Scenario', ax=ax)


In [None]:
# b = blist[2]
# i = b['bldgname'][0]
# l = str(round(((b['Scenario'].mean() - 
#                 b['Predicted'].mean())/b['Predicted'].mean())*100,2)) + " %"
# ax = b.plot(y = 'Predicted', label = "Predicted", 
#             ylabel = 'CHWTON/SQM', title = i + "    [Avg Diff:  " + l + ']')
# b.plot(y = 'Scenario', label = 'Scenario', ax=ax)

## Model Interpretation: SHAP

In [None]:
## Using Feature Importance
# Pair each importance of the tuned forest with its training column name,
# rank them from largest to smallest, and plot the top 20 as horizontal bars.
ranked_pairs = sorted(zip(RF_tuned.feature_importances_, X_train.columns), reverse = True)
feature_imp = pd.DataFrame(ranked_pairs, columns = ['Value', 'Feature'])
top_features = feature_imp[:20].sort_values(by='Value', ascending=False)

plt.figure(figsize=(7,5))
sns.barplot(data=top_features, x='Value', y='Feature')
plt.tight_layout()
plt.show()

In [None]:
import math
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
def MBE(y_true, y_pred):
    '''
    Compute the mean bias error, i.e. the mean of (y_pred - y_true).
    The value is printed and also returned.

    Parameters:
        y_true (array): Array of observed values
        y_pred (array): Array of prediction values

    Returns:
        mbe (float): Bias score (positive when predictions are higher than
            the observations on average)
    '''
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Force both inputs to column vectors so the subtraction is element-wise.
    y_true = y_true.reshape(len(y_true),1)
    y_pred = y_pred.reshape(len(y_pred),1)   
    diff = (y_pred-y_true)
    mbe = diff.mean()
    print('MBE = ', mbe)
    # Return the score as documented; previously the function returned None.
    return mbe

In [None]:
# Baseline forest metrics on X_test / Y_test as left by the preceding cells:
# mean bias error (printed by MBE), R^2 and RMSE.
pred = RF_base.predict(X_test)
mse = mean_squared_error(Y_test, pred)
r2 = r2_score(Y_test, pred)
rmse = math.sqrt(mse)
MBE(Y_test, pred)
print("r2 score: ", r2)
print("rmse score: ", rmse)

In [None]:
pred

In [None]:
# Tuned forest metrics on X_test / Y_test as left by the preceding cells:
# mean bias error (printed by MBE), R^2 and RMSE.
pred = RF_tuned.predict(X_test)
mse = mean_squared_error(Y_test, pred)
r2 = r2_score(Y_test, pred)
rmse = math.sqrt(mse)
MBE(Y_test, pred)
print("tuned r2 score: ", r2)
print("tuned rmse score: ", rmse)

In [None]:
import shap

# Tree explainer for the tuned forest, configured without a background
# dataset ('tree_path_dependent' perturbation, raw model output).
explainer = shap.TreeExplainer(
    RF_tuned,
    data=None,
    model_output='raw',
    feature_perturbation='tree_path_dependent',
)

# SHAP values for every row of X_test.
shap_values = explainer.shap_values(X_test)

In [None]:
# visualize the explanation of a single prediction: row i of X_test
# (i = 7 selects the eighth row, not the first)
shap.initjs()
i = 7
shap.force_plot(explainer.expected_value, shap_values[i,:], X_test.iloc[i,:])

In [None]:
# visualize the explanations for all rows of X_test
# (these are the rows the SHAP values were computed on, not the training set)
shap.force_plot(explainer.expected_value, shap_values, X_test)

In [None]:
# SHAP summary plot for X_test; show=False is passed so the figure can still
# be adjusted by the two lines below.
shap.summary_plot(shap_values, X_test, plot_size = [8,6], show = False)
# Adjust the aspect of the last axes in the current figure
# (presumably the colour bar added by summary_plot - verify).
plt.gcf().axes[-1].set_aspect(10)
plt.gcf().axes[-1].set_box_aspect(10)