# Supervised Learning Energy Modeling

In [55]:
import pandas as pd
import numpy as np
from datetime import datetime

In [56]:
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.metrics import r2_score, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb

#### Feature Importance:
From PyCaret Model Evaluation we find that the top 10 features are:
- Point_34: "KING.CC.A1SAK" - Set Point
- Point_17: "KING.CC.A1LPS" - Total Flow
- Point_37: "KING.CC.A1SAT" - Supply Air Temp
- Point_21: "KING.CC.A1MAT" - Mixed Air Temp
- Point_194: "KING.CC.MUA1SAT" - Make Up Air Unit Supply Temp
- Point_26: "KING.CC.A1MXRT" - Max Room Temp
- Point_22: "KING.CC.A1MNRE" - Min Room Error
- Point_9: "KING.CC.A1CO2" - Return Carbon Dioxide
- Point_13: "KING.CC.A1DP" - Duct Pressure Point
- Point_23: "KING.CC.A1MNRT" - Min Room Temp

But we should also look at:
- Point_10: "KING.CC.A1DAY" - Air Handling Unit Supply Fan
    - Note: This is the only categorical feature. But it could be interesting to look at. If value is ON that means the building is occupied.

This makes 11 features that are found to be most correlated with Hourly Energy Consumption, and should be good predictors. Now to take a closer look at them.

In [57]:
# Load the full modeling dataset from the CSV export.
# NOTE(review): the recorded output below shows a warning fragment; if it is a
# mixed-dtype warning, consider passing explicit dtypes to read_csv — confirm.
data_path = 'model_data_v1.csv'
df = pd.read_csv(data_path)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [58]:
# Keep only the target column and the 11 candidate predictors listed above;
# .copy() detaches the result from the full frame.
target_name = 'Hourly Energy Cons (kWh)'
predictor_names = [
    'Point_34', 'Point_17', 'Point_37', 'Point_21', 'Point_194',
    'Point_26', 'Point_22', 'Point_9', 'Point_13', 'Point_23', 'Point_10',
]
df_feat = df[[target_name] + predictor_names].copy()

In [59]:
# Preview the selected frame (the rich display is truncated to the first and last rows)
df_feat

Unnamed: 0,Hourly Energy Cons (kWh),Point_34,Point_17,Point_37,Point_21,Point_194,Point_26,Point_22,Point_9,Point_13,Point_23,Point_10
0,15.0,17.8,513,18.9,15.5,17.4,21.3,1.7,413,18,18.0,OFF
1,15.0,17.8,513,16.9,15.5,16.8,20.7,0.5,413,18,18.0,OFF
2,10.0,17.8,513,16.9,13.5,16.3,20.7,0.5,413,18,17.4,OFF
3,15.0,17.8,513,16.9,13.5,16.3,20.2,0.5,413,18,17.4,OFF
4,15.0,17.8,513,16.9,13.5,16.3,20.2,0.5,413,18,16.9,ON
...,...,...,...,...,...,...,...,...,...,...,...,...
8754,10.0,24.2,218,24.7,26.0,26.4,25.1,5.2,397,24,22.2,OFF
8755,15.0,24.2,218,24.7,28.0,25.9,24.6,5.2,397,24,22.2,OFF
8756,15.0,24.2,218,24.7,28.0,25.3,24.6,5.2,397,24,22.2,OFF
8757,25.0,24.2,218,24.7,26.0,24.2,24.6,5.2,397,24,22.2,OFF


In [60]:
# Column dtypes; these drive the numeric/categorical split made before preprocessing
df_feat.dtypes

Hourly Energy Cons (kWh)    float64
Point_34                    float64
Point_17                      int64
Point_37                    float64
Point_21                    float64
Point_194                   float64
Point_26                    float64
Point_22                    float64
Point_9                       int64
Point_13                      int64
Point_23                    float64
Point_10                     object
dtype: object

In [61]:
# Count missing values per column
df_feat.isna().sum()

Hourly Energy Cons (kWh)    0
Point_34                    0
Point_17                    0
Point_37                    0
Point_21                    0
Point_194                   0
Point_26                    0
Point_22                    0
Point_9                     0
Point_13                    0
Point_23                    0
Point_10                    0
dtype: int64

In [62]:
# Summary statistics; describe() covers only the numeric columns here, so Point_10 is absent
df_feat.describe()

Unnamed: 0,Hourly Energy Cons (kWh),Point_34,Point_17,Point_37,Point_21,Point_194,Point_26,Point_22,Point_9,Point_13,Point_23
count,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0
mean,24.682612,21.502398,1876.525973,21.093675,20.189999,19.061971,23.206325,-0.267439,446.183811,114.315447,19.323781
std,12.692877,5.174269,1887.45966,4.203152,4.000844,3.800795,1.905135,3.277519,45.515758,126.059888,2.38543
min,0.0,10.0,0.0,9.9,11.3,13.1,19.3,-9.1,396.0,8.0,14.9
25%,15.0,17.1,334.0,17.9,17.3,15.4,21.7,-2.7,410.0,15.0,17.1
50%,20.0,20.7,651.0,20.6,20.1,18.9,23.7,-1.5,447.0,22.0,20.0
75%,35.0,26.0,3565.0,24.0,22.0,21.1,24.4,2.6,477.0,210.0,21.3
max,75.0,32.3,7474.0,32.2,39.4,33.5,34.2,10.1,729.0,478.0,26.2


In [63]:
# Distinct values of the only categorical predictor
df_feat['Point_10'].unique()

array(['OFF', 'ON'], dtype=object)

# Modeling

In [64]:
# Separate the target from the predictors, then hold out 30% of rows for testing.
# NOTE(review): rows look like consecutive hourly readings; a shuffled split can
# place temporally adjacent samples in both train and test — confirm this is intended.
target_col = 'Hourly Energy Cons (kWh)'
X = df_feat.drop(columns=[target_col])
y = df_feat[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [65]:
# Row/column counts of the train and test predictor frames, one per line
print(X_train.shape, X_test.shape, sep='\n')

(6131, 11)
(2628, 11)


In [66]:
# Split the predictor names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_cols = [name for name, col_dtype in X_train.dtypes.items()
                    if col_dtype == "object"]

numerical_cols = [name for name, col_dtype in X_train.dtypes.items()
                  if col_dtype in ['int64', 'float64']]

In [67]:
# Preprocessing for numerical data: standardize every numeric column
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())
                                        ])

# Preprocessing for categorical data: one-hot encode; handle_unknown='ignore'
# keeps transform() from raising on a category that was not seen during fit
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])

# Bundle preprocessing for numerical and categorical data.
# The output columns follow this order: numeric columns first, then one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define models. Every estimator gets an explicit seed so the scores and feature
# importances reported below can be reproduced after a kernel restart
# (the random forest was previously unseeded).
base_model = RandomForestRegressor(random_state=0)
gb_reg = GradientBoostingRegressor(random_state=0)
xgboost = xgb.XGBRegressor(random_state=0)
lgboost = lgb.LGBMRegressor(num_leaves=31, n_estimators=100, max_depth=-1,
                            learning_rate=0.1, random_state=3832, min_child_samples=20,
                            min_child_weight=0.001, min_split_gain=0.0, reg_alpha=0, reg_lambda=0,
                            subsample=1.0, colsample_bytree=1.0)

In [68]:
# Display HTML representation in a jupyter context:
# with display='diagram', fitted estimators/pipelines shown as a cell's last
# expression are rendered as a diagram instead of their text repr.
from sklearn import set_config
set_config(display='diagram')

## Random Forest

In [69]:
# Chain the shared preprocessor with the random forest and fit on the training split.
rf_steps = [
    ('preprocessor', preprocessor),
    ('model', base_model),
]
pipeline = Pipeline(steps=rf_steps)

pipeline.fit(X_train, y_train)

In [70]:
# Random forest predictions of the target for the held-out test rows
base_preds = pipeline.predict(X_test)

In [71]:
# Report, one per line: pipeline.score (R^2 for a regressor, hence identical to
# the explicit r2_score that follows) and the mean absolute error.
rf_metrics = (
    pipeline.score(X_test, y_test),
    r2_score(y_test, base_preds),
    mean_absolute_error(y_test, base_preds),
)
for metric_value in rf_metrics:
    print(metric_value)

0.8792000343829871
0.8792000343829871
3.1650859179189657


In [72]:
# Side-by-side view of actual vs. predicted values, indexed like y_test
df_scores_comp = y_test.to_frame('Actual').assign(Predicted=base_preds)
df_scores_comp

Unnamed: 0,Actual,Predicted
5485,40.0,44.000000
1940,25.0,21.050000
4033,15.0,17.420476
1294,15.0,16.650000
2797,35.0,35.200000
...,...,...
5932,15.0,15.025000
2860,15.0,15.075000
4397,25.0,24.150000
7849,10.0,14.733333


In [73]:
# Feature importances of the fitted random forest (the pipeline's 'model' step)
importances = pipeline.named_steps['model'].feature_importances_

In [74]:
# One importance per *transformed* column: the 10 scaled numeric columns followed
# by the 2 one-hot columns of Point_10 (12 values for 11 input columns).
importances

array([0.05433243, 0.64544846, 0.02447087, 0.02283831, 0.05575828,
       0.02638714, 0.01945772, 0.03032405, 0.01704436, 0.01807205,
       0.04768459, 0.03818174])

In [75]:
# most important feature
# The model is trained on the ColumnTransformer output (numeric columns first,
# then one one-hot column per category), so the importances must be mapped
# through that column order rather than through X_train.columns with a
# hard-coded position.
fitted_onehot = (pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot'])
onehot_cols = [f"{col}_{category}"
               for col, categories in zip(categorical_cols, fitted_onehot.categories_)
               for category in categories]
transformed_cols = list(numerical_cols) + onehot_cols

# Pick the column with the largest importance instead of assuming its index
transformed_cols[int(np.argmax(importances))]

'Point_17'

## Gradient Boost

In [76]:
# Chain the shared preprocessor with the gradient-boosting regressor and fit
# on the training split.
gb_steps = [
    ('preprocessor', preprocessor),
    ('model', gb_reg),
]
gb_pipeline = Pipeline(steps=gb_steps)

gb_pipeline.fit(X_train, y_train)

In [77]:
# Gradient-boosting predictions of the target for the held-out test rows
gb_preds = gb_pipeline.predict(X_test)

In [78]:
# Report, one per line: pipeline.score (R^2 for a regressor, hence identical to
# the explicit r2_score that follows) and the mean absolute error.
gb_metrics = (
    gb_pipeline.score(X_test, y_test),
    r2_score(y_test, gb_preds),
    mean_absolute_error(y_test, gb_preds),
)
for metric_value in gb_metrics:
    print(metric_value)

0.8578936923738337
0.8578936923738337
3.446681262695651


## XG Boost

In [79]:
# Chain the shared preprocessor with the XGBoost regressor and fit on the
# training split.
xgb_steps = [
    ('preprocessor', preprocessor),
    ('model', xgboost),
]
xgb_pipeline = Pipeline(steps=xgb_steps)

xgb_pipeline.fit(X_train, y_train)

In [80]:
# XGBoost predictions of the target for the held-out test rows
xgb_preds = xgb_pipeline.predict(X_test)

In [81]:
# Report, one per line: pipeline.score (R^2 for a regressor, hence identical to
# the explicit r2_score that follows) and the mean absolute error.
xgb_metrics = (
    xgb_pipeline.score(X_test, y_test),
    r2_score(y_test, xgb_preds),
    mean_absolute_error(y_test, xgb_preds),
)
for metric_value in xgb_metrics:
    print(metric_value)

0.8764677172071248
0.8764677172071248
3.2584782357992466


## Light GBM

In [82]:
# Chain the shared preprocessor with the LightGBM regressor and fit on the
# training split.
lgb_steps = [
    ('preprocessor', preprocessor),
    ('model', lgboost),
]
lgb_pipeline = Pipeline(steps=lgb_steps)

lgb_pipeline.fit(X_train, y_train)

In [83]:
# LightGBM predictions of the target for the held-out test rows
lgb_preds = lgb_pipeline.predict(X_test)

In [84]:
# Report, one per line: pipeline.score (R^2 for a regressor, hence identical to
# the explicit r2_score that follows) and the mean absolute error.
lgb_metrics = (
    lgb_pipeline.score(X_test, y_test),
    r2_score(y_test, lgb_preds),
    mean_absolute_error(y_test, lgb_preds),
)
for metric_value in lgb_metrics:
    print(metric_value)

0.8818669594320333
0.8818669594320333
3.132556688764678


In [85]:
# save model for deployment
# The whole fitted pipeline (preprocessing + LightGBM model) is serialized, so
# the loaded object can be given raw feature rows directly.
import pickle

filename = 'lgb_model.sav'
# Context manager guarantees the handle is flushed and closed, even if
# pickling raises part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_pipeline, model_file)

In [None]:
# to load model (open in binary *read* mode; 'wb' would truncate the saved file)
#loaded_model = pickle.load(open(filename, 'rb'))