In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
import os
import pathlib

# Load Data

In [335]:
# Directory holding one CSV file per building. The default is the path used
# when this notebook was written; set the MICROCLIMATE_DATA_DIR environment
# variable to run the notebook against another location without editing it.
DATA_DIR = pathlib.Path(
    os.environ.get(
        "MICROCLIMATE_DATA_DIR",
        "/Users/alialyakoob/Desktop/ASU/RA/Data/Model_Data/microclimate_model/Combined/dataset1",
    )
)

# Create a list to add dataframes
Bldg = []

# Read all building data and append to Bldg list.
# iterdir() yields entries in arbitrary order, so the paths are sorted to keep
# the building numbers printed below stable from run to run.
for path in sorted(DATA_DIR.iterdir()):
    # Skip sub-directories and hidden files (e.g. macOS .DS_Store), which are not data
    if not path.is_file() or path.name.startswith("."):
        continue
    current_file = pd.read_csv(path)
    # 'Unnamed: 0' is presumably a row index written out with the CSV (TODO confirm);
    # errors='ignore' keeps files that lack the column loadable.
    current_file = current_file.drop(columns=['Unnamed: 0'], errors='ignore')
    Bldg.append(current_file)

# Create Month, Hour, and Minute column for all dataframes in list
for bldg_df in Bldg:
    bldg_df['Date_Time'] = pd.to_datetime(bldg_df['Date_Time'])
    bldg_df['Month_num'] = bldg_df['Date_Time'].dt.month
    bldg_df['Hour_num'] = bldg_df['Date_Time'].dt.hour
    bldg_df['Minute_num'] = bldg_df['Date_Time'].dt.minute

# Show names of buildings (first row's name; positional lookup so it does not
# depend on the index containing the label 0)
for i, bldg_df in enumerate(Bldg):
    print(i, bldg_df['bldgname'].iloc[0])

bldgnum = int(input("Enter the number of the building from the list above: "))

# Reject numbers outside the printed list; a negative value would otherwise
# silently select a building counted from the end of the list.
if not 0 <= bldgnum < len(Bldg):
    raise IndexError(f"Building number must be between 0 and {len(Bldg) - 1}, got {bldgnum}")

Bldg[bldgnum]

0 Noble Library
1 Biodesign B
2 Biodesign C
3 Biodesign A
4 Psychology
5 Goldwater
6 Schwada COB
7 ISTB 2
8 Bulldog Hall
9 ISTB 4
10 Psychology North
Enter the number of the building from the list above: 2


Unnamed: 0,bldgname,Air Temp,Rel Hum,KW,CHWTON,HTmmBTU,Date_Time,Month_num,Hour_num,Minute_num
0,Biodesign C,24.605922,19.793904,,,,2018-05-16 05:00:00,5,5,0
1,Biodesign C,24.686428,21.910461,,,,2018-05-16 05:15:00,5,5,15
2,Biodesign C,24.507771,21.755324,,,,2018-05-16 05:30:00,5,5,30
3,Biodesign C,24.371680,21.839728,,,,2018-05-16 05:45:00,5,5,45
4,Biodesign C,24.236608,22.134633,,,,2018-05-16 06:00:00,5,6,0
...,...,...,...,...,...,...,...,...,...,...
644,Biodesign C,32.942282,22.154968,480.56,501.36,0.66,2018-09-29 23:00:00,9,23,0
645,Biodesign C,32.807248,22.054812,472.37,493.34,0.35,2018-09-29 23:15:00,9,23,15
646,Biodesign C,32.640789,22.303796,479.67,531.43,0.62,2018-09-29 23:30:00,9,23,30
647,Biodesign C,32.448003,22.678191,476.77,533.99,0.36,2018-09-29 23:45:00,9,23,45


# Select X and y and split dataset into Train-Test

In [360]:
# Remove rows with any missing value from the selected building.
# The cleaned frame is written back into Bldg, as before.
if Bldg[bldgnum].isna().to_numpy().any():
    Bldg[bldgnum] = Bldg[bldgnum].dropna()

# Features: everything except the building name, the timestamp, the target
# (CHWTON), the other meter columns (KW, HTmmBTU) and the minute column.
# Target: CHWTON.
X = Bldg[bldgnum].drop(
    columns=['bldgname', 'HTmmBTU', 'Date_Time', 'CHWTON', 'KW', 'Minute_num']
)
y = Bldg[bldgnum]['CHWTON']

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

## Check Data

In [361]:
# Display the feature matrix for the selected building
X

Unnamed: 0,Air Temp,Rel Hum,Month_num,Hour_num
264,28.670944,19.418818,6,5
265,28.185755,21.860167,6,5
266,28.079205,21.992603,6,5
267,27.993262,22.311683,6,5
268,27.867049,23.126297,6,6
...,...,...,...,...
644,32.942282,22.154968,9,23
645,32.807248,22.054812,9,23
646,32.640789,22.303796,9,23
647,32.448003,22.678191,9,23


In [362]:
# Display the target series (the CHWTON column)
y

264    310.23
265    306.25
266    295.40
267    308.41
268    301.49
        ...  
644    501.36
645    493.34
646    531.43
647    533.99
648    530.34
Name: CHWTON, Length: 385, dtype: float64

In [363]:
# Display the training features returned by train_test_split
X_train

Unnamed: 0,Air Temp,Rel Hum,Month_num,Hour_num
608,35.630773,18.651143,9,14
264,28.670944,19.418818,6,5
372,38.459445,11.622705,6,12
639,33.585401,21.946975,9,21
569,35.550940,20.031353,9,23
...,...,...,...,...
595,32.795727,24.769630,9,10
482,37.164221,11.337054,6,21
487,36.183633,12.558675,6,22
535,38.550821,16.807547,9,15


In [364]:
# Display the training target returned by train_test_split
y_train

608    461.94
264    310.23
372    256.10
639    530.70
569    270.57
        ...  
595    552.77
482    390.51
487    374.60
535    379.21
619    465.01
Name: CHWTON, Length: 308, dtype: float64

In [365]:
# Display the held-out test features (test_size=0.2)
X_test

Unnamed: 0,Air Temp,Rel Hum,Month_num,Hour_num
273,29.152031,24.664716,6,7
436,34.805661,18.968586,6,9
545,38.902738,15.695576,9,17
573,28.160457,29.304029,9,5
575,27.737174,30.460513,9,5
...,...,...,...,...
550,38.314628,15.990525,9,18
588,29.566495,30.431019,9,9
493,35.385174,13.135369,6,23
521,36.228588,21.650238,9,11


In [366]:
# Display the held-out test target (test_size=0.2)
y_test

273    336.54
436    350.56
545    433.36
573    247.79
575    296.45
        ...  
550    389.61
588    401.57
493    356.41
521    592.64
533    327.35
Name: CHWTON, Length: 77, dtype: float64

# Random Forest without tuning

## On One Building

In [371]:
# Baseline random forest with 100 trees; the seed is fixed so the fit is repeatable
base_RF = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split
base_RF.fit(X_train, y_train)

# Predict the held-out rows and place them next to the actual values,
# ordered by the original row index
y_pred = base_RF.predict(X_test)
ModelPred = pd.DataFrame(
    {'Actual CHWTON': y_test, 'Predicted CHWTON': y_pred}
).sort_index()
print(ModelPred)

# Score the fitted model on the test split and label it with the building name
score = base_RF.score(X_test, y_test)
rf_score = pd.DataFrame(
    {
        "bldgname": Bldg[bldgnum]['bldgname'].unique()[0],
        "test score (on 20% of data)": [score],
    }
)
rf_score

     Actual CHWTON  Predicted CHWTON
271         320.23          302.6939
273         336.54          300.9109
274         336.77          302.7359
277         337.90          300.5337
278         348.01          295.2895
..             ...               ...
624         458.77          433.5813
632         464.12          364.3259
643         507.17          516.2496
645         493.34          513.6056
648         530.34          493.2013

[77 rows x 2 columns]


Unnamed: 0,bldgname,test score (on 20% of data)
0,Biodesign C,0.667921


## Save scores for all buildings

In [300]:
### Score the untuned random forest on every building ###

def _score_building(bldg_df):
    """Fit the baseline random forest on one building and return its test score.

    Parameters
    ----------
    bldg_df : pandas.DataFrame
        One building's data with missing rows already removed. Must contain
        the columns 'bldgname', 'HTmmBTU', 'Date_Time', 'CHWTON', 'KW' and
        'Minute_num'; every remaining column is used as a feature.

    Returns
    -------
    float
        Value of ``RandomForestRegressor.score`` on the held-out 20% split.
    """
    # split data into X and y
    features = bldg_df.drop(columns=['bldgname', 'HTmmBTU', 'Date_Time', 'CHWTON', 'KW', 'Minute_num'])
    target = bldg_df['CHWTON']

    # Train-Test Split (same test size and seed as the single-building cell)
    feat_train, feat_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=20
    )

    # Set up model. Number of trees 100
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(feat_train, target_train)

    return model.score(feat_test, target_test)


score_col = "test score (on 20% of data)"

# One record per building, collected in a list and turned into a DataFrame once.
# The work happens inside _score_building, so the X / y / split / model variables
# of the single building selected earlier are not overwritten by this loop.
score_records = []
for i in range(len(Bldg)):
    # Read the name before cleaning, from the first row of the frame
    bldg_name = Bldg[i]['bldgname'].unique()[0]

    # drop na values if in dataframe (the cleaned frame is written back into Bldg)
    if Bldg[i].isnull().values.any():
        Bldg[i] = Bldg[i].dropna()

    score_records.append({'bldgname': bldg_name, score_col: _score_building(Bldg[i])})

# An explicit column list keeps the column order deterministic
rf_bld_scores = pd.DataFrame(score_records, columns=['bldgname', score_col])

In [301]:
# Display the per-building test scores
rf_bld_scores

Unnamed: 0,bldgname,test score (on 20% of data)
0,Noble Library,0.229086
1,Biodesign B,0.964331
2,Biodesign C,0.667921
3,Biodesign A,0.946641
4,Psychology,0.886917
5,Goldwater,0.97376
6,Schwada COB,0.883281
7,ISTB 2,0.948238
8,Bulldog Hall,0.615321
9,ISTB 4,0.961422
