In [34]:
# Based partly on http://mariofilho.com/how-to-predict-multiple-time-series-with-scikit-learn-with-sales-forecasting-example/
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# from rfpimp import *  - not easy to install on cluster

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Functions to use for prediction evaluation

def rmsle(ytrue, ypred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log1p(ytrue) - log1p(ypred)) ** 2))``, i.e. the
    square root of what scikit-learn calls ``mean_squared_log_error``.
    It is written with NumPy because ``mean_squared_log_error`` is not among
    the names this notebook imports from ``sklearn.metrics``, so the previous
    body raised ``NameError`` as soon as it was called.

    Parameters
    ----------
    ytrue : array-like
        Observed target values; must be non-negative.
    ypred : array-like
        Predicted values with the same number of elements; must be
        non-negative.

    Returns
    -------
    float
        The RMSLE over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty, have different shapes, or contain
        negative values (the logarithmic error is undefined there).
    """
    # squeeze() lets a column vector (n, 1) be compared with a flat vector (n,)
    # instead of silently broadcasting to an (n, n) matrix.
    true_arr = np.squeeze(np.asarray(ytrue, dtype=float))
    pred_arr = np.squeeze(np.asarray(ypred, dtype=float))

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"ytrue and ypred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("rmsle needs at least one observation")
    if (true_arr < 0).any() or (pred_arr < 0).any():
        raise ValueError("rmsle cannot be used when targets or predictions contain negative values")

    log_gap = np.log1p(true_arr) - np.log1p(pred_arr)
    return float(np.sqrt(np.mean(np.square(log_gap))))



In [36]:
# Train the random forest model (predictions and error metrics are computed separately in run_model)

def make_model(X, y, X_valid, y_valid, n_estimators,
               max_features, min_samples_leaf, random_state):
    """Create a random forest regressor and fit it on the training data.

    ``X`` and ``y`` are the training features and targets. ``X_valid`` and
    ``y_valid`` are accepted but not used inside this function. The remaining
    arguments are forwarded unchanged to ``RandomForestRegressor``.

    Returns the fitted estimator.
    """
    forest_options = dict(
        n_estimators=n_estimators,
        n_jobs=-1,        # -1: run on all available cores (scikit-learn convention)
        oob_score=True,   # run_model reads rf.oob_score_ afterwards
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )
    forest = RandomForestRegressor(**forest_options)

    # Fit on the training split only
    forest.fit(X, y)
    return forest

In [37]:
# Using the model to make a prediction & Error Analysis for Random Forest 

def run_model(rf, X_valid, y_true=None, building_names=None):
    """Predict on validation data with a fitted forest and report error metrics.

    Parameters
    ----------
    rf : fitted estimator
        Must provide ``predict``, ``score`` and ``oob_score_`` (``make_model``
        fits its forest with ``oob_score=True``).
    X_valid : DataFrame or array-like
        Validation features.
    y_true : array-like, optional
        Validation targets. When omitted, the notebook-level ``y_valid`` is
        used, as before.
    building_names : array-like, optional
        Building label of every row in ``X_valid``. When omitted, the
        ``building_name`` column of the notebook-level ``val_data`` is used;
        if ``X_valid`` carries an index that is contained in ``val_data``'s
        index, only those rows are counted, so a filtered subset of
        ``val_data`` gets the right number of buildings.

    Returns
    -------
    tuple
        ``(y_pred_rf, rmse_valid, mae_valid, r2_score_valid)``.
        ``y_pred_rf`` is the prediction vector reshaped to
        ``(num_buildings, rows_per_building)``. The second element is the
        root mean squared error: it is computed from ``mean_squared_error``
        and was previously mislabelled "RMSLE"; its value is unchanged.

    Raises
    ------
    ValueError
        If the predictions cannot be split evenly across the buildings.
    """
    if y_true is None:
        y_true = y_valid  # notebook global

    if building_names is None:
        building_names = val_data['building_name']  # notebook global
        row_index = getattr(X_valid, 'index', None)
        # The subset cells filter val_data by space usage; counting the
        # buildings of the *whole* validation file would then give the wrong
        # first dimension for the reshape below.
        if row_index is not None and row_index.isin(building_names.index).all():
            building_names = building_names.loc[row_index]

    y_pred_rf_long = rf.predict(X_valid)

    total_len = y_pred_rf_long.shape[0]
    # dropna=False keeps the count identical to len(Series.unique())
    num_buildings = pd.Series(building_names).nunique(dropna=False)

    if num_buildings == 0 or total_len % num_buildings != 0:
        raise ValueError(
            f"Cannot split {total_len} predictions evenly across {num_buildings} buildings"
        )

    # NOTE(review): one row per building only makes sense if X_valid is ordered
    # building by building with the same number of rows each — confirm upstream.
    y_pred_rf = y_pred_rf_long.reshape((num_buildings, total_len // num_buildings))

    # Metrics are computed on the flat prediction vector (the original code
    # referenced an undefined name `y_pred` here).
    mae_valid = mean_absolute_error(y_true, y_pred_rf_long)
    rmse_valid = np.sqrt(mean_squared_error(y_true, y_pred_rf_long))
    r2_score_valid = rf.score(X_valid, y_true)

    print(f"RF OOB score {rf.oob_score_:.5f}")
    print(f"Validation R^2 {r2_score_valid:.5f}, RMSE {rmse_valid:.5f}, MAE {mae_valid:.2f}")

    return y_pred_rf, rmse_valid, mae_valid, r2_score_valid

### Load data

In [38]:
# Listing 'timestamp' in `parse_dates` makes pandas parse that column into datetimes (date and time together).
# The CSV files are too large to commit: they are kept locally under `/exploring_models` and are not pushed.
train_data = pd.read_csv(
    'weather1_education_train.csv',
    parse_dates=['timestamp'],
)
val_data = pd.read_csv(
    'weather1_education_test.csv',
    parse_dates=['timestamp'],
)

In [10]:
# All building types (original feature setup).
# The same columns are removed from the training and the validation frame:
# the target, the identifier columns, the raw calendar columns and 'Unnamed: 0'.
X, X_valid = (
    frame.drop(
        ['electricity', 'building_name', 'primary_space_usage', 'hour', 'year',
         'month', 'weekday', 'date', 'timestamp', 'Unnamed: 0'],
        axis=1,
    )
    for frame in (train_data, val_data)
)
# Target: the electricity reading of each row
y, y_valid = (frame['electricity'] for frame in (train_data, val_data))


### Example of making subsets

In [50]:
# Subset: primary school classrooms (primary_space_usage == "PrimClass")
train_data_PrimClass = train_data.loc[train_data['primary_space_usage'] == "PrimClass"]
val_data_PrimClass = val_data.loc[val_data['primary_space_usage'] == "PrimClass"]

print(train_data_PrimClass.shape, val_data_PrimClass.shape)

# Features: drop the target, the identifier columns, the timestamp and 'Unnamed: 0'.
# NOTE(review): unlike the all-building-types cell, the raw 'hour', 'year', 'month',
# 'weekday' and 'date' columns stay in X here — confirm this is intended.
X = train_data_PrimClass.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])
X_valid = val_data_PrimClass.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])

# Targets
y = train_data_PrimClass['electricity']
y_valid = val_data_PrimClass['electricity']


(429093, 88) (166383, 88)


In [51]:
# Subset: university classrooms (primary_space_usage == "UnivClass")
# In the recorded run the validation subset has 0 rows, so X_valid / y_valid end up empty here.
train_data_UnivClass = train_data.loc[train_data['primary_space_usage'] == "UnivClass"]
val_data_UnivClass = val_data.loc[val_data['primary_space_usage'] == "UnivClass"]

print(train_data_UnivClass.shape, val_data_UnivClass.shape)

# Features: drop the target, the identifier columns, the timestamp and 'Unnamed: 0'.
# NOTE(review): unlike the all-building-types cell, the raw 'hour', 'year', 'month',
# 'weekday' and 'date' columns stay in X here — confirm this is intended.
X = train_data_UnivClass.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])
X_valid = val_data_UnivClass.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])

# Targets
y = train_data_UnivClass['electricity']
y_valid = val_data_UnivClass['electricity']


(8757, 88) (0, 88)


In [None]:
# Subset: offices (primary_space_usage == "Office")
train_data_Office = train_data.loc[train_data['primary_space_usage'] == "Office"]
val_data_Office = val_data.loc[val_data['primary_space_usage'] == "Office"]

print(train_data_Office.shape, val_data_Office.shape)

# Features: drop the target, the identifier columns, the timestamp and 'Unnamed: 0'.
# NOTE(review): unlike the all-building-types cell, the raw 'hour', 'year', 'month',
# 'weekday' and 'date' columns stay in X here — confirm this is intended.
X = train_data_Office.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])
X_valid = val_data_Office.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])

# Targets
y = train_data_Office['electricity']
y_valid = val_data_Office['electricity']




In [None]:
# Subset: university dormitories (primary_space_usage == "UnivDorm")
train_data_UnivDorm = train_data[train_data['primary_space_usage']=="UnivDorm"]

val_data_UnivDorm = val_data[val_data['primary_space_usage']=="UnivDorm"]

print(train_data_UnivDorm.shape, val_data_UnivDorm.shape)

# Features: drop the target, the identifier columns, the timestamp and 'Unnamed: 0'.
# NOTE(review): unlike the all-building-types cell, the raw 'hour', 'year', 'month',
# 'weekday' and 'date' columns stay in X here — confirm this is intended.
X = train_data_UnivDorm.drop(['electricity', 'building_name', 'primary_space_usage','timestamp', 'Unnamed: 0'], axis=1)
X_valid = val_data_UnivDorm.drop(['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'], axis=1)
# Fixed typo: the validation target was read from the undefined name `val_data_UniveDorm` (NameError).
y, y_valid = train_data_UnivDorm['electricity'], val_data_UnivDorm['electricity']



In [None]:
# Subset: university laboratories (primary_space_usage == "UnivLab")
train_data_UnivLab = train_data.loc[train_data['primary_space_usage'] == "UnivLab"]
val_data_UnivLab = val_data.loc[val_data['primary_space_usage'] == "UnivLab"]

print(train_data_UnivLab.shape, val_data_UnivLab.shape)

# Features: drop the target, the identifier columns, the timestamp and 'Unnamed: 0'.
# NOTE(review): unlike the all-building-types cell, the raw 'hour', 'year', 'month',
# 'weekday' and 'date' columns stay in X here — confirm this is intended.
X = train_data_UnivLab.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])
X_valid = val_data_UnivLab.drop(
    columns=['electricity', 'building_name', 'primary_space_usage', 'timestamp', 'Unnamed: 0'])

# Targets
y = train_data_UnivLab['electricity']
y_valid = val_data_UnivLab['electricity']



In [49]:
# Display the column labels of the training frame, to see which columns the feature cells have to drop
train_data.columns

Index(['Unnamed: 0', 'area', 'building_name', 'electricity',
       'primary_space_usage', 'timestamp', 'TemperatureC', 'month', 'year',
       'date', 'hour', 'weekday', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'date_1', 'date_2', 'date_3', 'date_4',
       'date_5', 'date_6', 'date_7', 'date_8', 'date_9', 'date_10', 'date_11',
       'date_12', 'date_13', 'date_14', 'date_15', 'date_16', 'date_17',
       'date_18', 'date_19', 'date_20', 'date_21', 'date_22', 'date_23',
       'date_24', 'date_25', 'date_26', 'date_27', 'date_28', 'date_29',
       'date_30', 'date_31', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'wkday_0', 'wkday_1', 'wkday_2'

### Check if number of unique buildings matches number of unique building areas

In [40]:
# array_buildings: the sorted unique building names in the training data
# obs_per_building: how many rows each of those buildings has (same order)
array_buildings, obs_per_building = np.unique(train_data['building_name'], return_counts = True)
array_buildings

array(['PrimClass_Jacqueline', 'PrimClass_Jacquelyn', 'PrimClass_Jaiden',
       'PrimClass_Jake', 'PrimClass_Jamie', 'PrimClass_Jane',
       'PrimClass_Janelle', 'PrimClass_Janice', 'PrimClass_Janie',
       'PrimClass_Janis', 'PrimClass_Janiya', 'PrimClass_Jaqueline',
       'PrimClass_Jarrett', 'PrimClass_Jasmine', 'PrimClass_Jaxson',
       'PrimClass_Jaylin', 'PrimClass_Jazmin', 'PrimClass_Jazmine',
       'PrimClass_Jean', 'PrimClass_Jeanine', 'PrimClass_Jediah',
       'PrimClass_Jeffery', 'PrimClass_Jeffrey', 'PrimClass_Jennie',
       'PrimClass_Jennifer', 'PrimClass_Jeremy', 'PrimClass_Jermaine',
       'PrimClass_Jerome', 'PrimClass_Jesse', 'PrimClass_Jill',
       'PrimClass_Jim', 'PrimClass_Jimmie', 'PrimClass_Joanna',
       'PrimClass_Jocelyn', 'PrimClass_Jodie', 'PrimClass_Joel',
       'PrimClass_Johanna', 'PrimClass_Johnathan', 'PrimClass_Johnathon',
       'PrimClass_Johnnie', 'PrimClass_Jonathon', 'PrimClass_Jose',
       'PrimClass_Josue', 'PrimClass_Juanita', 'Pr

In [41]:
# Distinct per-building row counts; a single value means every building has the same number of rows
np.unique(obs_per_building)

array([8757], dtype=int64)

In [27]:
# Total number of rows summed over all buildings
obs_per_building.sum()

437850

In [29]:
# array_areas: the sorted unique values of the 'area' column; obs_per_area: rows per value.
# The shape gives the number of distinct areas, to compare with the number of buildings.
array_areas, obs_per_area = np.unique(train_data['area'], return_counts = True)
array_areas.shape

(50,)

In [30]:
# Total number of rows summed over all distinct 'area' values
obs_per_area.sum()

437850

In [20]:
# Characterize feature importance

def calc_feature_importance(rf, feature_names=None):
    """Rank the features of a fitted forest by importance.

    Parameters
    ----------
    rf : fitted estimator
        Must expose ``feature_importances_``.
    feature_names : sequence, optional
        Labels for the importances, in the order of the training columns.
        When omitted, ``rf.feature_names_in_`` is used if the estimator has
        it (scikit-learn sets it when fitting on a DataFrame); otherwise the
        columns of the notebook-level ``X`` are used, as before.

    Returns
    -------
    pandas.DataFrame
        One ``importance`` column indexed by feature name, sorted from most
        to least important.
    """
    if feature_names is None:
        # Prefer the names recorded on the model itself: the global X is
        # rebound by every subset cell and may no longer match `rf`.
        feature_names = getattr(rf, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns  # notebook global (original behaviour)

    feature_importances = pd.DataFrame(rf.feature_importances_,
                                       index=feature_names,
                                       columns=['importance'])

    return feature_importances.sort_values('importance', ascending=False)

In [21]:
# NOTE(review): `rf` is not assigned in any visible cell (make_model returns the forest and the
# `global rf` line is commented out), so this needs `rf = make_model(...)` to have been run first.
calc_feature_importance(rf)

Unnamed: 0,importance
area,0.600075
TemperatureC,0.039801
wkday_5,0.033079
wkday_6,0.027211
month_8,0.023619
hour_10,0.023306
hour_9,0.023181
hour_13,0.022964
hour_11,0.022791
hour_12,0.022686
