### Import libraries and read in file

In [152]:
# import libs
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# models and stuff
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [153]:
# Load the combined electricity + weather readings.
# The first CSV column becomes the index and pandas is asked to parse it as dates.
elec_w_weather_path = '/users/ianmyjer/desktop/disagg/electric_data_with_weather.csv'
df = pd.read_csv(
    elec_w_weather_path,
    sep=',',
    header=0,
    index_col=0,
    parse_dates=True,
)

In [154]:
# Column groupings reused by the cells below.

# Per-circuit / per-appliance power columns.
power_cols = [
    'north_br',
    'south_br',
    'basement',
    'dryer',
    'washer',
    'dining_room',
    'dishwasher',
    'workbench',
    'security_system',
    'refrigerator',
    'furnace_fan',
    'garage',
    'heat_pump',
    'dhw_heater',
    'office',
    'outside_plug',
    'entertainment',
    'utility_room',
    'unmetered',
    'oven',
]

# Weather measurement columns.
weather_cols = [
    'Temp (C)',
    'Dew Point Temp (C)',
    'Rel Hum (%)',
    'Wind Spd (km/h)',
    'Stn Press (kPa)',
]

## Features

In [155]:
def feature_creation(df, windows=(5, 10, 15, 30, 60)):
    """Add hour-of-day and rolling-window features to ``df`` and list the model inputs.

    ``df`` is modified in place: an ``'hour'`` column and, for every window size,
    ``rolling_<stat>_<window>`` columns (sum, mean, max, min, median of
    ``'main_house_total'``) are written to it. Calling the function again simply
    overwrites those columns.

    Args:
        df: DataFrame with a ``'main_house_total'`` column and an index that
            exposes ``.hour`` (e.g. a DatetimeIndex). ``'Temp (C)'`` and
            ``'Rel Hum (%)'`` are only named in the returned list, not created
            here, so they must already exist for ``df[feature_cols]`` to work.
        windows: iterable of window sizes handed to ``Series.rolling``. Defaults
            to the sizes the notebook originally hard-coded.

    Returns:
        list of str: feature column names, in the order base columns, ``'hour'``,
        then for each window: sum, mean, max, min, median.
    """
    # starting features
    feature_cols = ['main_house_total', 'Temp (C)', 'Rel Hum (%)']

    # time-based features
    # Notes kept from earlier experiments: only hour ever seemed to matter.
    # Day-of-week, weekday / business-hours flags, season dummies and
    # pct-change / diff of the main total were pretty much never important,
    # so they are not generated.
    df['hour'] = df.index.hour
    feature_cols.append('hour')

    # rolling stats
    # smaller values seem to do better, which makes sense
    stat_names = ('sum', 'mean', 'max', 'min', 'median')
    for window in windows:
        roller = df['main_house_total'].rolling(window)
        for stat in stat_names:
            col_name = 'rolling_' + stat + '_' + str(window)
            # Rows without a full window yield NaN from rolling(); they are set to 0.
            df[col_name] = getattr(roller, stat)().fillna(0)
            feature_cols.append(col_name)

    return feature_cols

## Modelling Functions

In [156]:
def tts(X,y,percent=0.66):
    """Split X and y by position into a leading train part and a trailing test part.

    No shuffling is done: the first ``int(len(X) * percent)`` items are the
    training portion and everything after that, up to ``len(X)``, is the test portion.

    Returns:
        X_train, X_test, y_train, y_test
    """
    n_rows = len(X)
    cut = int(n_rows * percent)
    head = slice(0, cut)
    tail = slice(cut, n_rows)
    return X[head], X[tail], y[head], y[tail]

In [157]:
def tts_small(X,y,first_split=0.33,second_split=0.66):
    """Positional split that uses only the leading ``second_split`` fraction of the data.

    Items before ``int(len(X) * first_split)`` form the training portion; items
    from there up to ``int(len(X) * second_split)`` form the test portion. The
    remainder of X and y is not returned.

    Returns:
        X_train, X_test, y_train, y_test
    """
    total = len(X)
    train_end = int(total * first_split)
    test_end = int(total * second_split)
    train_rows = slice(0, train_end)
    test_rows = slice(train_end, test_end)
    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]

In [158]:
def model_chooser(model_type,X_train,X_test,y_train,n_estimators=10,max_features='auto'):
    """Build the classifier named by ``model_type``, fit it, and predict on ``X_test``.

    Args:
        model_type: one of 'random_forest', 'logistic_regression', 'decision_tree',
            'extra_trees', 'ada_boost', 'gradient_boost'.
        X_train, y_train: training features and labels passed to ``fit``.
        X_test: features passed to ``predict`` / ``predict_proba``.
        n_estimators: ensemble size; ignored by 'logistic_regression' and
            'decision_tree', which have no such parameter.
        max_features: forwarded to the tree-based models that accept it
            (not used by 'logistic_regression' or 'ada_boost'). The legacy
            value 'auto' is translated to 'sqrt'.

    Returns:
        (model, y_pred, y_pred_proba) where ``y_pred_proba`` is column 1 of
        ``model.predict_proba(X_test)`` only. For targets with more than two
        classes, call ``model.predict_proba`` directly to get the full output.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # 'auto' meant sqrt(n_features) for these classifiers and is rejected by
    # recent scikit-learn releases; 'sqrt' is accepted by old and new versions.
    if max_features == 'auto':
        max_features = 'sqrt'

    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)
    elif model_type == 'logistic_regression':
        model = LogisticRegression()
    elif model_type == 'decision_tree':
        # A single tree has no n_estimators argument; passing it raised TypeError.
        model = DecisionTreeClassifier(random_state=42,max_features=max_features)
    elif model_type == 'extra_trees':
        model = ExtraTreesClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)
    elif model_type == 'ada_boost':
        model = AdaBoostClassifier(random_state=42,n_estimators=n_estimators)
    elif model_type == 'gradient_boost':
        model = GradientBoostingClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)
    else:
        # Fail with a clear message instead of an AttributeError on None.fit().
        raise ValueError(
            "Unknown model_type {!r}; expected one of: random_forest, logistic_regression, "
            "decision_tree, extra_trees, ada_boost, gradient_boost".format(model_type)
        )

    # fit and predict on model
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return model, y_pred, y_pred_proba

In [159]:
def print_accuracies(lbl,y_test,y_pred,y_pred_proba):
    """Print a short evaluation report for a 0/1 target.

    Prints, in order: the label, the null accuracy (score of predicting 0 for
    every row), overall accuracy, log loss, accuracy restricted to rows whose
    true value is 1, accuracy restricted to rows whose true value is 0, and the
    confusion matrix.
    """
    print(lbl)

    # Baseline: predict that the appliance is always OFF
    always_off = np.zeros(len(y_test))
    null_score = metrics.accuracy_score(y_test, always_off)
    print('Null Accuracy Score: {:0.4f}'.format(null_score))

    # Overall accuracy and log loss
    total_score = metrics.accuracy_score(y_test, y_pred)
    print('Total Accuracy Score: {:0.4f}'.format(total_score))
    loss = metrics.log_loss(y_test,y_pred_proba)
    print('Log Loss: {:0.4f}'.format(loss))

    # Accuracy within each true class
    on_rows = y_test==1
    on_score = metrics.accuracy_score(y_test[on_rows], y_pred[on_rows])
    print('Accuracy Score 1s: {:0.4f}'.format(on_score))
    off_rows = y_test==0
    off_score = metrics.accuracy_score(y_test[off_rows], y_pred[off_rows])
    print('Accuracy Score 0s: {:0.4f}'.format(off_score))

    # Confusion matrix
    print(metrics.confusion_matrix(y_test,y_pred))

In [160]:
def save_accuracies(lbl,model_type,y_test,y_pred,y_pred_proba):
    """Return the metrics for a 0/1 target as a dict keyed by ``model_type`` + suffix.

    Keys (each prefixed with ``model_type``): '_null_accuracy' (score of
    predicting 0 everywhere), '_total_accuracy', '_log_loss', '_accuracy_1'
    (rows whose true value is 1) and '_accuracy_0' (rows whose true value is 0).
    ``lbl`` is accepted but not used.
    """
    always_off = np.zeros(len(y_test))
    on_rows = y_test==1
    off_rows = y_test==0
    return {
        model_type+'_null_accuracy': metrics.accuracy_score(y_test, always_off),
        model_type+'_total_accuracy': metrics.accuracy_score(y_test, y_pred),
        model_type+'_log_loss': metrics.log_loss(y_test,y_pred_proba),
        model_type+'_accuracy_1': metrics.accuracy_score(y_test[on_rows], y_pred[on_rows]),
        model_type+'_accuracy_0': metrics.accuracy_score(y_test[off_rows], y_pred[off_rows]),
    }

## Find and map max appliance

In [161]:
# Largest value across the power_cols columns, row by row.
df['max_power'] = df.loc[:, power_cols].max(axis='columns')

In [162]:
# Name of the power_cols column that holds each row's largest value.
df['col_max_row'] = df.loc[:, power_cols].idxmax(axis='columns')

In [163]:
# Encode each column name as its 0-based position in power_cols.
df['col_max_row_remapped'] = df['col_max_row'].map(
    {name: position for position, name in enumerate(power_cols)}
)

In [164]:
# Show the column-name -> integer code mapping used for 'col_max_row_remapped'.
{name: position for position, name in enumerate(power_cols)}

{'basement': 2,
 'dhw_heater': 13,
 'dining_room': 5,
 'dishwasher': 6,
 'dryer': 3,
 'entertainment': 16,
 'furnace_fan': 10,
 'garage': 11,
 'heat_pump': 12,
 'north_br': 0,
 'office': 14,
 'outside_plug': 15,
 'oven': 19,
 'refrigerator': 9,
 'security_system': 8,
 'south_br': 1,
 'unmetered': 18,
 'utility_room': 17,
 'washer': 4,
 'workbench': 7}

## Attempt to classify max appliance

In [165]:
feature_cols = feature_creation(df)
# Target: integer code of the column with the highest reading in each row.
lbl = 'col_max_row_remapped'
model_type = 'random_forest'

X = df[feature_cols]
y = df[lbl]

# Positional split (tts default: first 66% of rows train, the rest test).
X_train, X_test, y_train, y_test = tts(X,y)

# 'sqrt' is what 'auto' meant for classifiers; recent scikit-learn releases
# no longer accept 'auto'.
model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='sqrt')

In [95]:
# model_chooser returns only column 1 of predict_proba; recompute here so that
# y_pred_proba holds the full predict_proba output for the test rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [173]:
# Pair each feature name (column 0) with the fitted model's importance score
# (column 1) and list them from most to least important.
pd.DataFrame(list(zip(feature_cols,model.feature_importances_))).sort_values(by=1,ascending=False)

Unnamed: 0,0,1
0,main_house_total,0.166982
7,rolling_min_5,0.089118
6,rolling_max_5,0.074782
5,rolling_mean_5,0.058616
8,rolling_median_5,0.058543
4,rolling_sum_5,0.04992
12,rolling_min_10,0.03769
27,rolling_min_60,0.033354
17,rolling_min_15,0.031572
22,rolling_min_30,0.030181


In [172]:
# Overall accuracy, then the per-class classification report.
# The confusion matrix is the cell's last expression, so the notebook displays it.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred))
metrics.confusion_matrix(y_test,y_pred)

Total Accuracy Score: 0.7961
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.83      0.03      0.06      7156
          2       0.67      0.46      0.55     19684
          3       0.94      0.96      0.95      5109
          4       0.70      0.03      0.05      2507
          6       0.88      0.53      0.66      5788
          7       0.00      0.00      0.00         7
          8       0.00      0.00      0.00         1
          9       0.67      0.71      0.69     45071
         10       0.93      0.85      0.89    107894
         11       0.00      0.00      0.00        27
         12       0.92      0.97      0.94     23564
         14       0.00      0.00      0.00       197
         16       0.00      0.00      0.00      3252
         18       0.73      0.88      0.80    135801
         19       0.84      0.48      0.61      1196

avg / total       0.80      0.80      0.78    357266



  'precision', 'predicted', average, warn_for)


array([[     0,      0,      0,      1,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,     11,      0],
       [     0,    226,      0,      1,      0,      0,      0,      0,
          1064,   2099,      0,      0,      0,      0,   3766,      0],
       [     0,      0,   9076,     26,      2,     57,      0,      0,
            17,      0,      0,      1,      0,      0,  10505,      0],
       [     0,      0,     17,   4906,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,    185,      1],
       [     0,      0,     73,     10,     71,     44,      0,      0,
           117,     52,      0,      0,      0,      0,   2140,      0],
       [     0,      0,    201,      0,      6,   3067,      0,      0,
            59,      0,      0,     36,      0,      0,   2419,      0],
       [     0,      0,      1,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,   

In [170]:
# Distinct class codes that appear in the predictions, rendered as strings.
list(map(str, set(y_pred)))

['1', '2', '3', '4', '6', '9', '10', '12', '14', '16', '18', '19']

## Predict Categorical Max

In [180]:
# Collapse the individual power columns into three integer categories (1, 3, 2).
# The groups are listed in the order their members are inserted into the dict.
remap = {
    appliance: category
    for category, appliances in [
        (1, ['dishwasher', 'dryer', 'washer', 'oven']),
        (3, ['dhw_heater', 'refrigerator', 'furnace_fan', 'heat_pump']),
        (2, ['basement', 'dining_room', 'entertainment', 'garage', 'south_br',
             'north_br', 'security_system', 'utility_room', 'workbench',
             'outside_plug', 'unmetered', 'office']),
    ]
    for appliance in appliances
}

In [181]:
# Translate each row's 'col_max_row' name into its category code via the remap dict.
df['col_max_row_categorized'] = df['col_max_row'].map(remap)

In [182]:
feature_cols = feature_creation(df)
# Target: category code (from remap) of the column with the highest reading in each row.
lbl = 'col_max_row_categorized'
model_type = 'random_forest'

X = df[feature_cols]
y = df[lbl]

# Positional split (tts default: first 66% of rows train, the rest test).
X_train, X_test, y_train, y_test = tts(X,y)

# 'sqrt' is what 'auto' meant for classifiers; recent scikit-learn releases
# no longer accept 'auto'.
model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='sqrt')

In [183]:
# model_chooser returns only column 1 of predict_proba; recompute here so that
# y_pred_proba holds the full predict_proba output (it is passed to log_loss below).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [184]:
# Overall accuracy and log loss, followed by the per-class classification
# report and the confusion matrix.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Log Loss: {:0.4f}'.format(metrics.log_loss(y_test,y_pred_proba)))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test,y_pred))


Total Accuracy Score: 0.8730
Log Loss: 0.3535
             precision    recall  f1-score   support

          1       0.92      0.60      0.73     14600
          2       0.86      0.88      0.87    166137
          3       0.88      0.89      0.89    176529

avg / total       0.87      0.87      0.87    357266

[[  8776   5420    404]
 [   632 145390  20115]
 [   143  18646 157740]]


In [138]:
# Pair each feature name (column 0) with the fitted model's importance score
# (column 1) and list them from most to least important.
pd.DataFrame(list(zip(feature_cols,model.feature_importances_))).sort_values(by=1,ascending=False)

Unnamed: 0,0,1
0,main_house_total,0.213086
8,rolling_min_5,0.091607
7,rolling_max_5,0.068163
6,rolling_mean_5,0.061255
9,rolling_median_5,0.052528
5,rolling_sum_5,0.049463
13,rolling_min_10,0.047009
28,rolling_min_60,0.03218
12,rolling_max_10,0.032034
23,rolling_min_30,0.030393
