### Import libraries and read in file

In [152]:
# import libs
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# models and stuff
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [153]:
# Read the electricity + weather CSV into `df`. The first CSV column is used
# as the index and parsed as dates (feature_creation reads df.index.hour).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
elec_w_weather_path = '/users/ianmyjer/desktop/disagg/electric_data_with_weather.csv'
df = pd.read_csv(
    elec_w_weather_path,
    delimiter=',',
    header=0,
    index_col=0,
    parse_dates=True,
)

In [154]:
# Column groupings reused by later cells.

# Per-circuit power columns; their row-wise max / idxmax define the
# classification targets further down.
power_cols = [
    'north_br', 'south_br', 'basement', 'dryer', 'washer',
    'dining_room', 'dishwasher', 'workbench', 'security_system',
    'refrigerator', 'furnace_fan', 'garage', 'heat_pump',
    'dhw_heater', 'office', 'outside_plug', 'entertainment',
    'utility_room', 'unmetered', 'oven',
]

# Weather columns present in the merged CSV.
weather_cols = [
    'Temp (C)',
    'Dew Point Temp (C)',
    'Rel Hum (%)',
    'Wind Spd (km/h)',
    'Stn Press (kPa)',
]

## Features

In [155]:
def feature_creation(df):
    """Add model features to ``df`` in place and return their column names.

    Columns written to ``df``:
      * ``hour`` - ``df.index.hour`` (the index must be datetime-like).
      * ``rolling_<stat>_<w>`` - sum/mean/max/min/median of
        ``main_house_total`` over a trailing window of ``w`` rows, for
        ``w`` in 5, 10, 15, 30, 60.  The first ``w - 1`` rows have no full
        window and are filled with 0.

    Calling the function again simply overwrites the same columns.

    Per the original author's notes, day-of-week / weekday / business-hours
    flags, season dummies and pct-change / diff features of the main total
    were tried and rarely mattered, so they are not generated here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``main_house_total``; ``Temp (C)`` and ``Rel Hum (%)``
        are listed as features but not read here.

    Returns
    -------
    list of str
        Feature column names: the three starting columns, ``hour``, then the
        rolling columns grouped by window in the order
        sum, mean, max, min, median.
    """
    # starting features
    feature_cols = ['main_house_total', 'Temp (C)', 'Rel Hum (%)']

    # time-based feature: only the hour ever seemed to matter
    df['hour'] = df.index.hour
    feature_cols.append('hour')

    # rolling stats: smaller windows seem to do better
    mains = df['main_house_total']
    for window in [5, 10, 15, 30, 60]:
        roller = mains.rolling(window)
        for stat in ('sum', 'mean', 'max', 'min', 'median'):
            col_name = 'rolling_' + stat + '_' + str(window)
            df[col_name] = getattr(roller, stat)().fillna(0)
            feature_cols.append(col_name)

    return feature_cols

## Modelling Functions

In [156]:
def tts(X,y,percent=0.66):
    """Split ``X`` and ``y`` by position, without shuffling.

    The first ``int(len(X) * percent)`` items form the training set and the
    remainder (up to ``len(X)``) form the test set, so row order is preserved.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    stop = len(X)
    cut = int(stop * percent)
    return X[:cut], X[cut:stop], y[:cut], y[cut:stop]

In [157]:
def tts_small(X,y,first_split=0.33,second_split=0.66):
    """Positional split that uses only the leading part of the data.

    Items before ``int(len(X) * first_split)`` are the training set; items from
    there up to ``int(len(X) * second_split)`` are the test set.  Anything past
    the second cut is not returned.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = len(X)
    train_end = int(n_rows * first_split)
    test_end = int(n_rows * second_split)
    train_part = slice(0, train_end)
    test_part = slice(train_end, test_end)
    return X[train_part], X[test_part], y[train_part], y[test_part]

In [158]:
def model_chooser(model_type,X_train,X_test,y_train,n_estimators=10,max_features='auto'):
    """Build, fit and apply one of the supported classifiers.

    Parameters
    ----------
    model_type : str
        One of ``'random_forest'``, ``'logistic_regression'``,
        ``'decision_tree'``, ``'extra_trees'``, ``'ada_boost'``,
        ``'gradient_boost'``.
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train : array-like
        Training labels.
    n_estimators : int, default 10
        Number of estimators for the ensemble models.  Ignored by
        ``'logistic_regression'`` and ``'decision_tree'`` (a single tree has
        no such parameter).
    max_features : default 'auto'
        Passed to the tree-based models; ignored by
        ``'logistic_regression'`` and ``'ada_boost'``.  ``'auto'`` is
        translated to ``'sqrt'``, which is what ``'auto'`` meant for these
        classifiers before recent scikit-learn releases removed it.

    Returns
    -------
    (model, y_pred, y_pred_proba)
        The fitted model, its predictions for ``X_test`` and column 1 of
        ``predict_proba(X_test)`` (the second class in ``model.classes_``).
        For multi-class targets call ``model.predict_proba`` directly to get
        the full probability matrix.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    supported = ('random_forest', 'logistic_regression', 'decision_tree',
                 'extra_trees', 'ada_boost', 'gradient_boost')
    if model_type not in supported:
        # Fail here with a clear message instead of `None.fit(...)` later.
        raise ValueError('Unknown model_type {!r}; expected one of {}'.format(
            model_type, ', '.join(supported)))

    # Keep the historical default working on scikit-learn versions that no
    # longer accept 'auto'; for classifiers it was an alias of 'sqrt'.
    if max_features == 'auto':
        max_features = 'sqrt'

    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)
    elif model_type == 'logistic_regression':
        model = LogisticRegression()
    elif model_type == 'decision_tree':
        # DecisionTreeClassifier has no n_estimators argument; passing it
        # raised TypeError.
        model = DecisionTreeClassifier(random_state=42,max_features=max_features)
    elif model_type == 'extra_trees':
        model = ExtraTreesClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)
    elif model_type == 'ada_boost':
        model = AdaBoostClassifier(random_state=42,n_estimators=n_estimators)
    else:  # 'gradient_boost'
        model = GradientBoostingClassifier(random_state=42,n_estimators=n_estimators,max_features=max_features)

    # fit and predict on model
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return model, y_pred, y_pred_proba

In [159]:
def print_accuracies(lbl,y_test,y_pred,y_pred_proba):
    """Print a binary-classification summary for one label.

    Prints, in order: the label, the accuracy of always predicting 0 (the
    "null" baseline), overall accuracy, log loss, accuracy restricted to rows
    whose true value is 1, accuracy restricted to rows whose true value is 0,
    and the confusion matrix.

    Parameters
    ----------
    lbl : str
        Heading printed first.
    y_test, y_pred : array-like
        True and predicted labels; they are indexed with boolean masks built
        from ``y_test``.
    y_pred_proba : array-like
        Predicted probabilities handed to ``metrics.log_loss``.
    """
    def show(template, value):
        print(template.format(value))

    print(lbl)

    # Baseline: predict appliances are always OFF
    always_off = np.zeros(len(y_test))
    show('Null Accuracy Score: {:0.4f}', metrics.accuracy_score(y_test, always_off))

    # overall quality
    show('Total Accuracy Score: {:0.4f}', metrics.accuracy_score(y_test, y_pred))
    show('Log Loss: {:0.4f}', metrics.log_loss(y_test,y_pred_proba))

    # accuracy within each true class
    on_rows = y_test==1
    show('Accuracy Score 1s: {:0.4f}', metrics.accuracy_score(y_test[on_rows], y_pred[on_rows]))
    off_rows = y_test==0
    show('Accuracy Score 0s: {:0.4f}', metrics.accuracy_score(y_test[off_rows], y_pred[off_rows]))

    # confusion matrix
    print(metrics.confusion_matrix(y_test,y_pred))

In [160]:
def save_accuracies(lbl,model_type,y_test,y_pred,y_pred_proba):
    """Collect the same scores as ``print_accuracies`` into a dict.

    Keys are prefixed with ``model_type``: ``_null_accuracy`` (always
    predicting 0), ``_total_accuracy``, ``_log_loss``, ``_accuracy_1`` and
    ``_accuracy_0`` (accuracy on rows whose true value is 1 / 0).

    ``lbl`` is accepted but not used.

    Returns
    -------
    dict
        Mapping of ``model_type + suffix`` to the score.
    """
    always_off = np.zeros(len(y_test))
    return {
        model_type+'_null_accuracy': metrics.accuracy_score(y_test, always_off),
        model_type+'_total_accuracy': metrics.accuracy_score(y_test, y_pred),
        model_type+'_log_loss': metrics.log_loss(y_test,y_pred_proba),
        model_type+'_accuracy_1': metrics.accuracy_score(y_test[y_test==1], y_pred[y_test==1]),
        model_type+'_accuracy_0': metrics.accuracy_score(y_test[y_test==0], y_pred[y_test==0]),
    }

## Find and map max appliance

In [161]:
# Largest reading across the power columns in each row.
df['max_power'] = df[power_cols].max(axis=1)

In [162]:
# Name of the power column that holds that row-wise maximum.
df['col_max_row'] = df[power_cols].idxmax(axis=1)

In [163]:
# Encode the winning column name as its position in power_cols (0-based).
df['col_max_row_remapped'] = df['col_max_row'].map(
    {col_name: position for position, col_name in enumerate(power_cols)}
)

In [164]:
# Show the name -> integer code mapping used for col_max_row_remapped.
{col_name: position for position, col_name in enumerate(power_cols)}

{'basement': 2,
 'dhw_heater': 13,
 'dining_room': 5,
 'dishwasher': 6,
 'dryer': 3,
 'entertainment': 16,
 'furnace_fan': 10,
 'garage': 11,
 'heat_pump': 12,
 'north_br': 0,
 'office': 14,
 'outside_plug': 15,
 'oven': 19,
 'refrigerator': 9,
 'security_system': 8,
 'south_br': 1,
 'unmetered': 18,
 'utility_room': 17,
 'washer': 4,
 'workbench': 7}

## Attempt to classify max appliance

In [None]:
# Random forest on the 20-way "which column is largest" target.
# feature_creation writes its columns onto df in place and returns their names.
feature_cols = feature_creation(df)
lbl = 'col_max_row_remapped'
model_type = 'random_forest'

X = df[feature_cols]
y = df[lbl]

# Positional split (first 66% train, rest test); rows are not shuffled.
X_train, X_test, y_train, y_test = tts(X,y)

# NOTE: the y_pred_proba returned here is only column 1 of predict_proba.
model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='auto')

In [95]:
# model_chooser already returned y_pred; the useful part here is replacing its
# single-column y_pred_proba with the full per-class probability matrix.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [96]:
# Feature importances of the fitted model, largest first.
# Column 0 is the feature name, column 1 the importance.
pd.DataFrame(list(zip(feature_cols,model.feature_importances_))).sort_values(by=1,ascending=False)

Unnamed: 0,0,1
0,main_house_total,0.535675
38,rolling_min_60,0.057046
1,Temp (C),0.035562
3,hour,0.03518
33,rolling_min_30,0.032073
18,rolling_min_5,0.022147
2,Rel Hum (%),0.021331
37,rolling_max_60,0.018819
34,rolling_median_30,0.017405
28,rolling_min_15,0.015998


In [112]:
# total accuracy, per-class precision/recall/F1, then the confusion matrix
# NOTE(review): the recorded output under this cell lists Hamming / F1 /
# Precision / Recall / Jaccard lines that this code does not print, so it
# appears to come from an earlier version of the cell - re-run to refresh.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred))
metrics.confusion_matrix(y_test,y_pred)


Total Accuracy Score: 0.7907
Hamming Loss: 0.2093
F1 Score: 0.7907
Precision Score: 0.7907
Recall Score: 0.7907
Jaccard Similarity Score: 0.7907


array([[     0,      0,      1,      1,      0,      2,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             8,      0],
       [     0,    673,      0,      0,      0,      0,      0,      0,
          1088,   1929,      0,      0,      0,      1,      1,      0,
          3464,      0],
       [     0,      0,   9799,     32,     18,     97,      0,      0,
            14,      2,      0,      3,      0,      0,      2,      0,
          9716,      1],
       [     0,      0,     23,   4641,      1,      0,      0,      0,
             0,      0,      0,      2,      0,      0,      1,      0,
           237,    204],
       [     0,      0,     88,     10,    109,     82,      0,      0,
           167,     77,      0,      1,      0,      0,      4,      0,
          1969,      0],
       [     0,      0,    242,      0,     24,   2772,      0,      0,
            67,      0,      0,     51,      0,      0,      1,      0,
          2

In [113]:
# Same experiment as the random-forest cell, with extra trees.
# feature_creation overwrites the same feature columns on df each time.
feature_cols = feature_creation(df)
lbl = 'col_max_row_remapped'
model_type = 'extra_trees'

X = df[feature_cols]
y = df[lbl]

# Positional split (first 66% train, rest test); rows are not shuffled.
X_train, X_test, y_train, y_test = tts(X,y)

model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='auto')

In [114]:
# Recompute so that y_pred_proba holds the full per-class probability matrix
# (model_chooser returns only column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [115]:
# total accuracy
# All averaged scores use average='micro'; in the recorded output they are
# identical to the accuracy, so they add no per-class information.
# NOTE(review): metrics.jaccard_similarity_score is not available in newer
# scikit-learn releases (jaccard_score replaces it, with different multiclass
# semantics) - confirm against the installed version before re-running.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Hamming Loss: {:0.4f}'.format(metrics.hamming_loss(y_test, y_pred)))
print('F1 Score: {:0.4f}'.format(metrics.f1_score(y_test, y_pred, average='micro')))
print('Precision Score: {:0.4f}'.format(metrics.precision_score(y_test, y_pred, average='micro')))
print('Recall Score: {:0.4f}'.format(metrics.recall_score(y_test, y_pred, average='micro')))
print('Jaccard Similarity Score: {:0.4f}'.format(metrics.jaccard_similarity_score(y_test, y_pred)))
metrics.confusion_matrix(y_test,y_pred)
# print('Log Loss: {:0.4f}'.format(metrics.log_loss(y_test,y_pred_proba)))

Total Accuracy Score: 0.7893
Hamming Loss: 0.2107
F1 Score: 0.7893
Precision Score: 0.7893
Recall Score: 0.7893
Jaccard Similarity Score: 0.7893


array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,     12,      0],
       [     0,    186,      0,      1,      0,      0,      0,      0,
           990,   2075,      0,      0,      0,      0,   3904,      0],
       [     0,      0,   8888,     20,      4,     41,      0,      0,
             6,      0,      0,      0,      0,      0,  10725,      0],
       [     0,      0,      6,   4914,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,    189,      0],
       [     0,      0,     54,     16,     49,     37,      0,      0,
            65,     93,      0,      1,      0,      0,   2192,      0],
       [     0,      0,    189,      0,      4,   2830,      0,      0,
            34,     31,      0,     36,      0,      0,   2664,      0],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,   

In [116]:
# Same experiment with AdaBoost.
feature_cols = feature_creation(df)
lbl = 'col_max_row_remapped'
model_type = 'ada_boost'

X = df[feature_cols]
y = df[lbl]

# Positional split (first 66% train, rest test); rows are not shuffled.
X_train, X_test, y_train, y_test = tts(X,y)

# max_features is accepted by model_chooser but not passed to AdaBoostClassifier.
model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='auto')

In [117]:
# Recompute so that y_pred_proba holds the full per-class probability matrix
# (model_chooser returns only column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [118]:
# total accuracy
# All averaged scores use average='micro'; in the recorded output they are
# identical to the accuracy, so they add no per-class information.
# NOTE(review): metrics.jaccard_similarity_score is not available in newer
# scikit-learn releases (jaccard_score replaces it, with different multiclass
# semantics) - confirm against the installed version before re-running.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Hamming Loss: {:0.4f}'.format(metrics.hamming_loss(y_test, y_pred)))
print('F1 Score: {:0.4f}'.format(metrics.f1_score(y_test, y_pred, average='micro')))
print('Precision Score: {:0.4f}'.format(metrics.precision_score(y_test, y_pred, average='micro')))
print('Recall Score: {:0.4f}'.format(metrics.recall_score(y_test, y_pred, average='micro')))
print('Jaccard Similarity Score: {:0.4f}'.format(metrics.jaccard_similarity_score(y_test, y_pred)))
metrics.confusion_matrix(y_test,y_pred)
# print('Log Loss: {:0.4f}'.format(metrics.log_loss(y_test,y_pred_proba)))

Total Accuracy Score: 0.6422
Hamming Loss: 0.3578
F1 Score: 0.6422
Precision Score: 0.6422
Recall Score: 0.6422
Jaccard Similarity Score: 0.6422


array([[     0,      0,      0,      4,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      8,      0],
       [     0,      1,      0,     12,      0,      0,      0,      0,
             0,     97,      0,     48,      0,     52,   6942,      4],
       [     0,      0,      0,    258,      0,      5,      0,      0,
             0,      0,      0,    137,      0,    203,  18853,    228],
       [     0,      0,      0,   5103,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      1,      3,      2],
       [     0,      0,      0,     44,      0,      0,      0,      0,
             0,     15,      0,     29,      0,     94,   2299,     26],
       [     0,      0,      0,    137,      0,      0,      0,      0,
             0,      0,      0,    291,      0,     53,   5078,    229],
       [     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,   

In [119]:
# Same experiment with gradient boosting.
feature_cols = feature_creation(df)
lbl = 'col_max_row_remapped'
model_type = 'gradient_boost'

X = df[feature_cols]
y = df[lbl]

# Positional split (first 66% train, rest test); rows are not shuffled.
X_train, X_test, y_train, y_test = tts(X,y)

model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='auto')

In [120]:
# Recompute so that y_pred_proba holds the full per-class probability matrix
# (model_chooser returns only column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [121]:
# total accuracy
# All averaged scores use average='micro'; in the recorded output they are
# identical to the accuracy, so they add no per-class information.
# NOTE(review): metrics.jaccard_similarity_score is not available in newer
# scikit-learn releases (jaccard_score replaces it, with different multiclass
# semantics) - confirm against the installed version before re-running.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Hamming Loss: {:0.4f}'.format(metrics.hamming_loss(y_test, y_pred)))
print('F1 Score: {:0.4f}'.format(metrics.f1_score(y_test, y_pred, average='micro')))
print('Precision Score: {:0.4f}'.format(metrics.precision_score(y_test, y_pred, average='micro')))
print('Recall Score: {:0.4f}'.format(metrics.recall_score(y_test, y_pred, average='micro')))
print('Jaccard Similarity Score: {:0.4f}'.format(metrics.jaccard_similarity_score(y_test, y_pred)))
metrics.confusion_matrix(y_test,y_pred)
# print('Log Loss: {:0.4f}'.format(metrics.log_loss(y_test,y_pred_proba)))

Total Accuracy Score: 0.7576
Hamming Loss: 0.2424
F1 Score: 0.7576
Precision Score: 0.7576
Recall Score: 0.7576
Jaccard Similarity Score: 0.7576


array([[     0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,     12,
             0],
       [     0,    430,      0,      0,      2,      0,      0,      0,
          1101,   2327,      0,      0,      0,      0,      0,   3296,
             0],
       [     3,      0,   9094,     25,      0,     67,      0,      0,
             3,      0,      0,      2,      0,      1,      0,  10489,
             0],
       [     3,      0,     29,   4860,      0,      2,      0,      0,
             0,      0,      0,      0,      0,      0,      0,    151,
            64],
       [     0,      0,     54,     19,     63,     55,      0,      0,
           166,     83,      0,      0,      1,      8,      0,   2058,
             0],
       [     0,      0,    171,      0,     34,   2210,      1,      0,
            82,      0,      2,     75,      0,      0,      0,   3213,
             0],
       [     0,      0,      0,   

In [126]:
# Collapse the 20 power columns into four integer category codes (1-4),
# giving a coarser classification target than the per-column one.
remap = {
    circuit: category
    for category, circuits in [
        (1, ['dishwasher', 'dryer', 'washer', 'oven']),
        (3, ['dhw_heater', 'refrigerator']),
        (4, ['furnace_fan', 'heat_pump']),
        (2, ['basement', 'dining_room', 'entertainment', 'garage',
             'south_br', 'north_br', 'security_system', 'utility_room',
             'workbench', 'outside_plug', 'unmetered', 'office']),
    ]
    for circuit in circuits
}

In [127]:
# Translate each row's largest-column name into its category code from `remap`.
df['col_max_row_categorized'] = df['col_max_row'].map(remap)

In [135]:
# Random forest again, now predicting the 4-category target instead of the
# individual column.
feature_cols = feature_creation(df)
lbl = 'col_max_row_categorized'
model_type = 'random_forest'

X = df[feature_cols]
y = df[lbl]

# Positional split (first 66% train, rest test); rows are not shuffled.
X_train, X_test, y_train, y_test = tts(X,y)

model, y_pred, y_pred_proba = model_chooser(model_type,X_train,X_test,y_train,n_estimators=100,max_features='auto')

In [136]:
# Recompute so that y_pred_proba holds the full per-class probability matrix
# (model_chooser returns only column 1).
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [147]:
# total accuracy, per-class precision/recall/F1, then the confusion matrix
# NOTE(review): the recorded output under this cell lists Hamming / F1 /
# Precision / Recall / Jaccard lines that this code does not print, so it
# appears to come from an earlier version of the cell - re-run to refresh.
print('Total Accuracy Score: {:0.4f}'.format(metrics.accuracy_score(y_test, y_pred)))
print(metrics.classification_report(y_test, y_pred))
metrics.confusion_matrix(y_test,y_pred)
# print('Log Loss: {:0.4f}'.format(metrics.log_loss(y_test,y_pred_proba)))

Total Accuracy Score: 0.8550
Hamming Loss: 0.1450
F1 Score: 0.8550
Precision Score: 0.8550
Recall Score: 0.8550
Jaccard Similarity Score: 0.8550


array([[  8763,   5591,    149,     97],
       [   636, 150935,   7844,   6722],
       [    38,  12839,  31511,    683],
       [    88,  10938,   6167, 114265]])

In [149]:
# Per-class precision / recall / F1 for the 4-category model.
# `cr` was not defined in any remaining cell, so it is computed here to keep
# the notebook runnable from a fresh kernel.
cr = metrics.classification_report(y_test, y_pred)
print(cr)

             precision    recall  f1-score   support

          1       0.92      0.60      0.73     14600
          2       0.84      0.91      0.87    166137
          3       0.69      0.70      0.69     45071
          4       0.94      0.87      0.90    131458

avg / total       0.86      0.86      0.85    357266



In [138]:
# Feature importances of the 4-category random forest, largest first.
# Column 0 is the feature name, column 1 the importance.
pd.DataFrame(list(zip(feature_cols,model.feature_importances_))).sort_values(by=1,ascending=False)

Unnamed: 0,0,1
0,main_house_total,0.213086
8,rolling_min_5,0.091607
7,rolling_max_5,0.068163
6,rolling_mean_5,0.061255
9,rolling_median_5,0.052528
5,rolling_sum_5,0.049463
13,rolling_min_10,0.047009
28,rolling_min_60,0.03218
12,rolling_max_10,0.032034
23,rolling_min_30,0.030393
