## **Importing Libraries**

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
import time
import math
import warnings
from sklearn import preprocessing
warnings.filterwarnings("ignore")
import glob
import xgboost
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score,KFold
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV   #Perforing grid search
from scipy.stats import skew
from collections import OrderedDict
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [6]:
def read_label():
    label = {}
    for i in range(1, 7):
        hi = 'E:/IBM_Hack/low_freq/house_{}/labels.dat'.format(i)
        label[i] = {}
        with open(hi) as f:
            for line in f:
                splitted_line = line.split(' ')
                label[i][int(splitted_line[0])] = splitted_line[1].strip() + '_' + splitted_line[0]
    return label
labels = read_label()
for i in range(1,7):
    print('House {}: '.format(i), labels[i], '\n')

House 1:  {1: 'mains_1', 2: 'mains_2', 3: 'oven_3', 4: 'oven_4', 5: 'refrigerator_5', 6: 'dishwaser_6', 7: 'kitchen_outlets_7', 8: 'kitchen_outlets_8', 9: 'lighting_9', 10: 'washer_dryer_10', 11: 'microwave_11', 12: 'bathroom_gfi_12', 13: 'electric_heat_13', 14: 'stove_14', 15: 'kitchen_outlets_15', 16: 'kitchen_outlets_16', 17: 'lighting_17', 18: 'lighting_18', 19: 'washer_dryer_19', 20: 'washer_dryer_20'} 

House 2:  {1: 'mains_1', 2: 'mains_2', 3: 'kitchen_outlets_3', 4: 'lighting_4', 5: 'stove_5', 6: 'microwave_6', 7: 'washer_dryer_7', 8: 'kitchen_outlets_8', 9: 'refrigerator_9', 10: 'dishwaser_10', 11: 'disposal_11'} 

House 3:  {1: 'mains_1', 2: 'mains_2', 3: 'outlets_unknown_3', 4: 'outlets_unknown_4', 5: 'lighting_5', 6: 'electronics_6', 7: 'refrigerator_7', 8: 'disposal_8', 9: 'dishwaser_9', 10: 'furance_10', 11: 'lighting_11', 12: 'outlets_unknown_12', 13: 'washer_dryer_13', 14: 'washer_dryer_14', 15: 'lighting_15', 16: 'microwave_16', 17: 'lighting_17', 18: 'smoke_alarms_18'

# **Data Aggregation**

In [7]:
def read_merge_data(house):
    path = 'E:/IBM_Hack/low_freq_rect_samp/house_{}/'.format(house)
    file = path + 'channel_1_1.dat'
    
    df = pd.read_table(file, sep = ' ', names = ['unix_time', labels[house][1]], 
                                       dtype = {'unix_time': 'int64', labels[house][1]:'float64'}) 
    num_apps = len(glob.glob(path + 'channel*'))
    
    for i in range(2, num_apps + 1):
        file = path + 'channel_{}'.format(i)+'_1.dat'
        data = pd.read_table(file, sep = ' ', names = ['unix_time', labels[house][i]], 
                                       dtype = {'unix_time': 'int64', labels[house][i]:'float64'})
        df = pd.merge_asof(df, data,on = 'unix_time',tolerance=1800)
    
    df=df.where(pd.notnull(df),0)
    df['total']=df.sum(axis=1)
    appliances=list(df.keys())

    normalized_df=preprocessing.scale(df.loc[:,appliances[3]:appliances[-1]])
    df1=pd.DataFrame(normalized_df,columns=appliances[3:])
    df1=abs(df1)
    df1['unix_time'] = df['unix_time']
    df1['timestamp'] = df['unix_time'].astype("datetime64[s]")
    df1 = df1.set_index(df1['timestamp'].values)
    df1.drop(['unix_time','timestamp'], axis=1, inplace=True)
    
    df1['time_bucket'] = pd.cut(df1['total'], [0,0.7,0.9,1.2,100], labels=['1','2','3','4'])

    print("for house"+str(house)+"\n")
    print(df1['time_bucket'].value_counts().to_string())
    print("\n"+"-----------"+"\n")

#     f= open('E:/IBM_Hack/low_freq_rect_norm/house_{}/'.format(house)+'channel_{}'.format(house)+'.dat','a+') 
#     f.write(df1.to_string())
#     f.close()
    return df1
df = {}
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_rows',-1)
for i in range(1,7):
    df[i] = read_merge_data(i)

for house1

1    433
3    229
4    108
2     98

-----------

for house2

1    334
3    139
4     99
2     94

-----------

for house3

3    229
4    199
2    193
1    172

-----------

for house4

1    312
4    272
3    218
2    132

-----------

for house5

3    101
1     41
2     16
4     10

-----------

for house6

1    153
4    142
3    120
2     79

-----------



In [8]:
dates = {}
for i in range(1,7):
    dates[i] = [str(time)[:10] for time in df[i].index.values]
    dates[i] = sorted(list(set(dates[i])))
    print('House {0} data contain {1} days from {2} to {3}.'.format(i,len(dates[i]),dates[i][0], dates[i][-1]))
    print(dates[i], '\n')

House 1 data contain 23 days from 2011-04-18 to 2011-05-24.
['2011-04-18', '2011-04-19', '2011-04-20', '2011-04-21', '2011-04-22', '2011-04-23', '2011-04-24', '2011-04-25', '2011-04-26', '2011-04-27', '2011-04-28', '2011-04-30', '2011-05-01', '2011-05-02', '2011-05-03', '2011-05-06', '2011-05-07', '2011-05-11', '2011-05-12', '2011-05-13', '2011-05-22', '2011-05-23', '2011-05-24'] 

House 2 data contain 17 days from 2011-04-17 to 2011-05-22.
['2011-04-17', '2011-04-18', '2011-04-19', '2011-04-20', '2011-04-21', '2011-04-22', '2011-04-23', '2011-04-24', '2011-04-25', '2011-04-26', '2011-04-27', '2011-04-28', '2011-04-29', '2011-04-30', '2011-05-01', '2011-05-02', '2011-05-22'] 

House 3 data contain 24 days from 2011-04-16 to 2011-05-28.
['2011-04-16', '2011-04-17', '2011-04-18', '2011-04-19', '2011-04-20', '2011-04-21', '2011-04-22', '2011-04-23', '2011-04-24', '2011-04-25', '2011-04-26', '2011-04-27', '2011-04-28', '2011-05-17', '2011-05-18', '2011-05-19', '2011-05-21', '2011-05-22', '

# **Training on Timestamp Bucketting**

## **Grid Search on KNN**

In [19]:
params = {"n_neighbors": np.arange(1, 28, 1),"algorithm" : ["brute"],
         "leaf_size": np.arange(1,50,1), "metric": ["cityblock"]}
nn_param=[0]*(len(labels)+1)
algo_param=[" "]*(len(labels)+1)
leaf_param=[0]*(len(labels)+1)
metric_param=[" "]*(len(labels)+1)
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[1:,0:-2], df[i]['time_bucket'][1:], test_size=0.20, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        model = KNeighborsClassifier()
        grid = GridSearchCV(model, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
        nn_param[i]=grid.best_params_['n_neighbors']
        algo_param[i]=grid.best_params_['algorithm']
        leaf_param[i]=grid.best_params_['leaf_size']
        metric_param[i]=grid.best_params_['metric']
        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")
print(metric_param)

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 78.13 seconds
[INFO] grid search accuracy: 55.75%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 3}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 38.09 seconds
[INFO] grid search accuracy: 51.88%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 14}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 44.43 seconds
[INFO] grid search accuracy: 55.97%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 3}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 57.71 seconds
[INFO] grid search accuracy: 60.96%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 1}
------------


## **Using KNN**

In [15]:
avg=0
c=0
predicted = {}
predict_prob = {}
expected = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[1:,0:-2], df[i]['time_bucket'][1:], test_size=0.25, random_state=123)
        knn = KNeighborsClassifier(n_neighbors=6,leaf_size=1,metric='cityblock',algorithm='brute')
        knn.fit(X_train,y_train)
        expected[i]=y_test
        predicted[i]=knn.predict(X_test)
        predict_prob[i]=knn.predict_proba(X_test)
        avg+=knn.score(X_test,y_test)*100
        c+=1
print(" Accuracy on Test Set : {:.2f}%".format(avg/c))


 Accuracy on Test Set : 57.81%


In [9]:
for i in range(1,len(labels)+1):
    print("for house "+str(i)+"\n")
    print("expected : "+str(expected[i][0:11]))
    print("prediction : "+str(predicted[i][0:11]))
    print("prediction probability: "+str(predict_prob[i][0:11]))
    print("--------------------------")

for house 1

expected :                        1
2011-05-02 16:27:37    3
2011-05-12 10:48:51    1
2011-04-24 02:32:27    1
2011-04-24 03:32:27    3
2011-04-19 15:56:39    3
2011-05-11 16:48:51    4
2011-05-23 12:36:31    1
2011-04-25 09:07:08    2
2011-04-22 17:00:45    2
2011-04-22 10:00:44    3
2011-05-12 10:48:51    1
2011-04-24 02:32:27    1
2011-04-24 03:32:27    3
2011-04-19 15:56:39    3
2011-05-11 16:48:51    4
2011-05-23 12:36:31    1
2011-04-25 09:07:08    2
2011-04-22 17:00:45    2
2011-04-22 10:00:44   ..
2011-04-19 16:57:17    3
Name: time_bucket, Length: 11, dtype: category
Categories (4, object): [1 < 2 < 3 < 4]
prediction : ['1' '3' '3' '1' '1' '3' '1' '1' '3' '1' '1']
prediction probability: [[0.83333333 0.16666667 0.         0.        ]
 [0.33333333 0.         0.5        0.16666667]
 [0.33333333 0.         0.5        0.16666667]
 [0.83333333 0.         0.16666667 0.        ]
 [0.66666667 0.16666667 0.16666667 0.        ]
 [0.         0.         1.         0.        ]

## **Grid search on Random Forest**

In [11]:
params = {'max_depth': [10,15,20,25,35,45,50,100,120],'max_features': ["log2", "sqrt", "auto"],
          'n_estimators': [20,50,100, 200, 300, 1000, 1500]}

for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[1:,0:-2], df[i]['time_bucket'][1:], test_size=0.20, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        model = RandomForestClassifier()
        grid = GridSearchCV(model, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
       
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))

        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")


[INFO] tuning hyperparameters via grid search
[INFO] grid search took 67.59 seconds
[INFO] grid search accuracy: 68.97%
[INFO] grid search best parameters: {'max_depth': 100, 'max_features': 'auto', 'n_estimators': 200}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 53.18 seconds
[INFO] grid search accuracy: 55.64%
[INFO] grid search best parameters: {'max_depth': 25, 'max_features': 'log2', 'n_estimators': 100}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 61.76 seconds
[INFO] grid search accuracy: 69.18%
[INFO] grid search best parameters: {'max_depth': 45, 'max_features': 'sqrt', 'n_estimators': 300}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 60.60 seconds
[INFO] grid search accuracy: 73.80%
[INFO] grid search best parameters: {'max_depth': 120, 'max_features': 'sqrt', 'n_estimators': 1000}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search 

## **Using Random Forest**

In [16]:
avg=0
c=0
predicted1 = {}
predict_prob1 = {}
expected1 = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[1:,0:-2], df[i]['time_bucket'][1:], test_size=0.20, random_state=123)

        rf=RandomForestClassifier(n_estimators=300,max_depth=15,n_jobs =-1,
                                  max_features = "sqrt", min_samples_split=10,
                                  min_samples_leaf=5,random_state=123)
        rf.fit(X_train,y_train)
        expected1[i]=y_test
        predicted1[i]=rf.predict(X_test)
        predict_prob1[i]=rf.predict_proba(X_test)
        avg+=rf.score(X_test,y_test)*100
        c+=1
           
print(" Random Forest Accuracy on Test Set : {:.2f}%".format(avg/c))


 Random Forest Accuracy on Test Set : 69.59%


## **Grid Search on XGBClassifer**

In [13]:
params = {
        'min_child_weight': [1, 3, 5, 7, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7]
        }
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[1:,0:-1], df[i]['time_bucket'][1:], test_size=0.20, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        xgb = xgboost.XGBClassifier(learning_rate=0.1, booster ='gbtree',objective='multi:softprob',
                                    silent=True, nthread=None, n_jobs=-1,num_class=3)
        grid = GridSearchCV(xgb, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
        
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))

        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")


[INFO] tuning hyperparameters via grid search
[INFO] grid search took 442.61 seconds
[INFO] grid search accuracy: 99.43%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 409.47 seconds
[INFO] grid search accuracy: 100.00%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 467.41 seconds
[INFO] grid search accuracy: 99.37%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 495.92 seconds
[INFO] grid search accuracy: 100.00%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.

## **Using XGBClassifier**

In [10]:
xg_clas=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                               colsample_bytree=0.6, gamma=85, learning_rate=0.3, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
                               n_jobs=-1, nthread=None, objective='multi:softprob', random_state=123,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                               silent=True, subsample=0.8,num_class=4)
avg=0
c=0
predicted = {}
predict_prob = {}
expected = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df[i].iloc[0:,0:-1], df[i]['time_bucket'][0:], test_size=0.20, random_state=123)

        xg_clas.fit(X_train,y_train)
        expected[i]=y_test
        predicted[i]=xg_clas.predict(X_test)
        predict_prob[i]=xg_clas.predict_proba(X_test)
        avg+=xg_clas.score(X_test,y_test)*100
        c+=1
            
print(" XGBoost Accuracy on Test Set : {:.2f}%".format(avg/c))



 XGBoost Accuracy on Test Set : 89.79%


In [19]:
for i in range(1,len(labels)+1):
    print("for house "+str(i)+"\n")
    print("expected : "+str(expected[i][0:5]))
    print("prediction : "+str(predicted[i][0:5]))
    print("prediction probability: "+str(predict_prob[i][0:5]))
    print("--------------------------")

for house 1

expected :                        1
2011-05-01 09:24:47    3
2011-04-19 16:27:17    1
2011-04-30 16:54:46    4
2011-05-22 20:33:51    3
2011-04-19 16:27:17    1
2011-04-30 16:54:46    4
2011-05-22 20:33:51   ..
2011-04-25 01:37:08    1
Name: time_bucket, Length: 5, dtype: category
Categories (4, object): [1 < 2 < 3 < 4]
prediction : ['1' '3' '1' '4' '1']
prediction probability: [[0.79170346 0.05072726 0.11044642 0.0471229 ]
 [0.2097609  0.12910151 0.5412762  0.11986138]
 [0.79170346 0.05072726 0.11044642 0.0471229 ]
 [0.20234112 0.12453487 0.26467165 0.4084524 ]
 [0.79170346 0.05072726 0.11044642 0.0471229 ]]
--------------------------
for house 2

expected :                        1
2011-04-24 01:52:10    1
2011-04-27 07:31:55    1
2011-04-24 23:52:19    2
2011-04-29 22:32:02    1
2011-04-27 07:31:55    1
2011-04-24 23:52:19    2
2011-04-29 22:32:02   ..
2011-04-23 08:47:03    1
Name: time_bucket, Length: 5, dtype: category
Categories (4, object): [1 < 2 < 3 < 4]
predicti

## **Application Bucketting**

In [12]:
df1 = {}
for i in range(1,7):
    df1[i] = df[i].iloc[:,0:-2].transpose()
    df1[i]['usage'] = abs(preprocessing.scale(df1[i].sum(axis=1)))
    df1[i]['app_bucket'] = pd.cut(df1[i]['usage'], [0,0.5,1,100], labels=['1','2','3'])
    print("for house"+str(i)+"\n")
    print(df1[i]['app_bucket'].value_counts().to_string())
    print("\n"+"-----------"+"\n")

# print(df1[2].iloc[0:2,0].to_string())

for house1

3    8
2    6
1    4

-----------

for house2

3    4
2    3
1    2

-----------

for house3

3    8
2    8
1    4

-----------

for house4

2    8
3    7
1    3

-----------

for house5

3    8
2    8
1    8

-----------

for house6

3    7
2    4
1    4

-----------



## **Grid Search on KNN**

In [27]:
params = {"n_neighbors": np.arange(1, 3, 1),"algorithm" : ["brute"],
         "leaf_size": np.arange(1,50,1), "metric": ["minkowski","euclidean","cityblock"]}
nn_param=[0]*(len(labels)+1)
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[:,0:-1], df1[i]['app_bucket'], test_size=0.30, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        model = KNeighborsClassifier()
        grid = GridSearchCV(model, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
        
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
        nn_param[i]=grid.best_params_['n_neighbors']
        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")
# print(nn_param)

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 10.01 seconds
[INFO] grid search accuracy: 83.33%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 1}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 0.95 seconds
[INFO] grid search accuracy: 33.33%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 1}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 1.06 seconds
[INFO] grid search accuracy: 50.00%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'minkowski', 'n_neighbors': 1}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 1.16 seconds
[INFO] grid search accuracy: 100.00%
[INFO] grid search best parameters: {'algorithm': 'brute', 'leaf_size': 1, 'metric': 'cityblock', 'n_neighbors': 1}
------------

[I

## **Using KNN**

In [21]:
avg=0
c=0
predicted1 = {}
predict_prob1 = {}
expected1 = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[:,0:-1], df1[i]['app_bucket'], test_size=0.30, random_state=123)

        knn = KNeighborsClassifier(n_neighbors=1, algorithm = 'brute', leaf_size=1, metric = 'cityblock')
        knn.fit(X_train,y_train)
        expected1[i]=y_test
        predicted1[i]=knn.predict(X_test)
        predict_prob1[i]=knn.predict_proba(X_test)
        avg+=knn.score(X_test,y_test)*100
        c+=1
            
print(" Accuracy on Test Set : {:.2f}%".format(avg/c))

 Accuracy on Test Set : 67.64%


In [22]:
for i in range(1,len(labels)+1):
    print("for house "+str(i)+"\n")
    print("expected : \n"+expected1[i].to_string())
    print("prediction : \n"+str(predicted1[i]))
    print("prediction probability: \n"+str(predict_prob1[i]))
    print("--------------------------")

for house 1

expected : 
kitchen_outlets_7    3
stove_14             1
lighting_17          3
microwave_11         1
kitchen_outlets_8    2
washer_dryer_10      2
Categories (3, object): [1 < 2 < 3]
prediction : 
['3' '1' '3' '1' '3' '2']
prediction probability: 
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
--------------------------
for house 2

expected : 
dishwaser_10         3
kitchen_outlets_3    2
kitchen_outlets_8    1
Categories (3, object): [1 < 2 < 3]
prediction : 
['2' '1' '1']
prediction probability: 
[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]
--------------------------
for house 3

expected : 
lighting_17        3
disposal_8         1
refrigerator_7     3
bathroom_gfi_20    3
lighting_11        2
furance_10         2
Categories (3, object): [1 < 2 < 3]
prediction : 
['2' '3' '3' '3' '2' '2']
prediction probability: 
[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]
--------------------------
for house 4

expected : 
washer_dryer

## **Grid Search on Random Forest**

In [28]:
params = {'max_depth': [10,15,20,25,35,45],'max_features': ["log2", "sqrt", "auto"],
          'n_estimators': [20,50,100, 200, 300, 1000, 1500],'criterion':['gini','entropy']}

for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[0:,0:-1], df1[i]['app_bucket'][:], test_size=0.30, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        model = RandomForestClassifier()
        grid = GridSearchCV(model, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
        
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))

        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")


[INFO] tuning hyperparameters via grid search
[INFO] grid search took 51.92 seconds
[INFO] grid search accuracy: 83.33%
[INFO] grid search best parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 45.78 seconds
[INFO] grid search accuracy: 33.33%
[INFO] grid search best parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 50}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 48.95 seconds
[INFO] grid search accuracy: 83.33%
[INFO] grid search best parameters: {'criterion': 'gini', 'max_depth': 45, 'max_features': 'sqrt', 'n_estimators': 50}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 47.70 seconds
[INFO] grid search accuracy: 50.00%
[INFO] grid search best parameters: {'criterion': 'entropy', 'max_depth': 45, 'max_features': 'auto', 'n_estimators': 20

## **Using Random Forest**

In [23]:
avg=0
c=0
predicted1 = {}
predict_prob1 = {}
expected1 = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[0:,0:-1], df1[i]['app_bucket'][0:], test_size=0.30, random_state=123)

        rf=RandomForestClassifier(n_estimators=20,criterion='entropy',max_features = "log2",max_depth=45,random_state=123)
        rf.fit(X_train,y_train)
        expected1[i]=y_test
        predicted1[i]=rf.predict(X_test)
        predict_prob1[i]=rf.predict_proba(X_test)
        avg+=rf.score(X_test,y_test)*100
        c+=1
           
print(" Random Forest Accuracy on Test Set : {:.2f}%".format(avg/c))


 Random Forest Accuracy on Test Set : 66.81%


In [24]:
for i in range(1,len(labels)+1):
    print("for house "+str(i)+"\n")
    print("expected : \n"+expected1[i].to_string())
    print("prediction : \n"+str(predicted1[i]))
    print("prediction probability: \n"+str(predict_prob1[i]))
    print("--------------------------")

for house 1

expected : 
kitchen_outlets_7    3
stove_14             1
lighting_17          3
microwave_11         1
kitchen_outlets_8    2
washer_dryer_10      2
Categories (3, object): [1 < 2 < 3]
prediction : 
['3' '1' '3' '1' '3' '2']
prediction probability: 
[[0.15 0.05 0.8 ]
 [0.8  0.1  0.1 ]
 [0.15 0.05 0.8 ]
 [0.5  0.25 0.25]
 [0.   0.35 0.65]
 [0.   0.8  0.2 ]]
--------------------------
for house 2

expected : 
dishwaser_10         3
kitchen_outlets_3    2
kitchen_outlets_8    1
Categories (3, object): [1 < 2 < 3]
prediction : 
['2' '2' '2']
prediction probability: 
[[0.2  0.5  0.3 ]
 [0.1  0.55 0.35]
 [0.35 0.45 0.2 ]]
--------------------------
for house 3

expected : 
lighting_17        3
disposal_8         1
refrigerator_7     3
bathroom_gfi_20    3
lighting_11        2
furance_10         2
Categories (3, object): [1 < 2 < 3]
prediction : 
['3' '1' '3' '2' '2' '2']
prediction probability: 
[[0.15 0.4  0.45]
 [0.4  0.4  0.2 ]
 [0.15 0.35 0.5 ]
 [0.05 0.8  0.15]
 [0.2  0.55

## **Grid Search on XGBClassifier**

In [30]:
params = {
        'min_child_weight': [1, 3, 5, 7, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7]
        }
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[0:,0:-1], df1[i]['app_bucket'][:], test_size=0.30, random_state=123)
        print("[INFO] tuning hyperparameters via grid search")
        xgb = xgboost.XGBClassifier(learning_rate=0.1, booster ='gbtree',objective='multi:softprob',
                                    silent=True, nthread=None, n_jobs=-1,num_class=3)
        grid = GridSearchCV(xgb, params, n_jobs=-1)
        start = time.time()
        grid.fit(X_train, y_train)
        
        print("[INFO] grid search took {:.2f} seconds".format(
            time.time() - start))
        acc = grid.score(X_test, y_test)
        print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))

        print("[INFO] grid search best parameters: {}".format(
            grid.best_params_))
        print("------------\n")


[INFO] tuning hyperparameters via grid search
[INFO] grid search took 485.41 seconds
[INFO] grid search accuracy: 100.00%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 402.22 seconds
[INFO] grid search accuracy: 33.33%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 466.51 seconds
[INFO] grid search accuracy: 83.33%
[INFO] grid search best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.8}
------------

[INFO] tuning hyperparameters via grid search
[INFO] grid search took 495.92 seconds
[INFO] grid search accuracy: 83.33%
[INFO] grid search best parameters: {'colsample_bytree': 1.0, 'gamma': 1, 

## **Using XGBClassifier**

In [13]:
xg_clas=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                               colsample_bytree=0.6, gamma=0.7, learning_rate=0.3, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
                               n_jobs=-1, nthread=None, objective='multi:softprob', random_state=123,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                               silent=True, subsample=0.8,num_class=4)
avg=0
c=0
predicted1 = {}
predict_prob1 = {}
expected1 = {}
for i in range(1,len(labels)+1):
        X_train, X_test, y_train, y_test = train_test_split(df1[i].iloc[0:,0:-1], df1[i]['app_bucket'][0:], test_size=0.30, random_state=123)

        xg_clas.fit(X_train,y_train)
        expected1[i]=y_test
        predicted1[i]=xg_clas.predict(X_test)
        predict_prob1[i]=xg_clas.predict_proba(X_test)
        avg+=xg_clas.score(X_test,y_test)*100
        c+=1
           
print(" XGBoost Accuracy on Test Set : {:.2f}%".format(avg/c))



 XGBoost Accuracy on Test Set : 80.00%


In [26]:
for i in range(1,len(labels)+1):
    print("for house "+str(i)+"\n")
    print("expected : \n"+expected1[i].to_string())
    print("prediction : \n"+str(predicted1[i]))
    print("prediction probability: \n"+str(predict_prob1[i]))
    print("--------------------------")

for house 1

expected : 
kitchen_outlets_7    3
stove_14             1
lighting_17          3
microwave_11         1
kitchen_outlets_8    2
washer_dryer_10      2
Categories (3, object): [1 < 2 < 3]
prediction : 
['3' '1' '3' '1' '1' '2']
prediction probability: 
[[0.0832613  0.06460833 0.85213035]
 [0.5110792  0.22984698 0.25907382]
 [0.10713131 0.06292606 0.82994264]
 [0.5110792  0.22984698 0.25907382]
 [0.35115486 0.34508127 0.3037639 ]
 [0.11331306 0.78728783 0.09939905]]
--------------------------
for house 2

expected : 
dishwaser_10         3
kitchen_outlets_3    2
kitchen_outlets_8    1
Categories (3, object): [1 < 2 < 3]
prediction : 
['3' '2' '2']
prediction probability: 
[[0.07700992 0.26235744 0.6606326 ]
 [0.20251164 0.42034116 0.37714723]
 [0.15951128 0.5434231  0.2970656 ]]
--------------------------
for house 3

expected : 
lighting_17        3
disposal_8         1
refrigerator_7     3
bathroom_gfi_20    3
lighting_11        2
furance_10         2
Categories (3, object)