In [3]:
%matplotlib notebook
#%matplotlib inline

import copy
import numpy as np
import random
import matplotlib.pyplot as plt
from _converter import SensorThings2Dict
from _plotter import plot_confusion_matrix

In [4]:
print("Loading rows...")
with open("ABU1.txt") as f:
    data = []
    bad = 0
    for line in f:
        try:
            features = SensorThings2Dict(line)
            data.append(list(features.values()))
        except Exception, e:
            bad+=1
print("Incomplete rows: {}".format(bad))
print("Loaded: {}".format(len(data)))

""" random split seed """
data = np.asarray(data)
#print(data[:,2:-1])
# numerics = data[:,2:-1]
# for n in numerics[data[:,-1]=='True']:
#     plt.plot(n);
mask = np.random.rand(len(data)) < 0.9

Loading rows...
Incomplete rows: 170
Loaded: 30768


In [5]:
train = data[mask]
test = data[~mask]
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))
print("Test  Total: {} Good: {} Faulty: {} Ratio: {}".format(len(test), len(test[test[:,-1]=='True']), len(test[test[:,-1]=='False']), float(len(test[test[:,-1]=='False']))/len(train)))

faulty = train[train[:,-1]=='False']
not_faulty = train[train[:,-1]=='True']

Train Total: 27711 Good: 26106 Faulty: 1605 Ratio: 0.0579192378478
Test  Total: 3057 Good: 2877 Faulty: 180 Ratio: 0.00649561545956


In [6]:
means = np.mean(not_faulty[:,2:-1].astype(np.float32), axis=0)
print(means)

[  99.99999237  199.99961853   50.00053787   59.99983215   30.00049782
   40.00039673   54.99973297   20.99967003   33.00041199   41.00047302
   15.00090408   29.99968338   21.99883461   31.9986496   119.99898529
  119.99828339  600.00067139  599.99902344   50.00023651   60.00033951
   30.00026131   40.00045776   54.99936295   20.999897     33.00076675
   40.99879456   15.00036144   29.99931717   22.0002594    31.99923706
   99.96162415  249.99751282  159.96948242   50.00000381   60.00068283
   29.99971199   39.99873734   55.00039291   21.00016594   32.99966812
   41.00033951   15.00040627   30.0006485    21.99996948   31.99973106
   10.00106621   12.98250389    7.99255991    8.00312901    5.98762035
  200.0256958 ]


In [7]:
# train = np.concatenate((not_faulty[:len(faulty)*5], faulty))
samples = np.random.choice(len(not_faulty), 5000, replace=False)
train = np.concatenate((not_faulty[samples], faulty))
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))

train_data = train[:,2:-1].astype(np.float32)
test_data = test[:,2:-1].astype(np.float32)
"""
Quality_OK is mapped to Faultiness
    'False' -> 1 (Faulty)
    'True'  -> 0 (Good)
"""
train_labels = np.array(train[:,-1]=='False').astype(np.int32)
test_labels = np.array(test[:,-1]=='False').astype(np.int32)

Train Total: 6605 Good: 5000 Faulty: 1605 Ratio: 0.242997728993


In [8]:
from sklearn import metrics
from IPython.display import display, HTML
import pandas as pd

def eval_metrics(expected, predicted):
    print("------------------------- EVALUATION -------------------------")
#     print("Accuracy Score: {}".format(metrics.accuracy_score(expected, predicted)))
    print(metrics.classification_report(expected, predicted))
    print("Confusion Matrix:")
#     print(metrics.confusion_matrix(expected, predicted))
    display(pd.DataFrame(metrics.confusion_matrix(expected, predicted)))
#     print("Kappa Score: {}".format(metrics.cohen_kappa_score(expected, predicted)))
    print("Matthews Correlation Coefficient: {}".format(metrics.matthews_corrcoef(expected, predicted)))
    print("--------------------------------------------------------------")
#     plot_confusion_matrix(metrics.confusion_matrix(expected, predicted),classes=['Good','Faulty'],title='Confusion matrix',normalize=True)
#     plt.show()

In [26]:
""" Train for different feature spaces """
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ratio = float(len(train[train[:,-1]=='False']))/len(train)
criteria = ["gini", "entropy"]
clf = DecisionTreeClassifier(criterion=criteria[0], random_state=1, max_depth=100,  class_weight={0:1.0, 1:0.05})
# clf = RandomForestClassifier(n_estimators=10,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

preds = []
choices = np.arange(30,51+1, 5)
ensemples_per_feature_space = 1
for i,c in enumerate(choices):
    # train multiple trees for each f-space
    for _ in range(ensemples_per_feature_space):
#         r = np.random.choice(c, c*4/5, replace=False)
        r = np.arange(0,c)
        print(r.shape)
        clf.fit(train_data[:,r], train_labels)
        preds.append(clf.predict_proba(test_data[:,r]))
preds = np.asarray(preds)
print(preds.shape)

(51L,)
(1L, 3068L, 2L)


In [201]:
# calculate weight for each tree
weights = (choices/2)-14
print(np.sum(weights))
print(weights)
# take weighted average of predictions
# m = np.average(preds, axis=0, weights=np.repeat(weights, ensemples_per_feature_space, axis=0))
m = preds[-1]
predictions = (m[:,0]<m[:,1]).astype(np.int32)
eval_metrics(test_labels, predictions)

29
[ 1  3  6  8 11]
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      2848
          1       0.91      0.97      0.94       201

avg / total       0.99      0.99      0.99      3049

Confusion Matrix:


Unnamed: 0,0,1
0,2828,20
1,6,195


Matthews Correlation Coefficient: 0.933538703002
--------------------------------------------------------------


In [137]:
# """ ExtraTreeClassifier """
# from sklearn.tree import ExtraTreeClassifier

# # ratio = float(len(train[train[:,-1]=='False']))/len(train)
# clf = ExtraTreeClassifier(max_depth=70)
# %time t = clf.fit(train_data, train_labels)

# eval_metrics(train_labels, clf.predict(train_data))
# eval_metrics(test_labels, clf.predict(test_data))

In [138]:
""" Random Forest """
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

%time clf = clf.fit(train_data, train_labels)
eval_metrics(train_labels, clf.predict(train_data))
eval_metrics(test_labels, clf.predict(test_data))

Wall time: 4.3 s
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5000
          1       1.00      0.98      0.99      1584

avg / total       1.00      0.99      0.99      6584

Confusion Matrix:


Unnamed: 0,0,1
0,5000,0
1,33,1551


Matthews Correlation Coefficient: 0.986279138736
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      2848
          1       0.92      0.96      0.94       201

avg / total       0.99      0.99      0.99      3049

Confusion Matrix:


Unnamed: 0,0,1
0,2831,17
1,8,193


Matthews Correlation Coefficient: 0.935036792215
--------------------------------------------------------------


In [17]:
""" Fill remaining features with means """
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

%time clf = clf.fit(train_data, train_labels)
eval_metrics(train_labels, clf.predict(train_data))
eval_metrics(test_labels, clf.predict(test_data))

f = np.where(test_labels==0)[0]
partial_test = copy.deepcopy(test_data[f])

for r in partial_test:
    r[30:] = means[30:]
eval_metrics(test_labels[f], clf.predict(partial_test))

Wall time: 4.16 s
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5000
          1       1.00      0.99      1.00      1605

avg / total       1.00      1.00      1.00      6605

Confusion Matrix:


Unnamed: 0,0,1
0,5000,0
1,15,1590


Matthews Correlation Coefficient: 0.993826510117
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      2877
          1       0.92      0.99      0.95       180

avg / total       0.99      0.99      0.99      3057

Confusion Matrix:


Unnamed: 0,0,1
0,2861,16
1,1,179


Matthews Correlation Coefficient: 0.952574641858
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2877
          1       0.00      0.00      0.00         0

avg / total       1.00      1.00      1.00      2877

Confusion Matrix:


Unnamed: 0,0,1
0,2868,9
1,0,0


Matthews Correlation Coefficient: 0.0
--------------------------------------------------------------
