In [1]:
#%matplotlib notebook
%matplotlib inline

import json
from collections import OrderedDict
import copy
import numpy as np
import matplotlib.pyplot as plt
from _converter import SensorThings2Dict
from _plotter import plot_confusion_matrix

In [2]:
with open("ABU1.txt") as f:
    data = []
    bad = 0
    for line in f:
        try:
            features = SensorThings2Dict(line)
            data.append(list(features.values()))
        except Exception, e:
            bad+=1
print("Incomplete rows: {}".format(bad))
print("Loaded: {}".format(len(data)))

Incomplete rows: 170
Loaded: 30768


In [7]:
from sklearn import preprocessing
data = np.asarray(data)
scaler = preprocessing.StandardScaler().fit(data[:,2:-1])
data[:,2:-1] = scaler.transform(data[:,2:-1])

In [8]:
# data = np.asarray(data)
# from sklearn.preprocessing import normalize
# data[:,2:-1] = normalize(data[:,2:-1])
#print(data[:,2:-1])
# numerics = data[:,2:-1]
# for n in numerics[data[:,-1]=='True']:
#     plt.plot(n);
mask = np.random.rand(len(data)) < 0.9
train = data[mask]
test = data[~mask]
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))
print("Test  Total: {} Good: {} Faulty: {} Ratio: {}".format(len(test), len(test[test[:,-1]=='True']), len(test[test[:,-1]=='False']), float(len(test[test[:,-1]=='False']))/len(train)))

Train Total: 27801 Good: 26186 Faulty: 1615 Ratio: 0.0580914355599
Test  Total: 2967 Good: 2797 Faulty: 170 Ratio: 0.00611488795367


In [9]:
""" Balance data """
def balanceData():
    global train
    faulty = train[train[:,-1]=='False']
    not_faulty = train[train[:,-1]=='True']
    train = np.concatenate((not_faulty, np.repeat(faulty, 15, axis=0))) # repeate faulties
    #train = np.concatenate((not_faulty[:len(faulty)*2], faulty))
    print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))
    train = train[np.random.permutation(train.shape[0])] # shuffle data
balanceData()

Train Total: 50411 Good: 26186 Faulty: 24225 Ratio: 0.480549879987


In [10]:
train_data = train[:,2:-1].astype(np.float32)
test_data = test[:,2:-1].astype(np.float32)
"""
Quality_OK is mapped to Faultiness
    'False' -> 1 (Faulty)
    'True'  -> 0 (Good)
"""
train_labels = np.array(train[:,-1]=='False').astype(np.int32)
test_labels = np.array(test[:,-1]=='False').astype(np.int32)

In [14]:
from IPython.display import display, HTML
import pandas as pd

def eval_metrics(expected, predicted):
    print("------------------------- EVALUATION -------------------------")
#     print("Accuracy Score: {}".format(metrics.accuracy_score(expected, predicted)))
    print(metrics.classification_report(expected, predicted))
    print("Confusion Matrix:")
#     print(metrics.confusion_matrix(expected, predicted))
    display(pd.DataFrame(metrics.confusion_matrix(expected, predicted)))
#     print("Kappa Score: {}".format(metrics.cohen_kappa_score(expected, predicted)))
    print("Matthews Correlation Coefficient: {}".format(metrics.matthews_corrcoef(expected, predicted)))
    print("--------------------------------------------------------------")
#     plot_confusion_matrix(metrics.confusion_matrix(expected, predicted),classes=['Good','Faulty'],title='Confusion matrix',normalize=True)
#     plt.show()

In [15]:
""" MLP """
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

solvers = ['lbfgs', 'sgd', 'adam']
clf = MLPClassifier(solver=solvers[1], alpha=1e-0, hidden_layer_sizes=(50,100,50), random_state=1, 
                    max_iter=1000, verbose=True, tol=1e-10, learning_rate='adaptive')
%time clf.fit(train_data, train_labels)

eval_metrics(train_labels, clf.predict(train_data))
eval_metrics(test_labels, clf.predict(test_data))

Iteration 1, loss = 1.15854799
Iteration 2, loss = 1.13791404
Iteration 3, loss = 1.12084897
Iteration 4, loss = 1.10545460
Iteration 5, loss = 1.09102568
Iteration 6, loss = 1.07735409
Iteration 7, loss = 1.06412014
Iteration 8, loss = 1.05123378
Iteration 9, loss = 1.03870510
Iteration 10, loss = 1.02639776
Iteration 11, loss = 1.01433466
Iteration 12, loss = 1.00240710
Iteration 13, loss = 0.99056632
Iteration 14, loss = 0.97883702
Iteration 15, loss = 0.96716927
Iteration 16, loss = 0.95555723
Iteration 17, loss = 0.94384579
Iteration 18, loss = 0.93207165
Iteration 19, loss = 0.92020235
Iteration 20, loss = 0.90819952
Iteration 21, loss = 0.89587024
Iteration 22, loss = 0.88345574
Iteration 23, loss = 0.87091246
Iteration 24, loss = 0.85830853
Iteration 25, loss = 0.84567266
Iteration 26, loss = 0.83292592
Iteration 27, loss = 0.82007018
Iteration 28, loss = 0.80742872
Iteration 29, loss = 0.79460056
Iteration 30, loss = 0.78196726
Iteration 31, loss = 0.76953872
Iteration 32, los

Unnamed: 0,0,1
0,23129,3057
1,405,23820


Matthews Correlation Coefficient: 0.867794315291
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       0.96      0.83      0.89      2797
          1       0.12      0.37      0.18       170

avg / total       0.91      0.81      0.85      2967

Confusion Matrix:


Unnamed: 0,0,1
0,2333,464
1,107,63


Matthews Correlation Coefficient: 0.124474465705
--------------------------------------------------------------


In [180]:
""" Support Vector Machine """
from sklearn.svm import SVC
from sklearn import metrics

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
clf = SVC(random_state=1, kernel=kernels[2], class_weight='balanced', gamma=0.009, degree=5, C=1.5,tol=1e-6)
%time clf.fit(train_data, train_labels)

eval_metrics(train_labels, clf.predict(train_data))
eval_metrics(test_labels, clf.predict(test_data))

Wall time: 51.5 s
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       0.93      0.85      0.89     10959
          1       0.85      0.93      0.89      9930

avg / total       0.89      0.89      0.89     20889

Confusion Matrix:


Unnamed: 0,0,1
0,9361,1598
1,735,9195


Kappa Score: 0.776995392204
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       0.96      0.81      0.88      1198
          1       0.12      0.45      0.19        71

avg / total       0.91      0.79      0.84      1269

Confusion Matrix:


Unnamed: 0,0,1
0,969,229
1,39,32


Kappa Score: 0.114910760294
--------------------------------------------------------------


In [193]:
""" OneClassSVM """
from sklearn.svm import OneClassSVM
from sklearn import metrics

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
clf = OneClassSVM(kernel=kernels[1], degree=5, gamma=0.02)
clf.fit(train_data[train_labels==0])

eval_metrics(train_labels*-2+1, clf.predict(train_data))
eval_metrics(test_labels*-2+1, clf.predict(test_data))

------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

         -1       0.50      0.86      0.63      9930
          1       0.65      0.23      0.34     10959

avg / total       0.58      0.53      0.48     20889

Confusion Matrix:


Unnamed: 0,0,1
0,8550,1380
1,8450,2509


Kappa Score: 0.0870613238608
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

         -1       0.05      0.86      0.10        71
          1       0.92      0.10      0.18      1198

avg / total       0.87      0.14      0.17      1269

Confusion Matrix:


Unnamed: 0,0,1
0,61,10
1,1081,117


Kappa Score: -0.00533427054613
--------------------------------------------------------------
