In [27]:
# %matplotlib notebook
%matplotlib inline

import copy
import numpy as np
import random
import matplotlib.pyplot as plt
from _converter import SensorThings2Dict
from _evaluation import print_metrics

_false = "False"
_true = "True"

In [28]:
import json
files = ["ABU1.txt"] # "ABU1.2.txt"
data = []
bad = 0
for filename in files:
    print("Loading rows... {}".format(filename))
    with open(filename) as f:
        for line in f:
            try:
                features = SensorThings2Dict(json.loads(line))
                data.append(list(features.values()))
            except Exception, e:
                bad+=1

print("Incomplete rows: {}".format(bad))
print("Loaded: {}".format(len(data)))

""" random split seed """
data = np.asarray(data)
print("Good: {} Faulty: {}".format(len(data[data[:,-1]==_true]), len(data[data[:,-1]==_false])))
#print(data[:,2:-1])
# numerics = data[:,2:-1]
# for n in numerics[data[:,-1]=='True']:
#     plt.plot(n);
mask = np.random.rand(len(data)) < 0.9

Loading rows... ABU1.txt
Incomplete rows: 170
Loaded: 30768
Good: 28983 Faulty: 1785


In [29]:


train = data[mask]
test = data[~mask]
faulty = train[train[:,-1]==_false]
not_faulty = train[train[:,-1]==_true]
fr = len(faulty)/float(len(train))
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(not_faulty), len(faulty), fr))
print("Test  Total: {} Good: {} Faulty: {} Ratio: {}".format(len(test), len(test[test[:,-1]==_true]), len(test[test[:,-1]==_false]), float(len(test[test[:,-1]==_false]))/len(train)))


print("Re-sampling...")
from scipy.stats import logistic

# Take random numbers from a logistic probability density function
def logistic_choice(total, sample_size, replace=False):
    p = logistic.pdf(np.arange(0,total), loc=0, scale=total/5)
    p /= np.sum(p)
    
    return np.random.choice(total, size=sample_size, replace=replace, p=p)

sample_size = np.min([5000, len(not_faulty)])
samples = logistic_choice(len(not_faulty), sample_size)
# TODO: Upsample faulties with logistic_choice(replace=True)
f_sample_size = np.min([1000, len(faulty)])
f_samples = logistic_choice(len(faulty), f_sample_size)
# Put samples together and shuffle
train = np.concatenate((not_faulty[samples], faulty[f_samples]))
train = np.random.permutation(train)

faulty = train[train[:,-1]==_false]
not_faulty = train[train[:,-1]==_true]
fr = len(faulty)/float(len(train))
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(not_faulty), len(faulty), fr))

train_data = train[:,2:-1].astype(np.float32)
test_data = test[:,2:-1].astype(np.float32)
"""
Quality_OK is mapped to Faultiness
    'False' -> 1 (Faulty)
    'True'  -> 0 (Good)
"""
train_labels = np.array(train[:,-1]==_false).astype(np.int32)
test_labels = np.array(test[:,-1]==_false).astype(np.int32)

Train Total: 27713 Good: 26102 Faulty: 1611 Ratio: 0.0581315628045
Test  Total: 3055 Good: 2881 Faulty: 174 Ratio: 0.00627864179266
Re-sampling...
Train Total: 6000 Good: 5000 Faulty: 1000 Ratio: 0.166666666667


In [30]:
""" Classification and Regression Trees (CART) """
from sklearn.tree import DecisionTreeClassifier

criteria = ["gini", "entropy"]
clf = DecisionTreeClassifier(criterion=criteria[0], random_state=1, max_depth=100,  class_weight={0:1-fr, 1:fr})

%time t = clf.fit(train_data, train_labels)
print_metrics(train_labels, clf.predict(train_data))
print_metrics(test_labels, clf.predict(test_data))

Wall time: 1.17 s
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[5000    0]
 [  12  988]]
Matthews Correlation Coefficient: 0.99279125563
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[2864   17]
 [   6  168]]
Matthews Correlation Coefficient: 0.93243196466
--------------------------------------------------------------


In [31]:
""" Random Forest """
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,  class_weight={0:1-fr, 1:fr}, max_depth=90, n_jobs=4)
%time clf = clf.fit(train_data, train_labels)
print_metrics(train_labels, clf.predict(train_data))
print_metrics(test_labels, clf.predict(test_data))

Wall time: 4.21 s
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[5000    0]
 [  20  980]]
Matthews Correlation Coefficient: 0.987975514641
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[2865   16]
 [  10  164]]
Matthews Correlation Coefficient: 0.922187284807
--------------------------------------------------------------


In [37]:
""" Gradient Boosting """
from sklearn.ensemble import GradientBoostingClassifier

losses = ['deviance', 'exponential']
clf = GradientBoostingClassifier(loss=losses[1], n_estimators=100,  max_depth=25, learning_rate=0.1)

%time clf = clf.fit(train_data, train_labels)
print_metrics(train_labels, clf.predict(train_data))
print_metrics(test_labels, clf.predict(test_data))

Wall time: 8.05 s
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[5000    0]
 [   9  991]]
Matthews Correlation Coefficient: 0.994595096042
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
Confusion Matrix:
[[2866   15]
 [   8  166]]
Matthews Correlation Coefficient: 0.931420506977
--------------------------------------------------------------


In [15]:
from scipy.stats import logistic

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', quality=100)
# from scipy.stats import poisson
# p = poisson.sf(np.arange(0,100),80, loc=-20)
# plt.plot(p)

total = 5000
p = logistic.pdf(np.arange(0,total), loc=0, scale=total/5.0)
# p -= np.min(p)
# p /= np.max(p)
p /= np.sum(p)
plt.grid(linestyle='dotted')
plt.plot(np.flip(p,0))
plt.ylabel("Probability")
plt.xlabel("Datapoint")


# s = np.random.choice(total, size=500, replace=False, p=p)
# x = np.zeros(1000)
# for i in s:
#     x[i] = 1
# plt.plot(x, 'ro')

<matplotlib.text.Text at 0xd61b6d8>

<matplotlib.figure.Figure at 0x15901048>

In [18]:
from collections import deque

d = deque([], maxlen=5)


d.append(np.asarray([1,2,3]))
print(d)
d.append(np.asarray([10,11,12]))
print(np.asarray(d))

deque([array([1, 2, 3])], maxlen=5)
[[ 1  2  3]
 [10 11 12]]
