In [1]:
%matplotlib notebook
#%matplotlib inline

import copy
import numpy as np
import random
import matplotlib.pyplot as plt
from _converter import SensorThings2Dict
from _evaluation import print_metrics

In [2]:
files = ["ABU1.txt", "ABU1.2.txt"]
data = []
bad = 0
for filename in files:
    print("Loading rows... {}".format(filename))
    with open(filename) as f:
        for line in f:
            try:
                features = SensorThings2Dict(json.loads(line))
                data.append(list(features.values()))
            except Exception, e:
                bad+=1

print("Incomplete rows: {}".format(bad))
print("Loaded: {}".format(len(data)))

""" random split seed """
data = np.asarray(data)
#print(data[:,2:-1])
# numerics = data[:,2:-1]
# for n in numerics[data[:,-1]=='True']:
#     plt.plot(n);
mask = np.random.rand(len(data)) < 0.9

Loading rows...
Incomplete rows: 170
Loaded: 30768


In [3]:
""" split into train and test sets """
train = data[mask]
test = data[~mask]
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))
print("Test  Total: {} Good: {} Faulty: {} Ratio: {}".format(len(test), len(test[test[:,-1]=='True']), len(test[test[:,-1]=='False']), float(len(test[test[:,-1]=='False']))/len(train)))

faulty = train[train[:,-1]=='False']
not_faulty = train[train[:,-1]=='True']

means = np.mean(not_faulty[:,2:-1].astype(np.float32), axis=0)

Train Total: 27615 Good: 26027 Faulty: 1588 Ratio: 0.057504979178
Test  Total: 3153 Good: 2956 Faulty: 197 Ratio: 0.00713380409198


In [4]:
""" down/up sample data """
# train = np.concatenate((not_faulty[:len(faulty)*5], faulty))
samples = np.random.choice(len(not_faulty), 5000, replace=False)
train = np.concatenate((not_faulty[samples], faulty))
print("Train Total: {} Good: {} Faulty: {} Ratio: {}".format(len(train), len(train[train[:,-1]=='True']), len(train[train[:,-1]=='False']), float(len(train[train[:,-1]=='False']))/len(train)))

train_data = train[:,2:-1].astype(np.float32)
test_data = test[:,2:-1].astype(np.float32)
"""
Quality_OK is mapped to Faultiness
    'False' -> 1 (Faulty)
    'True'  -> 0 (Good)
"""
train_labels = np.array(train[:,-1]=='False').astype(np.int32)
test_labels = np.array(test[:,-1]=='False').astype(np.int32)

Train Total: 6588 Good: 5000 Faulty: 1588 Ratio: 0.241044323012


In [26]:
""" Train for different feature spaces """
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ratio = float(len(train[train[:,-1]=='False']))/len(train)
criteria = ["gini", "entropy"]
clf = DecisionTreeClassifier(criterion=criteria[0], random_state=1, max_depth=100,  class_weight={0:1.0, 1:0.05})
# clf = RandomForestClassifier(n_estimators=10,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

preds = []
choices = np.arange(30,51+1, 5)
ensemples_per_feature_space = 1
for i,c in enumerate(choices):
    # train multiple trees for each f-space
    for _ in range(ensemples_per_feature_space):
#         r = np.random.choice(c, c*4/5, replace=False)
        r = np.arange(0,c)
        print(r.shape)
        clf.fit(train_data[:,r], train_labels)
        preds.append(clf.predict_proba(test_data[:,r]))
preds = np.asarray(preds)
print(preds.shape)

(51L,)
(1L, 3068L, 2L)


In [201]:
# calculate weight for each tree
weights = (choices/2)-14
print(np.sum(weights))
print(weights)
# take weighted average of predictions
# m = np.average(preds, axis=0, weights=np.repeat(weights, ensemples_per_feature_space, axis=0))
m = preds[-1]
predictions = (m[:,0]<m[:,1]).astype(np.int32)
print_metrics(test_labels, predictions)

29
[ 1  3  6  8 11]
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      2848
          1       0.91      0.97      0.94       201

avg / total       0.99      0.99      0.99      3049

Confusion Matrix:


Unnamed: 0,0,1
0,2828,20
1,6,195


Matthews Correlation Coefficient: 0.933538703002
--------------------------------------------------------------


In [137]:
# """ ExtraTreeClassifier """
# from sklearn.tree import ExtraTreeClassifier

# # ratio = float(len(train[train[:,-1]=='False']))/len(train)
# clf = ExtraTreeClassifier(max_depth=70)
# %time t = clf.fit(train_data, train_labels)

# print_metrics(train_labels, clf.predict(train_data))
# print_metrics(test_labels, clf.predict(test_data))

In [11]:
""" Random Forest """
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

%time clf = clf.fit(train_data, train_labels)
print_metrics(train_labels, clf.predict(train_data))
print_metrics(test_labels, clf.predict(test_data))

Wall time: 4.22 s
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      5000
          1       1.00      0.97      0.99      1588

avg / total       0.99      0.99      0.99      6588

Confusion Matrix:


Unnamed: 0,0,1
0,5000,0
1,42,1546


Matthews Correlation Coefficient: 0.982569043431
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      0.99      2956
          1       0.89      0.94      0.92       197

avg / total       0.99      0.99      0.99      3153

Confusion Matrix:


Unnamed: 0,0,1
0,2934,22
1,11,186


Matthews Correlation Coefficient: 0.913315396951
--------------------------------------------------------------


In [17]:
""" Fill remaining features with means """
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,  class_weight={0:1.0, 1:0.05}, max_depth=90, n_jobs=4)

%time clf = clf.fit(train_data, train_labels)
print_metrics(train_labels, clf.predict(train_data))
print_metrics(test_labels, clf.predict(test_data))

f = np.where(test_labels==0)[0]
partial_test = copy.deepcopy(test_data[f])

for r in partial_test:
    r[30:] = means[30:]
print_metrics(test_labels[f], clf.predict(partial_test))

Wall time: 4.16 s
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5000
          1       1.00      0.99      1.00      1605

avg / total       1.00      1.00      1.00      6605

Confusion Matrix:


Unnamed: 0,0,1
0,5000,0
1,15,1590


Matthews Correlation Coefficient: 0.993826510117
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      0.99      1.00      2877
          1       0.92      0.99      0.95       180

avg / total       0.99      0.99      0.99      3057

Confusion Matrix:


Unnamed: 0,0,1
0,2861,16
1,1,179


Matthews Correlation Coefficient: 0.952574641858
--------------------------------------------------------------
------------------------- EVALUATION -------------------------
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      2877
          1       0.00      0.00      0.00         0

avg / total       1.00      1.00      1.00      2877

Confusion Matrix:


Unnamed: 0,0,1
0,2868,9
1,0,0


Matthews Correlation Coefficient: 0.0
--------------------------------------------------------------


In [12]:
feature_data = copy.deepcopy(means)
line = '{"ResultValue": {"total": 7, "measurements": {"bn": "SMTLine/B202/P29063050/", "e": [{"v": 100, "u": "mm", "t": 1481017764434, "n": "ScreenPrinter/PositionX"}, {"v": 200.15, "u": "mm", "t": 1481017764450, "n": "ScreenPrinter/PositionY"}, {"v": 42.92, "u": "mm", "t": 1481017776092, "n": "PasteInspection/PosX1"}, {"v": 21.95, "u": "mm", "t": 1481017776108, "n": "PasteInspection/PosY1"}, {"v": 31.84, "u": "mm", "t": 1481017776109, "n": "PasteInspection/PosX2"}, {"v": 54.96, "u": "mm", "t": 1481017776109, "n": "PasteInspection/PosY2"}, {"v": 20.92, "u": "mm", "t": 1481017776109, "n": "PasteInspection/PosX3"}]}, "type": {"bt": 2297529600, "bn": "SMTLine/B202/P29063050/", "e": [{"t": 1481017712576, "sv": "ABU2", "n": "Source/ProdType"}]}}}'
features = SensorThings2Dict(line, complete=False)
# convert measurements to numpy array
r = np.asarray(features.values())[2:-1].astype(np.float32)
# fill nans with global means
w = np.where(np.isnan(r))
r[w] = means[w]
clf.predict(r)



array([1])

In [23]:
"ScreenPrinter/PositionX".partition('/')[0]

'ScreenPrinter'