# Initialization

Test notebook for the damadics benchmark. Approach using anomaly detection techniques. 

First we import the necessary packages and create the global variables.

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import logging
import random
import plottingTools
import sys
import datetime
import graphviz 
import datetime

sys.path.append('/media/controlslab/DATA/Projects')
#sys.path.append('/Users/davidlaredorazo/Documents/University_of_California/Research/Projects')

from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score

from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Activation

from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns; sns.set(font_scale=1.2)

from ann_framework.data_handlers.data_handler_DAMADICS import DamadicsDataHandler

#Import the tunable model classes
from ann_framework.tunable_model.tunable_model import SequenceTunableModelClassification


#from IPython.display import display, HTML
%matplotlib notebook

Using TensorFlow backend.


## Setup some options for the test

In [2]:
random_seed = 0 #Change this to make it really random, 0 for testing purposes
random.seed(random_seed)

y_trains = {'DummyClf':None, 'EllipticEnvelope':list(), 'MLP':None}
y_tests = {'DummyClf':None, 'EllipticEnvelope':list(), 'MLP':None}

pd.options.mode.chained_assignment = None
nsamples = 1000
scoringMetrics = ['precision_macro', 'recall_macro', 'f1_macro']
cv_folds = 4

## Models

In [3]:
tree_clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10)

# Create data hanlder and load the data

In [4]:
start_date_test = datetime.datetime(2018, 2, 14, 18, 59, 20) # ValveReadingsTest, testing
start_date_training = datetime.datetime(2019, 6, 14, 17, 6, 41) # ValveReadings, trainning
time_delta = datetime.timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=1, hours=0, weeks=0)

n = 200000

end_date_training = start_date_training + n*time_delta #get the first n instances
end_date_test = start_date_test + n*time_delta #get the first n instances

print(start_date_training)
print(end_date_training)

features = ['externalControllerOutput', 'undisturbedMediumFlow', 'pressureValveInlet', 
            'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement', 'disturbedMediumFlow', 
           'selectedFault', 'faultType', 'faultIntensity']

selected_indices = np.array([1,3,4,5,6])
selected_features = list(features[i] for i in selected_indices-1)
print(selected_features)

2019-06-14 17:06:41
2019-10-31 14:26:41
['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement']


In [5]:
#Does not work for sequence sizes larger than 1 given the way I'm generating the test data. 
#Need to properly define what the test data is going to be like.
window_size = 1
window_stride = 1

dHandlder_valve_tree = DamadicsDataHandler(selected_features, window_size, window_stride,
                                      start_date_training=start_date_training, end_date_training=end_date_training,
                                      start_date_test=start_date_test, end_date_test=end_date_test,
                                      binary_classes=True, one_hot_encode=False, samples_per_run=50)
dHandlder_valve_tree.connect_to_db('readOnly', '_readOnly2019', '169.236.181.40', 'damadics')

Connection to mysql+mysqldb://readOnly:_readOnly2019@169.236.181.40/damadics successfull


In [6]:
#scaler = MinMaxScaler(feature_range=(0, 1))

tModel = SequenceTunableModelClassification('Damadics_Tree_SK', tree_clf, lib_type='scikit', 
                                            data_handler=dHandlder_valve_tree)

#tModel.data_scaler = scaler

tModel.load_data(unroll=True, verbose=1, cross_validation_ratio=0.5, shuffle_samples=True)
tModel.print_data(print_top=True)

Loading data for the first time
Reloading data due to parameter change
Loading training data for DAMADICS with window_size of 1, stride of 1. Cros-Validation ratio 0.5
Loading data from database
Reading data from ValveReading
2019-06-14 17:06:41
2019-10-31 14:26:41
Extracting data from database runtime: 0:00:01.364530
Data Splitting: 0:00:00.000093
Loading test data for DAMADICS with window_size of 1, stride of 1
Loading data from database
Reading data from ValveReadingTest
Extracting data from database runtime: 0:00:01.235509
Printing shapes

Training data (X, y)
(94033, 5)
(94033, 1)
Cross-Validation data (X, y)
(3533, 5)
(3533, 1)
Testing data (X, y)
(7771, 5)
(7771, 1)
Printing first 5 elements

Training data (X, y)
[[0.257854 0.848348 0.653717 0.214572 0.497933]
 [0.257854 0.848305 0.656842 0.213451 0.510227]
 [0.484302 0.85034  0.640007 0.216292 0.455605]
 [0.484302 0.849299 0.647105 0.214206 0.458444]
 [0.257854 0.850806 0.649526 0.216032 0.476276]]
[[-1.]
 [-1.]
 [-1.]
 [-1.]
 

In [None]:
"""

training_set = np.concatenate((tModel.X_train, tModel.y_train), axis=1)
cv_set = np.concatenate((tModel.X_crossVal, tModel.y_crossVal), axis=1)
test_set = np.concatenate((tModel.X_test, tModel.y_test), axis=1)

np.savetxt("training_set.csv", training_set, delimiter=",")
np.savetxt("cv_set.csv", cv_set, delimiter=",")
np.savetxt("test_set.csv", test_set, delimiter=",")

"""

# Perform classification using sklearn

In [7]:
tModel.train_model(verbose=1)

In [8]:
tModel.predict_model(cross_validation=True)
tModel.evaluate_model(cross_validation=True)

print(tModel.y_predicted.shape)

predicted = tModel.y_predicted
test = tModel.y_crossVal
test = np.ravel(test)

print("On test set")
print(accuracy_score(test, predicted))

clf = tModel.model
y_train_pred = clf.predict(tModel.X_train)

train = tModel.y_train
train = np.ravel(train)

print("On train set")
print(accuracy_score(tModel.y_train, y_train_pred))

(3533,)
On test set
0.6340220775544863
On train set
0.8356534408133315


In [9]:
feature_names = features[:1] + features[2:6]
print(feature_names)

dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=feature_names,  
                                class_names=["normal", "fault"],  
                                filled=True, rounded=True, special_characters=True)  
graph = graphviz.Source(dot_data)  
#graph 

graph.render('decision_tree_damadics', view=True)  

['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement']


'decision_tree_damadics.pdf'

## Train with time window

Use the mean of the time window to do the training.

In [10]:
window_size = 10
window_stride = 1

dHandler_valve_window = DamadicsDataHandler(selected_features, window_size, window_stride,
                                      start_date_training=start_date_training, end_date_training=end_date_training,
                                      start_date_test=start_date_test, end_date_test=end_date_test,
                                      binary_classes=True, one_hot_encode=False, samples_per_run=50)
dHandler_valve_window.connect_to_db('readOnly', '_readOnly2019', '169.236.181.40', 'damadics')

tModel_window = SequenceTunableModelClassification('Damadics_Tree_SK', tree_clf, lib_type='scikit', 
                                            data_handler=dHandler_valve_window)

#tModel.data_scaler = scaler

tModel_window.load_data(unroll=False, verbose=1, cross_validation_ratio=0.5, shuffle_samples=True)
tModel_window.print_data(print_top=True)

Connection to mysql+mysqldb://readOnly:_readOnly2019@169.236.181.40/damadics successfull
Loading data for the first time
Reloading data due to parameter change
Loading training data for DAMADICS with window_size of 10, stride of 1. Cros-Validation ratio 0.5
Loading data from database
Reading data from ValveReading
2019-06-14 17:06:41
2019-10-31 14:26:41
Extracting data from database runtime: 0:00:01.361327
Data Splitting: 0:00:00.000096
Loading test data for DAMADICS with window_size of 10, stride of 1
Loading data from database
Reading data from ValveReadingTest
Extracting data from database runtime: 0:00:01.236513
Printing shapes

Training data (X, y)
(108682, 10, 5)
(108682, 1)
Cross-Validation data (X, y)
(3239, 10, 5)
(3239, 1)
Testing data (X, y)
(7424, 10, 5)
(7424, 1)
Printing first 5 elements

Training data (X, y)
[[[0.732444 0.849492 0.64463  0.214594 0.718367]
  [0.257854 0.851315 0.642411 0.213998 0.466731]
  [0.659356 0.849536 0.646299 0.215309 0.810019]
  [0.484302 0.8453

In [11]:
cols = ['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 
        'rodDisplacement', 'disturbedMediumFlow']

cols = ['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 
        'rodDisplacement']

cols_fault = cols.copy()
cols_fault.append('Fault')

X_training = tModel_window.X_train
X_training = np.array([np.mean(time_window, axis=0) for time_window in X_training])

X_crossVal = tModel_window.X_crossVal
X_crossVal = np.array([np.mean(time_window, axis=0) for time_window in X_crossVal])

X_test = tModel_window.X_test
X_test = np.array([np.mean(time_window, axis=0) for time_window in X_test])

In [12]:
print(X_training.shape)
print(X_training)

(108682, 5)
[[0.4999998 0.8499483 0.6501955 0.2146905 0.6338918]
 [0.5366399 0.8496544 0.6483056 0.2157048 0.5698853]
 [0.4999998 0.8488884 0.6501989 0.2150332 0.6220414]
 ...
 [0.4999998 0.8488954 0.6497897 0.214843  0.4942801]
 [0.4999998 0.8490379 0.6500566 0.215473  0.6337489]
 [0.4999998 0.8491946 0.6493546 0.2148851 0.6331478]]


In [13]:
tModel_window.X_train = X_training
tModel_window.X_crossVal = X_crossVal
tModel_window.X_test = X_test

In [14]:
tModel_window.train_model(verbose=1)

In [15]:
tModel_window.predict_model(cross_validation=True)
tModel_window.evaluate_model(cross_validation=True)

print(tModel_window.y_predicted.shape)

predicted = tModel_window.y_predicted
test = tModel_window.y_crossVal
test = np.ravel(test)

print("On test set")
print(accuracy_score(test, predicted))

clf = tModel_window.model
y_train_pred = clf.predict(tModel_window.X_train)

train = tModel_window.y_train
train = np.ravel(train)

print("On train set")
print(accuracy_score(tModel_window.y_train, y_train_pred))

(3239,)
On test set
0.6816918802099413
On train set
0.7827883182127675


## NN with window

In [16]:
window_size = 10
window_stride = 1
input_shape = (window_size * len(selected_features), )
# print(input_shape)

In [23]:
def model_m():
    model = Sequential()
    model.add(Dense(units=128, activation='sigmoid', input_shape = (5, )))
    model.add(Dense(units=32, activation='sigmoid'))
    model.add(Dense(units=8, activation='sigmoid'))
    model.add(Dense(units=2))
    
    return model

In [24]:
dHandler_valve_nn = DamadicsDataHandler(selected_features, window_size, window_stride,
                                      start_date_training=start_date_training, end_date_training=end_date_training,
                                      start_date_test=start_date_test, end_date_test=end_date_test,
                                      binary_classes=True, one_hot_encode=True, samples_per_run=50)
dHandler_valve_nn.connect_to_db('readOnly', '_readOnly2019', '169.236.181.40', 'damadics')

model = model_m()
optimizer = Adam(lr=0.001, beta_1=0.5)
model.compile(loss='categorical_crossentropy',optimizer=optimizer, metrics=['accuracy'])

tModel_nn = SequenceTunableModelClassification('Damadics_nn', model, lib_type='keras', 
                                            data_handler=dHandler_valve_nn)

#tModel.data_scaler = scaler

tModel_nn.load_data(unroll=False, verbose=1, cross_validation_ratio=0.5, shuffle_samples=True)
tModel_nn.print_data(print_top=True)
            

Connection to mysql+mysqldb://readOnly:_readOnly2019@169.236.181.40/damadics successfull
Loading data for the first time
Reloading data due to parameter change
Loading training data for DAMADICS with window_size of 10, stride of 1. Cros-Validation ratio 0.5
Loading data from database
Reading data from ValveReading
2019-06-14 17:06:41
2019-10-31 14:26:41
Extracting data from database runtime: 0:00:01.364161
Data Splitting: 0:00:00.000100
Loading test data for DAMADICS with window_size of 10, stride of 1
Loading data from database
Reading data from ValveReadingTest
Extracting data from database runtime: 0:00:01.248162
Printing shapes

Training data (X, y)
(96415, 10, 5)
(96415, 2)
Cross-Validation data (X, y)
(3276, 10, 5)
(3276, 2)
Testing data (X, y)
(7400, 10, 5)
(7400, 2)
Printing first 5 elements

Training data (X, y)
[[[0.366043 0.849679 0.650034 0.215014 0.668619]
  [0.732444 0.848966 0.652152 0.215697 0.743013]
  [0.257854 0.850063 0.655861 0.216233 0.50106 ]
  [0.659356 0.850119

In [25]:
cols = ['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 
        'rodDisplacement', 'disturbedMediumFlow']

cols = ['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 
        'rodDisplacement']

cols_fault = cols.copy()
cols_fault.append('Fault')

X_training = tModel_nn.X_train
X_training = np.array([np.mean(time_window, axis=0) for time_window in X_training])

X_crossVal = tModel_nn.X_crossVal
X_crossVal = np.array([np.mean(time_window, axis=0) for time_window in X_crossVal])

X_test = tModel_nn.X_test
X_test = np.array([np.mean(time_window, axis=0) for time_window in X_test])

In [26]:
tModel_nn.X_train = X_training
tModel_nn.X_crossVal = X_crossVal
tModel_nn.X_test = X_test

print(X_training.shape)

(96415, 5)


In [29]:
tModel_nn.train_model(verbose=1, epochs=10)

training with cv
Train on 96415 samples, validate on 3276 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250


Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250


Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250


Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250


Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


In [30]:
tModel_nn.predict_model(cross_validation=False)
tModel_nn.evaluate_model(cross_validation=False)

print(tModel_nn.y_predicted.shape)
print(tModel_nn.scores)
# predicted = tModel_nn.y_predicted
# test = tModel_nn.y_crossVal
# test = np.ravel(test)

# print("On test set")
# print(accuracy_score(test, predicted))

# clf = tModel_nn.model
# y_train_pred = clf.predict(tModel_nn.X_train)

# train = tModel_nn.y_train
# train = np.ravel(train)

# print("On train set")
# print(accuracy_score(tModel_nn.y_train, y_train_pred))

(7400, 2)
{'loss': 5.996367162756018, 'score_1': 0.6201351351351352}
