# Initialization

Test notebook for the damadics benchmark. Approach using ANN (MLP). 

First we import the necessary packages and create the global variables.

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.font_manager
import matplotlib
import seaborn as sns
import pandas as pd
import logging
import random
import plottingTools
import TF_MLP
import utils
import tensorflow as tf
from datetime import datetime
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from mpl_toolkits.mplot3d import Axes3D
from dataManagement import DataManagerDamadics

from IPython.display import display, HTML
%matplotlib notebook

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("damadics")
logger.setLevel(logging.DEBUG)
handler = logging.FileHandler("DataManager.log")
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

random_seed = 0 #Change this to make it really random, 0 for testing purposes
random.seed(random_seed)

columnArrangement = ['id', 'selectedFault', 'faultType', 'faultIntensity', 'externalControllerOutput', 'disturbedMediumFlow', 
                        'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement']

desiredComponents = ["Valve"]
dataManager = DataManagerDamadics(user="readOnly", password="readOnly", engineType="mysql+mysqldb://", 
                                  dbName="damadics2", host="localhost", port="3306")

DEBUG:damadics.dataManagement:Connection to mysql+mysqldb://localhost:3306/damadics2 successfull


# Retrieve data

We retrieve the data needed for our models. We specify the start and end dates for the data we want to use.

In [2]:
startDateTime = datetime(2017, 12, 30, hour=0, minute=0, second=0, microsecond=0)
endDateTime = datetime(2018, 1, 28, hour=0, minute=0, second=0, microsecond=0)

train_size = 0.9
test_size = 1-train_size

#0 means normal and 1 means fault
X, y, df = dataManager.retrieve_and_reshape_data(startDateTime, endDateTime, desiredComponents, columnArrangement)

#Standardize the data
X_transformed = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, train_size = train_size, 
                                                    test_size = test_size, random_state=random_seed)

#Generate useful counts
n_m, n_x = X.shape #Number of samples, features
train_n_m, train_n_x = X_train.shape #Number of samples, features for training
test_n_m, test_n_x = X_test.shape #Number of samples, features for testing
fault_count = np.count_nonzero(y)
non_fault_count = n_m - fault_count
fault_nofault_Ratio = fault_count/non_fault_count

print('\nTotal sample size {}, Train Size {} ({}%), Test Size {} ({}%)'.format(n_m, X_train.shape[0], np.round(train_size*100),  
                                                                               X_test.shape[0], np.round(test_size*100)))

print('Total sample size {}, Faulty samples {}, Normal samples {}, Fault/Non Fault Ratio {:.4f}'.
      format(n_m, fault_count, non_fault_count, fault_nofault_Ratio))

DEBUG:damadics.dataManagement:Loading data for components [<class 'damadicsDBMapping.ValveReading'>] from 2017-12-30 00:00:00 to 2018-01-28 00:00:00



Total sample size 41121, Train Size 37008 (90.0%), Test Size 4113 (10.0%)
Total sample size 41121, Faulty samples 6256, Normal samples 34865, Fault/Non Fault Ratio 0.1794


# SKLearn implementation

Now that I have the data, i'll try various implementations. Lets start with SCIKIT

In [3]:
#using adam optimizer for applying minibatches
mlp_clf = MLPClassifier(hidden_layer_sizes=(10, 10, 5), activation='relu', solver='adam', alpha=1e-2, batch_size=512, 
                       learning_rate='constant', learning_rate_init=1e-2)

#Fit the MLP to the data.
y_train_sk = y_train.ravel()
y_test_sk = y_test.ravel()

mlp_clf.fit(X_train, y_train_sk)

print('Number of iterations: {}'.format(mlp_clf.n_iter_))
print(mlp_clf.loss_)

y_pred_train = mlp_clf.predict(X_train)
y_pred_test = mlp_clf.predict(X_test)

#Compute metrics
score_train = mlp_clf.score(X_train, y_train_sk)
score_test = mlp_clf.score(X_test, y_test_sk)
precision_train = precision_score(y_train_sk, y_pred_train)
precision_test = precision_score(y_test_sk, y_pred_test)
recall_train = recall_score(y_train_sk, y_pred_train)
recall_test = recall_score(y_test_sk, y_pred_test)
f1_train = f1_score(y_train_sk, y_pred_train)
f1_test = f1_score(y_test_sk, y_pred_test)

print("Mean train accuracy {}\t Mean test accuracy {}".format(score_train, score_test))
print("Precision (train) {}\t Precision (test) {}".format(precision_train, precision_test))
print("Recall (train) {}\t Recall (test) {}".format(recall_train, recall_test))
print("F1 (train) {}\t F1 (test) {}".format(f1_train, f1_test))




Number of iterations: 20
0.196303201482
Mean train accuracy 0.9296098140942499	 Mean test accuracy 0.9316800389010454
Precision (train) 0.9274992919852733	 Precision (test) 0.9312039312039312
Recall (train) 0.582325746799431	 Recall (test) 0.5996835443037974
F1 (train) 0.7154560349535773	 F1 (test) 0.7295476419634264


# TF implementation

Lets now try with TensorFlow

In [3]:
tf.reset_default_graph()
tfmlp_clf = TF_MLP.MLPClassifier(hidden_layer_sizes=(10,10,5))
X_tf = X.T
y_tf = y.T
#y_tf = tf.cast(y.T, tf.float32)
tfmlp_clf.fit(X_tf, y_tf)

print("Finished training")

#W1 = tf.get_variable("W1", [25, 12288], initializer = tf.contrib.layers.xavier_initializer(seed = 1))
#print(W1)

{}
{}
{'W1': <tf.Variable 'W1:0' shape=(10, 6) dtype=float32_ref>, 'W2': <tf.Variable 'W2:0' shape=(10, 10) dtype=float32_ref>, 'W3': <tf.Variable 'W3:0' shape=(5, 10) dtype=float32_ref>, 'W4': <tf.Variable 'W4:0' shape=(1, 5) dtype=float32_ref>}
{'b1': <tf.Variable 'b1:0' shape=(10, 1) dtype=float32_ref>, 'b2': <tf.Variable 'b2:0' shape=(10, 1) dtype=float32_ref>, 'b3': <tf.Variable 'b3:0' shape=(5, 1) dtype=float32_ref>, 'b4': <tf.Variable 'b4:0' shape=(1, 1) dtype=float32_ref>}
Cost after epoch 0: 0.494060
Cost after epoch 100: 0.260074
Finished training


In [4]:
X_tf = X.T
y_tf = y.T

print(X_tf.shape)
print(y_tf.shape)

minibatches = utils.random_mini_batches(X_tf, y_tf, 256, 1)

for minibatch in minibatches:
    (mini_X, mini_y) = minibatch
    print(mini_X.shape)
    print(mini_y.shape)

(6, 41121)
(1, 41121)
(6, 41121)
(1, 41121)
[[ 1.        1.        0.222181 ...,  0.794534  0.147993  0.876782]
 [ 0.87684   0.850576  0.849537 ...,  0.848818  0.847772  0.847832]
 [ 0.650832  0.643717  0.647472 ...,  0.644823  0.64161   0.643922]
 [ 0.214553  0.215373  0.214781 ...,  0.213406  0.216033  0.218695]
 [ 0.00159   0.369939  0.732446 ...,  0.47581   0.81031   0.458222]
 [ 0.5       0.366043  0.732444 ...,  0.257854  0.659356  0.484302]]
[[0 0 0 ..., 0 0 0]]
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 256)
(1, 256)
(6, 