In [1]:
import numpy as np
import pandas as pd
import math
from datetime import datetime
import CMAPSAuxFunctions

from data_handler_VALVE import ValveDataHandler
from tunable_model import SequenceTunableModelRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, Reshape, Conv2D, Flatten, MaxPooling2D
from keras.optimizers import Adam, SGD
from keras.callbacks import LearningRateScheduler
from keras import backend as K
from keras import regularizers

Using TensorFlow backend.


<h1> Create Data Handler </h1>

In [2]:
features = ['timestamp', 'externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement', 'disturbedMediumFlow', 'selectedFault', 'faultType', 'faultIntensity']
selected_indices = np.array([2,3,4,5,6,7])
selected_features = list(features[i] for i in selected_indices-1)

window_size = 30
window_stride = 1

# min = 2018-02-14 18:59:20
# max = 2018-08-19 18:28:20
time_start = "2018-02-14 18:59:20"
time_end = "2018-04-19 18:28:20"

# Either anomaly, classification or regression
problem = 'anomaly'
#problem = 'classification'
#problem = 'regression'

vHandler = ValveDataHandler(time_start, time_end, selected_features = selected_features,
                            sequence_length = window_size, sequence_stride = window_stride,
                            problem = problem)

init


In [3]:
print(selected_features)

['externalControllerOutput', 'pressureValveInlet', 'pressureValveOutlet', 'mediumTemperature', 'rodDisplacement', 'disturbedMediumFlow']


<h1> Keras Model </h1>

In [4]:
#np.set_printoptions(threshold=np.nan)

K.clear_session()
lambda_regularization = 0.20

def create_ANN_model(input_shape, problem):
    
    #Create a sequential model
    model = Sequential()
    
    #Add the layers for the model
    model.add(Dense(20, input_dim = input_shape, activation = 'relu', kernel_initializer = 'glorot_normal', 
                    kernel_regularizer = regularizers.l2(lambda_regularization), name = 'fc1'))
    
    if (problem == 'classification'):
        model.add(Dense(20, activation = 'softmax', name = 'out'))
    elif (problem == 'regression'):
        model.add(Dense(1, activation = 'linear', name = 'out'))
        
    #model.add(Dense(1, activation = 'softmax', name = 'out'))
    
    return model

<h1>Tunable Model </h1>

In [5]:
#scaler = MinMaxScaler(feature_range = (-1, 1))
scaler = StandardScaler()

In [6]:
def get_compiled_model(shape, problem):
    
    K.clear_session()
    
    if (problem == 'classification' or problem == 'anomaly'):
        # Parameters for the model
        # Default:  optimizer = SGD(lr = 0.01, momentum = 0.0, decay = 0.0, nesterov = False)
        optimizer = SGD(lr = 0.01, momentum = 0.0, decay = 0.0, nesterov = False)
        loss_function = 'categorical_crossentropy'
        metrics = ['accuracy']
    elif (problem == 'regression'):
        # Parameters for the model
        # Default: optimizer = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = None, decay = 0.0, amsgrad = False)
        optimizer = Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = None, decay = 0.0, amsgrad = False)
        loss_function = 'mean_squared_error'
        metrics = ['mse']
        
    model = None
    
    # Create and compile the model
    model = create_ANN_model(shape, problem)
    model.compile(optimizer = optimizer, loss = loss_function, metrics = metrics)
    
    return model

In [7]:
num_features = len(selected_features)
input_shape = num_features * window_size

model = get_compiled_model(input_shape, problem)
tModel = SequenceTunableModelRegression('ANN_Model', model, lib_type = 'keras', data_handler = vHandler, data_scaler = scaler)

<h1> Loading Data from MySQL Database </h1>

In [8]:
#vHandler.connect_to_db("remoteAdmin","remoteAdmin","169.236.181.40:3306","damadics")

In [9]:
# Extract data from database
#vHandler.extract_data_from_db()
# vHandler.extract_data_from_db()

<h1> Loading Data from Local .csv File </h1>

In [10]:
# Extract data from .csv file. If loading from .csv, don't forget to uncomment .extract_data_from_db() in data_handler_VALVE'''
vHandler._df = pd.read_csv('valve_dataset.csv', sep = ',')

vHandler._df['status'] = vHandler._df['selectedFault'].apply(lambda valve: 0 if valve == 20 else 1)

vHandler._X = vHandler._df[selected_features].values
vHandler._y = vHandler._df['status'].values

In [11]:
print(vHandler._X)
print(vHandler._y)

[[5.00000e-01 8.76840e-01 6.50832e-01 2.14553e-01 1.59000e-03 1.00000e+00]
 [3.66043e-01 8.50576e-01 6.43717e-01 2.15373e-01 3.69939e-01 1.00000e+00]
 [7.32444e-01 8.49537e-01 6.47472e-01 2.14781e-01 7.32446e-01 2.22181e-01]
 ...
 [7.32444e-01 8.47251e-01 6.43362e-01 2.14299e-01 1.00000e+00 0.00000e+00]
 [2.57854e-01 8.49342e-01 6.43664e-01 2.15743e-01 1.00000e+00 3.34000e-04]
 [6.59356e-01 8.49701e-01 6.43186e-01 2.17503e-01 1.00000e+00 0.00000e+00]]
[0 0 0 ... 1 1 1]


In [12]:
#vHandler.load_data(cross_validation_ratio = 0.3, test_ratio = 0.2, unroll = True)
tModel.load_data(unroll = True, verbose = 0, cross_validation_ratio = 0.3, test_ratio = 0.05)

Loading data from database
Splitting into samples: 0:00:01.016283
counter: 130
Splitting into samples: 0:00:01.196913
counter: 130
Number of defective valves in cross-validation set: 9 out of 39.
Number of defective valves in test set: 3 out of 6.

Train, cv, and test splitting: 0:00:00.000700
Sequence length 30
Sequence stride 1
X_train len 85
X_crossVal len 39
X_test len 6
y_train len 85
y_crossVal len 39
y_test len 6
X_train[0]
(1810, 6)
[[7.32444e-01 8.47430e-01 6.53217e-01 2.15560e-01 9.99679e-01 1.64200e-03]
 [2.57854e-01 8.46583e-01 6.50053e-01 2.16170e-01 9.98971e-01 5.00000e-04]
 [6.59356e-01 8.48463e-01 6.43613e-01 2.16509e-01 9.99647e-01 0.00000e+00]
 ...
 [2.57854e-01 8.52949e-01 6.50309e-01 2.16388e-01 9.97628e-01 0.00000e+00]
 [6.59356e-01 8.47833e-01 6.48353e-01 2.14150e-01 1.00000e+00 1.06800e-03]
 [4.84302e-01 8.48866e-01 6.42545e-01 2.16078e-01 1.00000e+00 0.00000e+00]]
X_crossVal[0]
(30, 6)
[[3.66043e-01 8.50339e-01 6.54880e-01 2.15307e-01 9.96987e-01 0.00000e+00]
 [

In [13]:
vHandler.print_data(True)

Printing shapes

Training data (X, y)
(163361, 180)
(163361, 1)
Cross-Validation data (X, y)
(39, 180)
(39, 1)
Testing data (X, y)
(6, 180)
(6, 1)
Printing first 5 elements

Training data (X, y)
[[7.32444e-01 8.47430e-01 6.53217e-01 2.15560e-01 9.99679e-01 1.64200e-03
  2.57854e-01 8.46583e-01 6.50053e-01 2.16170e-01 9.98971e-01 5.00000e-04
  6.59356e-01 8.48463e-01 6.43613e-01 2.16509e-01 9.99647e-01 0.00000e+00
  4.84302e-01 8.48463e-01 6.42413e-01 2.14941e-01 9.97689e-01 0.00000e+00
  3.66043e-01 8.49465e-01 6.43234e-01 2.16185e-01 1.00000e+00 0.00000e+00
  7.32444e-01 8.52317e-01 6.48973e-01 2.13273e-01 1.00000e+00 0.00000e+00
  2.57854e-01 8.51079e-01 6.51108e-01 2.17324e-01 1.00000e+00 0.00000e+00
  6.59356e-01 8.47874e-01 6.54561e-01 2.15345e-01 9.99918e-01 2.05700e-03
  4.84302e-01 8.49620e-01 6.55428e-01 2.16342e-01 9.99602e-01 2.74400e-03
  3.66043e-01 8.48794e-01 6.54442e-01 2.12651e-01 9.99141e-01 0.00000e+00
  7.32444e-01 8.48438e-01 6.53831e-01 2.14624e-01 9.97022e-01 1.8

In [14]:
y_anomaly_test = []
y_anomaly_crossVal = []
for i in tModel._y_test:
    if (i == 1):
        y_anomaly_test.append(-1)
    else:
        y_anomaly_test.append(1)
        
for i in tModel._y_crossVal:
    if (i == 1):
        y_anomaly_crossVal.append(-1)
    else:
        y_anomaly_crossVal.append(1)

In [15]:
print(np.ravel(tModel._y_crossVal))
print(np.ravel(tModel._y_test))

[0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 1. 0. 1.]


In [16]:
print(y_anomaly_crossVal)
print(y_anomaly_test)

[1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, -1, -1, 1, -1]


<h1> Anomaly Detection Algorithm Testing </h1>

In [17]:
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

In [18]:
# Isolation Forest Default
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
# iso_forest = IsolationForest(behaviour = 'new', bootstrap = False, contamination = 'legacy',
#                              max_features = 1.0, max_samples = 'auto', n_estimators = 100,
#                              n_jobs = None, random_state = None, verbose = 0)

# Elliptic Envelope Default: 
# https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html
# EE = EE = EllipticEnvelope(store_precision = True, assume_centered = False, support_fraction = 0.5,
#                       contamination = 0.1, random_state = None)

In [19]:
algorithms = list()

iso_forest = IsolationForest(behaviour = 'new', bootstrap = False, contamination = 'auto',
                             max_features = 1.0, max_samples = 'auto', n_estimators = 100,
                             n_jobs = None, random_state = None, verbose = 0)
algorithms.append(iso_forest)


EE = EllipticEnvelope(store_precision = True, assume_centered = False, support_fraction = 0.5,
                      contamination = 0.1, random_state = None)
algorithms.append(EE)

In [20]:
crossVal_data, test_data = list(), list()
results = list()
for algo in algorithms:
    
    start_time = datetime.now()
    algo.fit(tModel.data_handler._X_train)
    print('Time Elapsed: {}'.format(datetime.now()- start_time))
    
    y_pred_crossVal = algo.predict(tModel.data_handler._X_crossVal)
    y_pred_test = algo.predict(tModel.data_handler._X_test)
    
    results.append({
        'Cross Validation': y_pred_crossVal,
        'Test Predictions': y_pred_test
    })
    
    crossVal_data.append(({
        'Accuracy': metrics.accuracy_score(y_anomaly_crossVal, y_pred_crossVal),
        'Precision': metrics.precision_score(y_anomaly_crossVal, y_pred_crossVal),
        'Recall': metrics.recall_score(y_anomaly_crossVal, y_pred_crossVal),
    }))
    test_data.append(({
        'Accuracy': metrics.accuracy_score(y_anomaly_test, y_pred_test),
        'Precision': metrics.precision_score(y_anomaly_test, y_pred_test),
        'Recall': metrics.recall_score(y_anomaly_test, y_pred_test)
    }))
    
    print(len(crossVal_data))
    
    print('Algorithm Done')

Time Elapsed: 0:00:45.661795
1
Algorithm Done




Time Elapsed: 0:02:26.006000
2
Algorithm Done


<h1> Evaluation Metrics </h1>

In [21]:
crossVal_results = pd.DataFrame(data = crossVal_data, columns = ['Accuracy', 'Precision', 'Recall'],
                       index = ['Isolation Forest', 'Elliptic Envelope'])
test_results = pd.DataFrame(data = test_data, columns = ['Accuracy', 'Precision', 'Recall'],
                       index = ['Isolation Forest', 'Elliptic Envelope'])

In [23]:
print(crossVal_results)

                   Accuracy  Precision    Recall
Isolation Forest   0.717949   0.787879  0.866667
Elliptic Envelope  0.743590   0.794118  0.900000


In [24]:
print(test_results)

                   Accuracy  Precision  Recall
Isolation Forest        0.5        0.5     1.0
Elliptic Envelope       0.5        0.5     1.0


In [25]:
true_positives, true_negatives, false_positives, false_negatives = 0, 0, 0, 0
for i in range(len(y_anomaly_crossVal)):
    if (results[0]['Cross Validation'][i] == 1):
        if (results[0]['Cross Validation'][i] == y_anomaly_crossVal[i]):
            true_positives += 1
        else:
            false_positives += 1
    else:
        if (results[0]['Cross Validation'][i] == y_anomaly_crossVal[i]):
            true_negatives += 1
        else:
            false_negatives += 1
            
print('True Positives: {}'.format(true_positives))
print('True Negatives: {}'.format(true_negatives))
print('False Positives: {}'.format(false_positives))
print('False Negatives: {}'.format(false_negatives))

True Positives: 26
True Negatives: 2
False Positives: 7
False Negatives: 4


In [26]:
# Specificity (true negative rate): measures the proportion of actual
# negatives that are correctly identified as such
print(true_negatives / (true_negatives + false_negatives))

0.3333333333333333
