# Predicción de anomalías

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
import pickle
import os

#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

from keras.layers.core import Dense, Dropout
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD

from sklearn.metrics import classification_report

Using TensorFlow backend.


## Carga y preparación del dataset

In [2]:
# Load the balanced, extended classification dataset from its pickle file.
dataset_path = './Anomaly Prediction/cls_balanced_extended_dataset.pkl'
df = pd.read_pickle(dataset_path)

In [3]:
# Some of the computed 'Roll_Std' values are NaN; replace every missing value with 0.
df = df.fillna(0)

# Columns described by the original author as having values outside [0, 1].
# NOTE(review): the original cell selected every column except these four but
# never assigned the result, so nothing was actually removed. That discarded
# expression has been dropped; the four columns still remain in `df` (and
# therefore in `df_all` / `df_filtered` unless their std filters them out),
# exactly as before. TODO confirm whether they should be excluded before
# modelling; doing so would change the dataset shapes and the trained models.
features = ['Day', 'Month', 'Year', 'DeviceID']

# Collect the columns whose standard deviation is exactly 0 (no_std) or below
# 0.01 (low_std); such features carry little or no signal for modelling.
no_std, low_std = {}, {}
for column in df.columns:
    try:
        # Compute the standard deviation once per column instead of once per use.
        column_std = df[column].std()

        if column_std == 0:
            no_std[column] = column_std

        if column_std < 0.01:
            low_std[column] = column_std
    except Exception:
        # Best effort: a column whose standard deviation cannot be computed or
        # compared is simply kept. Unlike a bare `except`, this does not swallow
        # KeyboardInterrupt / SystemExit.
        continue

# Two datasets: one with every feature except the zero-variance ones, and one
# restricted to features whose standard deviation is at least 0.01.
df_all = df.drop(columns=list(no_std))
df_filtered = df.drop(columns=list(low_std))

In [4]:
# Persist the dataset that keeps every non-constant feature, then show its shape.
ready_all_path = './Anomaly Prediction/dataset_ready_MLP_1.pkl'
df_all.to_pickle(ready_all_path)
df_all.shape

(117440, 997)

In [5]:
# Persist the dataset restricted to the higher-variance features, then show its shape.
ready_filtered_path = './Anomaly Prediction/dataset_ready_MLP_2.pkl'
df_filtered.to_pickle(ready_filtered_path)
df_filtered.shape

(117440, 338)

## Clasificación mediante MLP: MultiLayer Perceptron (todas las features)

In [6]:
# Feature matrix: every column of df_all except the target column 'Problem'.
input_columns = [name for name in df_all.columns if name != 'Problem']
X = df_all[input_columns].values
y = df_all['Problem'].values

# Stratified, shuffled 75/25 train/test split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2019,
    shuffle=True,
    stratify=y,
)

# Seed NumPy's global generator before the network is built and trained.
seed = 2019
np.random.seed(seed)

# Two hidden ReLU layers as wide as the input, each followed by batch
# normalisation and 50% dropout, ending in a single sigmoid output unit.
n_inputs = X_train.shape[1]
model = Sequential([
    Dense(n_inputs, input_dim=n_inputs, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(n_inputs, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# SGD with Nesterov momentum and a small learning-rate decay.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(
    loss='binary_crossentropy',
    optimizer=sgd,
    metrics=['accuracy'],
)

# Train for 15 epochs, holding out 5% of the training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=128,
    validation_split=0.05,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 83676 samples, validate on 4404 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x21881eda198>

In [14]:
# Save the trained model.
# The file is opened in a `with` block so the handle is flushed and closed even
# if pickling fails; the original passed a bare open(...) that was never closed.
# NOTE(review): Keras also offers `model.save(...)` for serialisation; pickle is
# kept here so anything that already reads this .sav file keeps working.
with open('./Anomaly Prediction/MLP/MLP_model_1.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

## Clasificación mediante MLP: MultiLayer Perceptron (features filtradas)

In [15]:
# Feature matrix: every column of df_filtered except the target column 'Problem'.
input_columns = [name for name in df_filtered.columns if name != 'Problem']
X = df_filtered[input_columns].values
y = df_filtered['Problem'].values

# Stratified, shuffled 75/25 train/test split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2019,
    shuffle=True,
    stratify=y,
)

# Seed NumPy's global generator before the network is built and trained.
seed = 2019
np.random.seed(seed)

# Two hidden ReLU layers as wide as the input, each followed by batch
# normalisation and 50% dropout, ending in a single sigmoid output unit.
n_inputs = X_train.shape[1]
model = Sequential([
    Dense(n_inputs, input_dim=n_inputs, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(n_inputs, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# SGD with Nesterov momentum and a small learning-rate decay.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(
    loss='binary_crossentropy',
    optimizer=sgd,
    metrics=['accuracy'],
)

# Train for 15 epochs, holding out 5% of the training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=128,
    validation_split=0.05,
)

Train on 83676 samples, validate on 4404 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x218891bc668>

In [16]:
# Save the trained model.
# The file is opened in a `with` block so the handle is flushed and closed even
# if pickling fails; the original passed a bare open(...) that was never closed.
# NOTE(review): Keras also offers `model.save(...)` for serialisation; pickle is
# kept here so anything that already reads this .sav file keeps working.
with open('./Anomaly Prediction/MLP/MLP_model_2.sav', 'wb') as model_file:
    pickle.dump(model, model_file)