# Predicción de anomalías

In [1]:
# Numerical / plotting / tabular stack used throughout the notebook.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# Silences every warning for the rest of the session (no category filter is given),
# so deprecation and convergence warnings from later cells will not be shown.
warnings.simplefilter(action='ignore')

# NOTE(review): none of the keras names below are used in the cells visible here —
# confirm they are still needed before keeping the TensorFlow dependency.
from keras.layers.core import Dense 
from keras.models import Model, Sequential
from keras import regularizers

Using TensorFlow backend.


## Configuración del dataset

#### Balance adecuado entre las dos clases

In [2]:
# NOTE(review): pickles execute code on load — only read this file from a trusted source.
df = pd.read_pickle('sampledata2.pkl')

# 'Problem' feature: every row with 'ProblemReported' == 1 is flagged together with the
# 2 rows that precede it for the same device.
# NOTE(review): this presumes each device's rows are stored in chronological order — confirm.
for device in df.DeviceID.unique():
    device_rows = df.DeviceID == device                 # boolean mask reused for read and write
    reported = df.loc[device_rows, 'ProblemReported'].values
    flags = np.zeros(len(reported))

    # For each reported position, mark a 3-row window that ends on that position.
    for pos in np.flatnonzero(reported == 1):
        flags[max(pos - 2, 0):pos + 1] = 1              # clamp at 0 so the window never wraps around

    df.loc[device_rows, 'Problem'] = flags

# Split the rows by label for the class-balancing step.
df_problem_reported = df[df.Problem == 1]
df_no_problem_reported = df[df.Problem == 0]

#### Selección de features

In [3]:
# Base columns: the calendar fields plus the 'Problem' label.
features = ['Day', 'Month', 'Year', 'Problem']

# Append, in column order, every column whose name contains 'Error' and is exactly
# 15 characters long, followed by every column whose name contains 'Grouped'.
error_columns = [column for column in df.columns if 'Error' in column and len(column) == 15]
grouped_columns = [column for column in df.columns if 'Grouped' in column]
features += error_columns
features += grouped_columns

#### B) Dataset para predicción de anomalías

In [4]:
# Undersample the rows without a problem to reach a 90-10 balance between classes
# (10% anomalies, 90% rows without anomalies).
np.random.seed(2019)                  # fixed seed so the same rows are discarded on every run
MAJORITY_PER_MINORITY = 9             # 9 non-problem rows per problem row -> 90-10 split

# Number of non-problem rows to discard. Clamped at 0: when there are fewer than
# 9 non-problem rows per problem row the difference is negative and np.random.choice
# would raise; in that case every row is kept and the balance is whatever the data allows.
remove_n = max(
    df_no_problem_reported.shape[0] - df_problem_reported.shape[0] * MAJORITY_PER_MINORITY,
    0,
)
drop_indices = np.random.choice(df_no_problem_reported.index, remove_n, replace=False)
df_no_problem_reported = df_no_problem_reported.drop(drop_indices)

df_B = pd.concat([df_no_problem_reported, df_problem_reported], ignore_index=True)
# Keep only the selected feature columns; rebinding instead of inplace=True keeps the
# cell free of in-place mutation.
df_B = df_B.drop(columns=[column for column in df_B.columns if column not in features])

df_B.shape

(117440, 87)

#### Guardado del dataset para predicción de anomalías mediante modelos de clasificación sobre variables categóricas

In [5]:
import pickle
import os

# Create the output folder if it is missing. exist_ok=True tolerates an already
# existing directory, while any other failure (e.g. permissions) is raised instead
# of being silently swallowed by a bare except.
os.makedirs('Anomaly Prediction', exist_ok=True)

# Persist the balanced dataset so the classification section can reload it.
df_B.to_pickle('./Anomaly Prediction/cls_balanced_dataset.pkl')

#### Ejecución de una batería de modelos de clasificación

In [6]:
# Reload the balanced dataset from its pickle file, rebinding df_B to the stored copy.
# NOTE(review): pickles execute code on load — only read files produced by this notebook.
df_B = pd.read_pickle('./Anomaly Prediction/cls_balanced_dataset.pkl')

In [7]:
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Select the target by name instead of by position: slicing with iloc[:, -1] silently
# picks the wrong target if 'Problem' ever stops being the last column.
X = df_B.drop(columns='Problem').values
y = df_B['Problem'].values

# Stratified 75/25 split so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019, shuffle=True, stratify=y)

# NOTE(review): the dataset was balanced to 90-10, so always predicting the majority
# class already reaches ~0.90 accuracy; consider scoring='f1' or 'balanced_accuracy'
# so the search rewards detecting the anomaly class.
tpot = TPOTClassifier(verbosity=3, scoring='accuracy', random_state=2019, periodic_checkpoint_folder='tpot_models', 
                      n_jobs=-1, generations=5, population_size=10, cv=3)

tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# Export the best pipeline found as a standalone Python script.
tpot.export('./Anomaly Prediction/best_model_code.py')

30 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=60, style=ProgressStyle(descripti…

Saving periodic pipeline from pareto front to tpot_models\pipeline_gen_1_idx_0_2019.05.19_01-46-50.py
Skipped pipeline #12 due to time out. Continuing to the next pipeline.
Skipped pipeline #17 due to time out. Continuing to the next pipeline.
Skipped pipeline #20 due to time out. Continuing to the next pipeline.
Skipped pipeline #23 due to time out. Continuing to the next pipeline.
Generation 1 - Current Pareto front scores:
-1	0.907050408719346	LogisticRegression(input_matrix, LogisticRegression__C=0.5, LogisticRegression__dual=False, LogisticRegression__penalty=l2)

Periodic pipeline was not saved, probably saved before...
Skipped pipeline #25 due to time out. Continuing to the next pipeline.
Skipped pipeline #30 due to time out. Continuing to the next pipeline.
Skipped pipeline #35 due to time out. Continuing to the next pipeline.
Generation 2 - Current Pareto front scores:
-1	0.907050408719346	LogisticRegression(input_matrix, LogisticRegression__C=0.5, LogisticRegression__dual=Fal

#### Predicción del conjunto de test y obtención de indicadores

In [8]:
from sklearn.metrics import classification_report

# Label the held-out split with the fitted TPOT pipeline.
y_pred = tpot.predict(X_test)

# Per-class precision / recall / F1 summary of the predictions.
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

        0.0       0.91      0.99      0.95     26424
        1.0       0.66      0.14      0.23      2936

avg / total       0.89      0.91      0.88     29360

