# Predicción de anomalías

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# NOTE(review): with no `category` argument this ignores every warning class for the rest
# of the session, so library warnings raised by later cells are never shown. Consider
# narrowing it to the specific categories that are known to be noise.
warnings.simplefilter(action='ignore')

## Configuración del dataset: balance adecuado entre las dos clases

In [2]:
def _flag_report_window(reported, days_before=2):
    """Mark every reported position and the `days_before` positions just before it.

    `reported` is the sequence of 'ProblemReported' values of one device; positions whose
    value equals 1 are treated as reports. Returns a float array of the same length holding
    1 at flagged positions and 0 elsewhere.
    Assumes the values are in chronological order (one row per day) -- TODO confirm.
    """
    reported = np.asarray(reported)
    flags = np.zeros(len(reported))
    for report_pos in np.flatnonzero(reported == 1):
        # Clamp the slice start at 0 so a report near the beginning stays inside the array.
        flags[max(report_pos - days_before, 0):report_pos + 1] = 1
    return flags


df = pd.read_pickle('sampledata2.pkl')

# 'Problem' feature: for every day with 'ProblemReported' == 1, label that day and the
# 2 preceding days of the same device.
for device in df.DeviceID.unique():
    device_rows = df.DeviceID == device
    device_reports = df.loc[device_rows, 'ProblemReported'].values
    df.loc[device_rows, 'Problem'] = _flag_report_window(device_reports)

# Split the rows by label so each class can be handled separately.
has_problem = df['Problem'] == 1
has_no_problem = df['Problem'] == 0
df_problem_reported = df.loc[has_problem]
df_no_problem_reported = df.loc[has_no_problem]

In [3]:
# Feature selection: the date parts and the 'Problem' label, plus every column whose name
# contains one of the substrings below and is at least the given number of characters long.
features = ['Day', 'Month', 'Year', 'Problem']
name_filters = [
    ('Error', 13),
    ('Problem', 26),
    ('Fault', 29),
    ('Warning_Type', 26),
    ('AE', 0),         # no length threshold
    ('Grouped', 0),    # no length threshold
]
for token, min_length in name_filters:
    features.extend([column for column in df.columns if token in column and len(column) >= min_length])

# Downsample the no-problem rows to a 90-10 balance (9 no-problem rows per problem row).
np.random.seed(2019)
# Clamp at zero: if there are already fewer than 9 no-problem rows per problem row the
# difference is negative, and np.random.choice rejects a negative sample size.
remove_n = max(df_no_problem_reported.shape[0] - (df_problem_reported.shape[0] * 9), 0)
drop_indices = np.random.choice(df_no_problem_reported.index, remove_n, replace=False)
df_no_problem_reported = df_no_problem_reported.drop(drop_indices)

# Keep only the selected features in both per-class frames. The names are rebound to new
# frames instead of dropping in place, because both frames were sliced out of `df`.
selected = set(features)
df_problem_reported = df_problem_reported.drop(
    columns=[column for column in df_problem_reported.columns if column not in selected]
)
df_no_problem_reported = df_no_problem_reported.drop(
    columns=[column for column in df_no_problem_reported.columns if column not in selected]
)

# Final frame: no-problem rows first, then problem rows, renumbered from 0.
df_B = pd.concat([df_no_problem_reported, df_problem_reported], ignore_index=True)

df_B.shape

(117440, 135)

## Carga de datos de los 5 ficheros 'df_rolling_*d.pkl'

In [4]:
def _rolling_columns_for(base, rolling):
    """Return the 'Roll' columns of `rolling` for exactly the rows of `base`, in `base` order.

    The two frames are inner-joined on their indexes and only the columns whose name
    contains 'Roll' are kept. Raises ValueError when the joined index differs from
    `base.index` (rows missing from `rolling`, duplicated, or reordered), because the
    result is later attached to df_B by position.
    """
    merged = pd.merge(base, rolling, how='inner', left_index=True, right_index=True)
    if not merged.index.equals(base.index):
        raise ValueError('rolling features do not line up with the selected rows '
                         '(%d rows after merge, %d expected)' % (len(merged), len(base)))
    # Keeping only the rolling columns here lets the wide merged frame be released on return.
    return merged.filter(like='Roll', axis=1)


time_windows = ['3d', '7d', '15d', '30d', '90d']
rolling_files = ['./df_rolling_' + time_w + '.pkl' for time_w in time_windows]

rolling_blocks = []
for r_file in rolling_files:
    # Load the file and keep only the rolling features.
    df_rolling = pd.read_pickle(r_file).filter(like='Roll', axis=1)

    # Join each class separately so the previously selected row indexes are kept, then
    # stack them in the same order used to build df_B (no-problem rows, then problem rows).
    df_npr_temp = _rolling_columns_for(df_no_problem_reported, df_rolling)
    df_pr_temp = _rolling_columns_for(df_problem_reported, df_rolling)
    rolling_blocks.append(pd.concat([df_npr_temp, df_pr_temp], ignore_index=True))

# Attach all windows in a single column-wise concat instead of regrowing df_B per window.
df_B = pd.concat([df_B] + rolling_blocks, axis=1)

df_B.shape

(117440, 1095)

#### Guardado del dataset extendido (features categóricas y numéricas) para predicción de anomalías mediante clasificadores

In [5]:
import pickle
import os

# Create the output folder if it does not exist yet. Unlike a bare `except`, this only
# tolerates "already exists"; any other failure (e.g. permissions) is raised here rather
# than surfacing later as a confusing error from to_pickle.
os.makedirs('Anomaly Prediction', exist_ok=True)

df_B.to_pickle('./Anomaly Prediction/cls_balanced_extended_dataset.pkl')

#### Carga del dataset extendido y ejecución de una batería de modelos de clasificación

In [6]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Start from the dataset stored on disk.
df_B = pd.read_pickle('./Anomaly Prediction/cls_balanced_extended_dataset.pkl')

# Every column except the 'Problem' label is a model input.
feature_columns = [column for column in df_B.columns if column != 'Problem']
X = df_B[feature_columns].values
y = df_B['Problem'].values

# 75/25 split, stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2019,
    shuffle=True,
    stratify=y,
)

# NOTE(review): the classes were balanced 90-10 earlier in the notebook, so the accuracy
# printed below should be read together with the per-class report at the end of the cell.
tpot = TPOTClassifier(
    verbosity=3,
    scoring='accuracy',
    random_state=2019,
    periodic_checkpoint_folder='tpot_models',
    n_jobs=-1,
    generations=5,
    population_size=10,
    cv=3,
)

tpot.fit(X_train, y_train)
test_score = tpot.score(X_test, y_test)
print(test_score)

# Write the code of the best pipeline found to a standalone script.
tpot.export('./Anomaly Prediction/best_extended_model_code.py')

y_pred = tpot.predict(X_test)

print(classification_report(y_test, y_pred))

30 operators have been imported by TPOT.
Imputing missing values in feature set


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=60, style=ProgressStyle(descripti…

Skipped pipeline #1 due to time out. Continuing to the next pipeline.
Skipped pipeline #3 due to time out. Continuing to the next pipeline.
Skipped pipeline #6 due to time out. Continuing to the next pipeline.
Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #11 due to time out. Continuing to the next pipeline.
Skipped pipeline #15 due to time out. Continuing to the next pipeline.
Saving periodic pipeline from pareto front to tpot_models\pipeline_gen_1_idx_0_2019.05.19_11-42-17.py
Skipped pipeline #18 due to time out. Continuing to the next pipeline.
Skipped pipeline #21 due to time out. Continuing to the next pipeline.
Skipped pipeline #23 due to time out. Continuing to the next pipeline.
Skipped pipeline #25 due to time out. Continuing to the next pipeline.
Skipped pipeline #27 due to time out. Continuing to the next pipeline.
Skipped pipeline #29 due to time out. Continuing to the next pipeline.
Skipped pipeline #32 due to time out. Continuing t