In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score
from scipy.stats import ks_2samp
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
%matplotlib inline

Using TensorFlow backend.


In [7]:
X_train = pd.read_csv('input/X_train_res.csv').drop(columns=['Unnamed: 0'])
y_train = pd.read_csv('input/y_smote.csv').drop(columns=['Unnamed: 0'])
X_val = pd.read_csv('input/X_val.csv').drop(columns=['INDEX'])
y_val = pd.read_csv('input/y_val.csv', names=['INDEX', 'IND_BOM_1_1']).drop(columns=['INDEX'])
X_test = pd.read_csv('input/X_test.csv').drop(columns=['INDEX'])
y_test = pd.read_csv('input/y_test.csv', names=['INDEX', 'IND_BOM_1_1']).drop(columns=['INDEX'])

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [9]:
input_dim = len(list(X_train_res)) # Dimensão de entrada do dataset

'''
    * Método para calcular a distribuição acumulada das classes, de 0.0 a 1.0
    * Utilizado para o cálculo do KS
'''
def calc_distr(y_true:pd.DataFrame, y_pred_proba:np.array):
    ac_distr_0 = np.zeros(101)
    ac_distr_1 = np.zeros(101)
    count_classes = y_true['IND_BOM_1_1'].value_counts()
    for i in range(1, 101):
        lim = i/100.0
        ac_classes = y_true[y_pred_proba <= lim]['IND_BOM_1_1'].value_counts()
        ac_distr_0[i] += ac_classes.get(0, 0) 
        ac_distr_1[i] += ac_classes.get(1, 0)
    return (ac_distr_0/count_classes[0], ac_distr_1/count_classes[1])

In [10]:
'''
    * Método para calcular as métricas a serem impressas
'''

def calc_metrics(y_true, y_pred, y_pred_proba):
    cm = confusion_matrix(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    metrics = {
        'cm': cm,
        'precision': precision,
        'accuracy': accuracy,
        'f1': f1,
        'recall': recall
    }
    if not y_pred_proba is None:
        ac_distr0, ac_distr1 = calc_distr(y_true, y_pred_proba)
        ks = ks_2samp(ac_distr0, ac_distr1)
        auroc = roc_auc_score(y_true, y_pred_proba)
        aps = average_precision_score(y_true, y_pred_proba)
        metrics['ks'] = ks[0]
        metrics['auroc'] = auroc
        metrics['aps'] = aps
    return metrics

In [11]:
'''
    * Método para imprimir todas as métricas
'''
def print_metrics(metrics:dict):
    print('Matriz de Confusão:', end='\n\n')
    print(pd.DataFrame(metrics['cm'], columns=['T', 'F'], index=['T', 'F']), end='\n\n')
    print('Área Sob Curva ROC: %.5f'%(metrics['auroc']), end = '\n\n')
    print('KS-Score: %.5f'%(metrics['ks']), end='\n\n')
    print('Precisão Média de Previsão: %.5f'%(metrics['aps']), end='\n\n')
    print('Precisão: %.5f'%(metrics['precision']), end='\n\n')
    print('Acurácia: %.5f'%(metrics['accuracy']), end='\n\n')
    print('Recall: %.5f'%(metrics['recall']), end='\n\n')
    print('F1-Score: %.5f'%(metrics['f1']), end='\n\n')

In [24]:
mlp = Sequential()
mlp.add(Dense(20, activation='tanh', input_dim=input_dim)) # Camada de entrada
mlp.add(Dense(1, activation='sigmoid')) # Camada de saída
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = mlp.fit(X_train_res, y_train_res, batch_size=64, epochs=100000, 
        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]));
y_pred = mlp.predict_classes(X_test)
y_pred_proba = mlp.predict(X_test)

Train on 307646 samples, validate on 26077 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000


In [25]:
metrics = calc_metrics(y_test, y_pred, y_pred_proba)
print_metrics(metrics)

Matriz de Confusão:

       T      F
T  15604  28648
F  17182  67001

Área Sob Curva ROC: 0.62004

KS-Score: 0.25743

Precisão Média de Previsão: 0.74326

Precisão: 0.70049

Acurácia: 0.64317

Recall: 0.79590

F1-Score: 0.74515



In [26]:
mlp = Sequential()
mlp.add(Dense(40, activation='tanh', input_dim=input_dim)) # Camada de entrada
mlp.add(Dense(1, activation='sigmoid')) # Camada de saída
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = mlp.fit(X_train_res, y_train_res, batch_size=64, epochs=100000, 
        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]))
y_pred = mlp.predict_classes(X_test)
y_pred_proba = mlp.predict(X_test)

Train on 307646 samples, validate on 26077 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000


In [27]:
metrics = calc_metrics(y_test, y_pred, y_pred_proba)
print_metrics(metrics)

Matriz de Confusão:

       T      F
T  15676  28576
F  17766  66417

Área Sob Curva ROC: 0.61786

KS-Score: 0.28713

Precisão Média de Previsão: 0.74139

Precisão: 0.69918

Acurácia: 0.63918

Recall: 0.78896

F1-Score: 0.74136



In [30]:
hidden_nodes = 30

mlp = Sequential()
mlp.add(Dense(40, activation='sigmoid', input_dim=input_dim)) # Camada de entrada
for i in range(5):
    mlp.add(Dense(hidden_nodes, activation='relu', input_dim=input_dim))
mlp.add(Dense(1, activation='sigmoid')) # Camada de saída
mlp.compile(optimizer='adam', loss='mean_squared_logarithmic_error', metrics=['acc'])
history = mlp.fit(X_train_res, y_train_res, batch_size=64, epochs=100000, 
        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]))
y_pred = mlp.predict_classes(X_test)
y_pred_proba = mlp.predict(X_test)
metrics = calc_metrics(y_test, y_pred, y_pred_proba)
print_metrics(metrics)

Train on 307646 samples, validate on 26077 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000
Matriz de Confusão:

      T      F
T  2564  41688
F  1590  82593

Área Sob Curva ROC: 0.60856

KS-Score: 0.32673

Precisão Média de Previsão: 0.73663

Precisão: 0.66457

Acurácia: 0.66304

Recall: 0.98111

F1-Score: 0.79240

