In [1]:
import os
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the `comparison` package imported
# further down resolve — confirm against the repository layout.
sys.path.append('..')

## Pré-processamento

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the diabetes data set from ../../data relative to the working directory.
# The textual answers listed in `true_values` / `false_values` are parsed as
# booleans, and `dtype=int` asks pandas to store every column as integers.
df = pd.read_csv(
    os.path.join('..', '..', 'data', 'diabetes.csv'),
    true_values=('Yes', 'Positive', 'Male'),
    false_values=('No', 'Negative', 'Female'),
    dtype=int
)

In [4]:
# Standardise the 'Age' column in place and display the resulting frame.
# NOTE(review): the scaler is fit on every row, before the train/test splits
# performed later in this notebook, so held-out rows contribute to the scaling
# statistics.
df[['Age']] = StandardScaler().fit_transform(df[['Age']])
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,-0.661367,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,0.821362,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,-0.578993,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,-0.249498,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,0.986110,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,-0.743741,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,-0.002376,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,0.821362,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,-1.320358,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [5]:
# Split the frame into a feature matrix (every column except 'class') and the
# label vector ('class'), both as NumPy arrays, then show their shapes.
X = df.drop(columns=['class']).to_numpy()
y = df['class'].to_numpy()
X.shape, y.shape

((520, 16), (520,))

## Classificação

In [6]:
import copy
import random
import time

from sklearn.metrics import confusion_matrix
from comparison.neural_network.model import MLPClassifier

### Todas as amostras

In [19]:
def get_total_metrics(X, y, *, n_iter: int, verbose: bool = False):
    """Fit and score the MLP on the full data set ``n_iter`` times.

    Every iteration seeds Python's ``random`` module with the iteration index,
    builds a new ``MLPClassifier``, fits it on ``(X, y)`` and evaluates it on
    the very same samples (resubstitution).

    Args:
        X: Feature matrix. Must expose ``tolist()``, because predictions are
            requested on ``X.tolist()``.
        y: Binary labels encoded as 0 (negative) and 1 (positive).
        n_iter: Number of runs to perform.
        verbose: When true, print the elapsed time at the start of each
            iteration and once more after the last one.

    Returns:
        A ``pandas.DataFrame`` with one row per iteration and the columns
        'Confiabilidade positiva' (TP / (TP + FP)),
        'Confiabilidade negativa' (TN / (TN + FN)),
        'Sensibilidade' (TP / (TP + FN)),
        'Especificidade' (TN / (FP + TN)),
        'Acurácia total' and 'Taxa de erro total'.

    NOTE(review): a metric whose denominator is zero is not guarded here.
    """
    results = []

    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')

        # One seed per run. Presumably MLPClassifier draws its randomness from
        # the `random` module — confirm in comparison.neural_network.model.
        random.seed(i)

        clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)
        clf.fit(X, y)

        y_pred = clf.predict(X.tolist())

        # Pin the label order: without `labels` the matrix is sized by the
        # labels actually observed, so the 2x2 unpacking below would fail when
        # fewer than two distinct labels show up in `y` and `y_pred`.
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y, y_pred, labels=[0, 1])
        n_samples = t_p + f_n + f_p + t_n

        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': (t_p + t_n) / n_samples,
            'Taxa de erro total' : (f_p + f_n) / n_samples
        })

    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()

    return pd.DataFrame(results)

In [20]:
# Ten runs trained and scored on the full data set; the resulting frame is
# shown as the cell output.
total_df = get_total_metrics(X, y, n_iter=10, verbose=True)
total_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 69.84 seconds
Starting iteration 2 at 139.41 seconds
Starting iteration 3 at 212.57 seconds
Starting iteration 4 at 288.78 seconds
Starting iteration 5 at 364.07 seconds
Starting iteration 6 at 439.31 seconds
Starting iteration 7 at 514.77 seconds
Starting iteration 8 at 589.95 seconds
Starting iteration 9 at 665.36 seconds
Finished after 740.83 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total,Taxa de erro total
0,0.996835,0.97549,0.984375,0.995,0.988462,0.011538
1,0.984424,0.979899,0.9875,0.975,0.982692,0.017308
2,0.984424,0.979899,0.9875,0.975,0.982692,0.017308
3,0.981308,0.974874,0.984375,0.97,0.978846,0.021154
4,0.987616,0.994924,0.996875,0.98,0.990385,0.009615
5,0.981308,0.974874,0.984375,0.97,0.978846,0.021154
6,0.987461,0.975124,0.984375,0.98,0.982692,0.017308
7,0.987616,0.994924,0.996875,0.98,0.990385,0.009615
8,0.99375,0.99,0.99375,0.99,0.992308,0.007692
9,0.987616,0.994924,0.996875,0.98,0.990385,0.009615


In [24]:
# Mean and standard deviation (pandas default, ddof=1) of the total error rate, in percent.
total_df['Taxa de erro total'].mean() * 100, total_df['Taxa de erro total'].std() * 100

(1.4230769230769231, 0.5144205807112534)

In [None]:
# Mean and standard deviation (pandas default, ddof=1) of the total accuracy, in percent.
total_df['Acurácia total'].mean() * 100, total_df['Acurácia total'].std() * 100

### Amostras de treino

In [13]:
from sklearn.model_selection import train_test_split

In [21]:
def get_train_metrics(X, y, *, n_iter: int, verbose: bool = False):
    """Fit the MLP on ``n_iter`` training splits and score it on those splits.

    Python's ``random`` module is seeded once with 0 and a single
    ``MLPClassifier`` is created up front. For each iteration ``i`` the data
    is split 70/30 with ``random_state=i``; the classifier is fit on the
    training part and evaluated on that same training part.

    NOTE(review): the same classifier instance is refit on every split.
    Whether ``fit`` restarts from fresh weights is not visible here — confirm
    in comparison.neural_network.model.

    Args:
        X: Feature matrix accepted by ``train_test_split``; the training part
            must expose ``tolist()``.
        y: Binary labels encoded as 0 (negative) and 1 (positive).
        n_iter: Number of splits to evaluate.
        verbose: When true, print the elapsed time at the start of each
            iteration and once more after the last one.

    Returns:
        A ``pandas.DataFrame`` with one row per split and the columns
        'Confiabilidade positiva' (TP / (TP + FP)),
        'Confiabilidade negativa' (TN / (TN + FN)),
        'Sensibilidade' (TP / (TP + FN)),
        'Especificidade' (TN / (FP + TN)),
        'Acurácia total' and 'Taxa de erro total'.

    NOTE(review): a metric whose denominator is zero is not guarded here.
    """
    random.seed(0)
    clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)

    results = []

    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')

        # Only the training portion is needed; the held-out part is discarded.
        X_train, _, y_train, _ = \
            train_test_split(X, y, test_size=0.3, random_state=i)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_train.tolist())

        # Pin the label order: without `labels` the matrix is sized by the
        # labels actually observed, so the 2x2 unpacking below would fail when
        # fewer than two distinct labels show up in a split.
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y_train, y_pred, labels=[0, 1])
        n_samples = t_p + f_n + f_p + t_n

        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': (t_p + t_n) / n_samples,
            'Taxa de erro total' : (f_p + f_n) / n_samples
        })

    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()

    return pd.DataFrame(results)

In [22]:
# Ten 70/30 splits, each scored on its own training portion; the resulting
# frame is shown as the cell output.
train_df = get_train_metrics(X, y, n_iter=10, verbose=True)
train_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 52.44 seconds
Starting iteration 2 at 104.32 seconds
Starting iteration 3 at 157.19 seconds
Starting iteration 4 at 210.14 seconds
Starting iteration 5 at 263.65 seconds
Starting iteration 6 at 316.75 seconds
Starting iteration 7 at 369.86 seconds
Starting iteration 8 at 422.07 seconds
Starting iteration 9 at 473.31 seconds
Finished after 523.60 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total,Taxa de erro total
0,0.982301,0.971014,0.982301,0.971014,0.978022,0.021978
1,0.99537,0.952703,0.968468,0.992958,0.978022,0.021978
2,0.995495,0.964789,0.977876,0.992754,0.983516,0.016484
3,0.973684,0.985294,0.991071,0.957143,0.978022,0.021978
4,0.986726,0.949275,0.969565,0.977612,0.972527,0.027473
5,0.991189,0.970803,0.982533,0.985185,0.983516,0.016484
6,0.969957,0.954198,0.974138,0.94697,0.964286,0.035714
7,0.99095,0.944056,0.964758,0.985401,0.972527,0.027473
8,0.986425,0.937063,0.960352,0.978102,0.967033,0.032967
9,0.990566,0.960526,0.972222,0.986486,0.978022,0.021978


In [None]:
# Mean and standard deviation (pandas default, ddof=1) of the training accuracy, in percent.
train_df['Acurácia total'].mean() * 100, train_df['Acurácia total'].std() * 100

In [23]:
# Mean and standard deviation (pandas default, ddof=1) of the training error rate, in percent.
train_df['Taxa de erro total'].mean() * 100, train_df['Taxa de erro total'].std() * 100

(2.4450549450549453, 0.640371199159234)

### Amostras de teste

In [14]:
def get_test_metrics(X, y, *, n_iter: int, verbose: bool = False):
    """Fit the MLP on ``n_iter`` training splits and score it on the held-out part.

    Python's ``random`` module is seeded once with 0 and a single
    ``MLPClassifier`` is created up front. For each iteration ``i`` the data
    is split 70/30 with ``random_state=i``; the classifier is fit on the
    training part and evaluated on the test part. The layers belonging to the
    highest test accuracy are kept alongside the metrics.

    NOTE(review): the same classifier instance is refit on every split.
    Whether ``fit`` restarts from fresh weights is not visible here — confirm
    in comparison.neural_network.model.

    Args:
        X: Feature matrix accepted by ``train_test_split``; the test part must
            expose ``tolist()``.
        y: Binary labels encoded as 0 (negative) and 1 (positive).
        n_iter: Number of splits to evaluate.
        verbose: When true, print the elapsed time at the start of each
            iteration and once more after the last one.

    Returns:
        A tuple ``(metrics, best)``. ``metrics`` is a ``pandas.DataFrame``
        with one row per split and the columns
        'Confiabilidade positiva' (TP / (TP + FP)),
        'Confiabilidade negativa' (TN / (TN + FN)),
        'Sensibilidade' (TP / (TP + FN)),
        'Especificidade' (TN / (FP + TN)),
        'Acurácia total' and 'Taxa de erro total'.
        ``best`` is ``(accuracy, layers)`` for the split with the highest test
        accuracy (the earliest one on ties), or ``(-inf, None)`` when
        ``n_iter`` is 0.

    NOTE(review): a metric whose denominator is zero is not guarded here.
    """
    random.seed(0)
    clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)

    results = []
    best = (float('-inf'), None)

    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')

        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.3, random_state=i)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test.tolist())

        # Pin the label order: without `labels` the matrix is sized by the
        # labels actually observed, so the 2x2 unpacking below would fail when
        # fewer than two distinct labels show up in a split.
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y_test, y_pred, labels=[0, 1])
        n_samples = t_p + f_n + f_p + t_n
        accuracy = (t_p + t_n) / n_samples

        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': accuracy,
            'Taxa de erro total' : (f_p + f_n) / n_samples
        })

        best_accuracy, _ = best
        if accuracy > best_accuracy:
            # Snapshot the layers: `clf` is refit on the following splits, so
            # keeping a bare reference to `clf.layers` could end up pointing
            # at weights from a later (worse) iteration.
            best = (accuracy, copy.deepcopy(clf.layers))

    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()

    return pd.DataFrame(results), best

In [15]:
# Ten 70/30 splits, each scored on its held-out portion. Also unpacks the
# highest accuracy and the layers recorded with it; the metrics frame is shown
# as the cell output.
test_df, (best_accuracy, best_layers) = get_test_metrics(X, y, n_iter=10, verbose=True)
test_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 53.10 seconds
Starting iteration 2 at 105.15 seconds
Starting iteration 3 at 157.78 seconds
Starting iteration 4 at 210.07 seconds
Starting iteration 5 at 262.41 seconds
Starting iteration 6 at 314.80 seconds
Starting iteration 7 at 367.40 seconds
Starting iteration 8 at 419.52 seconds
Starting iteration 9 at 472.10 seconds
Finished after 524.48 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total,Taxa de erro total
0,0.978495,0.952381,0.968085,0.967742,0.967949,0.032051
1,0.98913,0.890625,0.928571,0.982759,0.948718,0.051282
2,0.956989,0.920635,0.946809,0.935484,0.942308,0.057692
3,0.947917,0.916667,0.947917,0.916667,0.935897,0.064103
4,0.987654,0.866667,0.888889,0.984848,0.929487,0.070513
5,0.946237,0.952381,0.967033,0.923077,0.948718,0.051282
6,0.965116,0.928571,0.943182,0.955882,0.948718,0.051282
7,0.975309,0.813333,0.849462,0.968254,0.897436,0.102564
8,0.954545,0.867647,0.903226,0.936508,0.916667,0.083333
9,0.969388,0.844828,0.913462,0.942308,0.923077,0.076923


In [None]:
# Mean and standard deviation (pandas default, ddof=1) of the test accuracy, in percent.
test_df['Acurácia total'].mean() * 100, test_df['Acurácia total'].std() * 100

In [25]:
# Mean and standard deviation (pandas default, ddof=1) of the test error rate, in percent.
test_df['Taxa de erro total'].mean() * 100, test_df['Taxa de erro total'].std() * 100

(6.41025641025641, 2.004451179411722)

In [26]:
# Mean and standard deviation (pandas default, ddof=1) of the test sensitivity, in percent.
test_df['Sensibilidade'].mean() * 100, test_df['Sensibilidade'].std() * 100

(92.56635096867593, 3.738688239576974)

In [None]:
# Highest test accuracy returned by get_test_metrics for the ten splits.
best_accuracy

In [None]:
import json

# Describe the best network as a list with one entry per layer (numbered from
# 1), each carrying the weights of every perceptron in that layer, and print
# it as indented JSON.
json_ = [
    {
        'layer': position,
        'weights': [unit.weights for unit in units],
    }
    for position, units in enumerate(best_layers, start=1)
]

print(json.dumps(json_, indent=4))