In [1]:
import os
import sys
sys.path.append('..')

## Pré-processamento

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv(
    os.path.join('..', '..', 'data', 'diabetes.csv'),
    true_values=('Yes', 'Positive', 'Male'),
    false_values=('No', 'Negative', 'Female'),
    dtype=int
)

In [4]:
df[['Age']] = StandardScaler().fit_transform(df[['Age']])
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,-0.661367,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,0.821362,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,-0.578993,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,-0.249498,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,0.986110,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,-0.743741,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,-0.002376,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,0.821362,0,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,-1.320358,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [5]:
X = df.drop(columns=['class']).to_numpy()
y = df['class'].to_numpy()
X.shape, y.shape

((520, 16), (520,))

## Classificação

In [6]:
import time
import random 
from sklearn.metrics import confusion_matrix
from comparison.neural_network.model import MLPClassifier

### Todas as amostras

In [7]:
def get_total_metrics(X, y, *, n_iter: int, verbose: bool = False):
    results = []
    
    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')
            
        random.seed(i)
        
        clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)
        clf.fit(X, y)
        
        y_pred = clf.predict(X.tolist())
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y, y_pred)
        
        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': (t_p + t_n) / (t_p + f_n + f_p + t_n)
        })
        
    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()
    
    return pd.DataFrame(results)

In [8]:
total_df = get_total_metrics(X, y, n_iter=10, verbose=True)
total_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 113.61 seconds
Starting iteration 2 at 167.01 seconds
Starting iteration 3 at 280.88 seconds
Starting iteration 4 at 405.85 seconds
Starting iteration 5 at 534.51 seconds
Starting iteration 6 at 656.69 seconds
Starting iteration 7 at 773.67 seconds
Starting iteration 8 at 891.19 seconds
Starting iteration 9 at 1007.17 seconds
Finished after 1124.99 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total
0,0.996835,0.97549,0.984375,0.995,0.988462
1,0.984424,0.979899,0.9875,0.975,0.982692
2,0.984424,0.979899,0.9875,0.975,0.982692
3,0.981308,0.974874,0.984375,0.97,0.978846
4,0.987616,0.994924,0.996875,0.98,0.990385
5,0.981308,0.974874,0.984375,0.97,0.978846
6,0.987461,0.975124,0.984375,0.98,0.982692
7,0.987616,0.994924,0.996875,0.98,0.990385
8,0.99375,0.99,0.99375,0.99,0.992308
9,0.987616,0.994924,0.996875,0.98,0.990385


In [9]:
total_df['Acurácia total'].mean() * 100, total_df['Acurácia total'].std() * 100

(98.57692307692307, 0.514420580711257)

### Amostras de treino

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
def get_train_metrics(X, y, *, n_iter: int, verbose: bool = False):
    random.seed(0)
    clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)

    results = []
    
    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')
            
        X_train, _, y_train, _ = \
            train_test_split(X, y, test_size=0.3, random_state=i)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_train.tolist())
        
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y_train, y_pred)
        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': (t_p + t_n) / (t_p + f_n + f_p + t_n)
        })
        
    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()
        
    return pd.DataFrame(results)

In [12]:
train_df = get_train_metrics(X, y, n_iter=10, verbose=True)
train_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 74.55 seconds
Starting iteration 2 at 153.83 seconds
Starting iteration 3 at 233.13 seconds
Starting iteration 4 at 313.25 seconds
Starting iteration 5 at 393.37 seconds
Starting iteration 6 at 472.87 seconds
Starting iteration 7 at 554.03 seconds
Starting iteration 8 at 627.34 seconds
Starting iteration 9 at 705.38 seconds
Finished after 784.01 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total
0,0.982301,0.971014,0.982301,0.971014,0.978022
1,0.99537,0.952703,0.968468,0.992958,0.978022
2,0.995495,0.964789,0.977876,0.992754,0.983516
3,0.973684,0.985294,0.991071,0.957143,0.978022
4,0.986726,0.949275,0.969565,0.977612,0.972527
5,0.991189,0.970803,0.982533,0.985185,0.983516
6,0.969957,0.954198,0.974138,0.94697,0.964286
7,0.99095,0.944056,0.964758,0.985401,0.972527
8,0.986425,0.937063,0.960352,0.978102,0.967033
9,0.990566,0.960526,0.972222,0.986486,0.978022


In [13]:
train_df['Acurácia total'].mean() * 100, train_df['Acurácia total'].std() * 100

(97.55494505494505, 0.6403711991592317)

### Amostras de teste

In [14]:
def get_test_metrics(X, y, *, n_iter: int, verbose: bool = False):
    random.seed(0)
    clf = MLPClassifier(n_layers=4, layer_size=5, learning_rate=0.3, n_generations=100)

    results = []
    best = (float('-inf'), None)
    
    t0 = time.time()
    for i in range(n_iter):
        if verbose:
            print(f'Starting iteration {i} at {time.time() - t0:.2f} seconds')
            
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.3, random_state=i)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test.tolist())
        
        ((t_n, f_p), (f_n, t_p)) = confusion_matrix(y_test, y_pred)
        accuracy = (t_p + t_n) / (t_p + f_n + f_p + t_n)
        results.append({
            'Confiabilidade positiva': t_p / (t_p + f_p),
            'Confiabilidade negativa': t_n / (t_n + f_n),
            'Sensibilidade': t_p / (t_p + f_n),
            'Especificidade': t_n / (f_p + t_n),
            'Acurácia total': accuracy
        })
        
        best_accuracy, _ = best
        if accuracy > best_accuracy:
            best = (accuracy, clf.layers)
        
    if verbose:
        print(f'Finished after {time.time() - t0:.2f} seconds')
        print()
        
    return pd.DataFrame(results), best

In [15]:
test_df, (best_accuracy, best_layers) = get_test_metrics(X, y, n_iter=10, verbose=True)
test_df

Starting iteration 0 at 0.00 seconds
Starting iteration 1 at 79.05 seconds
Starting iteration 2 at 160.07 seconds
Starting iteration 3 at 238.39 seconds
Starting iteration 4 at 317.50 seconds
Starting iteration 5 at 397.18 seconds
Starting iteration 6 at 476.58 seconds
Starting iteration 7 at 555.67 seconds
Starting iteration 8 at 635.02 seconds
Starting iteration 9 at 714.94 seconds
Finished after 795.46 seconds



Unnamed: 0,Confiabilidade positiva,Confiabilidade negativa,Sensibilidade,Especificidade,Acurácia total
0,0.978495,0.952381,0.968085,0.967742,0.967949
1,0.98913,0.890625,0.928571,0.982759,0.948718
2,0.956989,0.920635,0.946809,0.935484,0.942308
3,0.947917,0.916667,0.947917,0.916667,0.935897
4,0.987654,0.866667,0.888889,0.984848,0.929487
5,0.946237,0.952381,0.967033,0.923077,0.948718
6,0.965116,0.928571,0.943182,0.955882,0.948718
7,0.975309,0.813333,0.849462,0.968254,0.897436
8,0.954545,0.867647,0.903226,0.936508,0.916667
9,0.969388,0.844828,0.913462,0.942308,0.923077


In [16]:
test_df['Acurácia total'].mean() * 100, test_df['Acurácia total'].std() * 100

(93.58974358974359, 2.0044511794117206)

In [17]:
best_accuracy

0.967948717948718

In [18]:
import json

json_ = []
for i, layer in enumerate(best_layers, start=1):
    json_.append({
        'layer': i,
        'weights': [perceptron.weights for perceptron in layer]
    })
    
print(json.dumps(json_, indent=4))

[
    {
        "layer": 1,
        "weights": [
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ],
            [
                1.0
            ]
        ]
    },
    {
        "layer": 2,
        "weights": [
            [
                -0.42779912862784897,
                -2.4746071537091483,
               