#### Importing and preprocessing the data

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pyfume.Clustering import Clusterer
from pyfume.EstimateAntecendentSet import AntecedentEstimator
from pyfume.EstimateConsequentParameters import ConsequentEstimator
from pyfume.SimpfulModelBuilder import SugenoFISBuilder
from pyfume.Tester import SugenoFISTester
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from numpy import clip, column_stack, argmax
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score

data = pd.read_csv('./data/wbco.csv', header=None)
indexes = ['LThick', 'UCellSize', 'UCellShape', 'MAdhesion', 'SECS', 'BlandC', 'NormNuc', 'Mitoses', 'Unknown', 'Target']
data.columns = indexes

indexes = indexes[:-1]

In [66]:
data.dtypes == int

LThick        False
UCellSize     False
UCellShape    False
MAdhesion     False
SECS          False
BlandC        False
NormNuc       False
Mitoses       False
Unknown       False
Target        False
dtype: bool

A variável 'BlandC' está atualmente definida como string pois possui algumas entradas com '?'. Como resolver este problema? O número de entradas sem valor é significativo?

In [67]:
na_perc = (data.BlandC[data.BlandC == '?']).count() / data.BlandC.count() * 100
print(str(na_perc) + '% percent of entries are null.')

2.28898426323319% percent of entries are null.


Selected aproach: fill missing values with the column's average.

In [68]:
avg = data.BlandC[data.BlandC != '?'].astype(float).mean()
data.loc[data.BlandC == '?', 'BlandC'] = avg
data.BlandC = data.BlandC.astype(int)

Creating a train-test split

In [69]:
data_x = data.drop('Target', axis = 1)
data_y = data.Target

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, random_state=21)

Formating the data to numpy

In [70]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.array
y_test = y_test.array

#### Building the model

In [71]:
# %% Train model
regr = MLPClassifier(hidden_layer_sizes=(31,31,31),random_state=42, max_iter=500)
regr.fit(x_train, y_train)

# %% Get model predictions
y_pred_train = regr.predict(x_train)  # Predictions on training data
y_pred_test = regr.predict(x_test)    # Predictions on testing data

# %% Compute classification metrics
rec_score_train = recall_score(y_train, y_pred_train)
rec_score_test = recall_score(y_test, y_pred_test)
print("Training Recall: {:.3f}".format(rec_score_train))
print("Testing Recall: {:.3f}".format(rec_score_test))

Training Recall: 1.000
Testing Recall: 0.925
