Dado o desafio de implementar uma rede simples (MLP) usando o Keras, será utilizado um dataset que apresenta informções sobre mulheres diagnosticadas com câncer de mama. Entre as informações estão o tipo de câncer e se houve reincidência ou não. O objetivo do modelo é prever gravidade do câncer, isso acontece através da coluna output_os, se for 0 não é grave e se for 1 é.

# Importando as bibliotecas

In [2]:
!pip install tensorflow



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import Precision, Recall
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Baixando os dados

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Sistemas de informação /Módulo 11 - Aegea/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Análise Exploratória

In [6]:
df.Class.unique()

array([0, 1])

In [7]:
df.Amount.value_counts()

Unnamed: 0_level_0,count
Amount,Unnamed: 1_level_1
1.00,13688
1.98,6044
0.89,4872
9.99,4747
15.00,3280
...,...
271.63,1
207.42,1
3820.86,1
1088.04,1


# Separando os dados

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = df.drop(["Class"], axis=1)
y = df.Class

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [11]:
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],)))

In [12]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Modelagem

In [13]:
class F1Metric(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if 'accuracy' in logs:
            y_pred = (self.model.predict(X_test) > 0.5).astype("int32")
            f1 = f1_score(y_test, y_pred)
            print(f"Epoch {epoch+1} - F1 Score: {f1}")

In [14]:
model.fit(X_train, y_train, epochs=10, batch_size=10, callbacks=[F1Metric()])

Epoch 1/10
Epoch 1 - F1 Score: 0.11864406779661017
Epoch 2/10
Epoch 2 - F1 Score: 0.21614583333333331
Epoch 3/10
Epoch 3 - F1 Score: 0.36458333333333337
Epoch 4/10
Epoch 4 - F1 Score: 0.44878048780487806
Epoch 5/10
Epoch 5 - F1 Score: 0.4631578947368421
Epoch 6/10
Epoch 6 - F1 Score: 0.5573770491803278
Epoch 7/10
Epoch 7 - F1 Score: 0.3850931677018634
Epoch 8/10
Epoch 8 - F1 Score: 0.42452830188679247
Epoch 9/10
Epoch 9 - F1 Score: 0.5174129353233831
Epoch 10/10
Epoch 10 - F1 Score: 0.5506072874493928


<keras.src.callbacks.History at 0x7cf05c0c6d70>

In [15]:
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype("int32")



# Avaliação

* Accuracy
* F1-Score

In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"Acurácia: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precisão: {precision}")
print(f"Recall: {recall}")
print(f"AUC-ROC: {auc_roc}")


Acurácia: 0.9980513324672589
F1 Score: 0.5506072874493928
Precisão: 0.46258503401360546
Recall: 0.68
AUC-ROC: 0.8393053357250889


In [17]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, roc_auc_score

model = RandomForestClassifier(random_state=3)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Melhores Hiperparâmetros:", grid_search.best_params_)

y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits


  pid = os.fork()
  pid = os.fork()


KeyboardInterrupt: 

O caderno atual se diferencia por ser uma evolução do apresentado duas semanas atrás na ponderada anterior. Dessa vez há o desafio do overfitting a ser superado. Para isso, a adição de hiperparêmetros é a estratégia escolhida.
