# Treinamento do modelo

Iniciamos abrindo e analisando o schema do nosso dataset.

In [1]:
import pandas as pd

df = pd.read_csv('diabetes_prediction_dataset.csv')

In [48]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


Começamos examinando a primeira coluna categórica: a de histórico de fumante.

In [49]:
df['smoking_history'].value_counts()

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64

Como a maior parte dos valores é nulo, excluímos essa feature.

In [50]:
df.drop('smoking_history', axis=1, inplace=True)

Agora, checamos os valores da coluna target.

In [51]:
df['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

Percebemos que existe um desbalanceamento de classes que terá de ser tratado no futuro.

In [52]:
df_filtered = df[df['gender'] != 'Other']


Logo depois, simplificamos a coluna de gênero mantendo apenas masculino e feminino, e fazemos um one-hot encoding nessa coluna.

In [53]:
df_encoded = pd.get_dummies(df_filtered, columns=['gender'], prefix=['gender'])

In [54]:
df_encoded

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male
0,80.0,0,1,25.19,6.6,140,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0
2,28.0,0,0,27.32,5.7,158,0,0,1
3,36.0,0,0,23.45,5.0,155,0,1,0
4,76.0,1,1,20.14,4.8,155,0,0,1
...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,1,0
99996,2.0,0,0,17.37,6.5,100,0,1,0
99997,66.0,0,0,27.83,5.7,155,0,0,1
99998,24.0,0,0,35.42,4.0,100,0,1,0


O próxima passo é fazer um undersampling para balancear as classes.

In [55]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X = df_encoded.drop('diabetes', axis=1)
y = df_encoded['diabetes']

# Instantiate the SMOTE object
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Fit and apply SMOTE to the dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after SMOTE:")
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))


Class distribution after SMOTE:
{0: 91482, 1: 91482}


In [74]:
df_resampled = pd.DataFrame(data=X_resampled, columns=X.columns)
df_resampled['diabetes'] = y_resampled

In [75]:
df_resampled

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,diabetes
0,80.000000,0,1,25.190000,6.600000,140,1,0,0
1,54.000000,0,0,27.320000,6.600000,80,1,0,0
2,28.000000,0,0,27.320000,5.700000,158,0,1,0
3,36.000000,0,0,23.450000,5.000000,155,1,0,0
4,76.000000,1,1,20.140000,4.800000,155,0,1,0
...,...,...,...,...,...,...,...,...,...
182959,56.496686,0,0,45.705546,5.811754,200,1,0,1
182960,80.000000,0,0,29.273938,8.994570,160,0,1,1
182961,47.000000,0,0,27.736877,6.120674,130,0,0,1
182962,53.000000,0,0,25.707678,7.500000,240,1,0,1


Então, transformamos todos os valores em inteiros.

In [76]:
df_resampled = df_resampled.astype(int)


In [78]:
df_resampled

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,diabetes
0,80,0,1,25,6,140,1,0,0
1,54,0,0,27,6,80,1,0,0
2,28,0,0,27,5,158,0,1,0
3,36,0,0,23,5,155,1,0,0
4,76,1,1,20,4,155,0,1,0
...,...,...,...,...,...,...,...,...,...
182959,56,0,0,45,5,200,1,0,1
182960,80,0,0,29,8,160,0,1,1
182961,47,0,0,27,6,130,0,0,1
182962,53,0,0,25,7,240,1,0,1


In [90]:
X = df_resampled.drop('diabetes', axis=1)

# Create y with only the 'diabetes' column
y = df_resampled['diabetes']

In [91]:
X

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male
0,80,0,1,25,6,140,1,0
1,54,0,0,27,6,80,1,0
2,28,0,0,27,5,158,0,1
3,36,0,0,23,5,155,1,0
4,76,1,1,20,4,155,0,1
...,...,...,...,...,...,...,...,...
182959,56,0,0,45,5,200,1,0
182960,80,0,0,29,8,160,0,1
182961,47,0,0,27,6,130,0,0
182962,53,0,0,25,7,240,1,0


Finalmente, começamos a testar diferentes algoritmos de treinamento, comparando as métricas na situação de treino e teste.

In [85]:
# Import the necessary libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a logistic regression classifier
logistic_regression_classifier = LogisticRegression()

# Fit the classifier on the training data
logistic_regression_classifier.fit(X_train, y_train)

# Make predictions on the training data
y_train_pred = logistic_regression_classifier.predict(X_train)

# Make predictions on the test data
y_test_pred = logistic_regression_classifier.predict(X_test)

# Calculate accuracy, F1 score, and recall for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
f1_score_train = f1_score(y_train, y_train_pred)
recall_train = recall_score(y_train, y_train_pred)

# Calculate accuracy, F1 score, and recall for testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_score_test = f1_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)

# Print the results
print("Training Accuracy:", accuracy_train)
print("Training F1 Score:", f1_score_train)
print("Training Recall:", recall_train)
print()
print("Testing Accuracy:", accuracy_test)
print("Testing F1 Score:", f1_score_test)
print("Testing Recall:", recall_test)

scores = cross_val_score(logistic_regression_classifier, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())


Training Accuracy: 0.8923981448225244
Training F1 Score: 0.8913470256632634
Training Recall: 0.8841184934934935

Testing Accuracy: 0.8895245035525596
Testing F1 Score: 0.8890312191194231
Testing Recall: 0.8818340230886517
Cross-Validation Scores: [0.88134343 0.89416009 0.89500724 0.89325827 0.89180695]
Mean Accuracy: 0.891115195840474


In [86]:
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score

naive_bayes_classifier = GaussianNB()

# Fit the classifier on the training data
naive_bayes_classifier.fit(X_train, y_train)

# Make predictions on the training data
y_train_pred = naive_bayes_classifier.predict(X_train)

# Make predictions on the test data
y_test_pred = naive_bayes_classifier.predict(X_test)

# Calculate accuracy, F1 score, and recall for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
f1_score_train = f1_score(y_train, y_train_pred, average='weighted')
recall_train = recall_score(y_train, y_train_pred, average='weighted')

# Calculate accuracy, F1 score, and recall for testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_score_test = f1_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')

# Print the results
print("Training Accuracy:", accuracy_train)
print("Training F1 Score:", f1_score_train)
print("Training Recall:", recall_train)
print()
print("Testing Accuracy:", accuracy_test)
print("Testing F1 Score:", f1_score_test)
print("Testing Recall:", recall_test)

scores = cross_val_score(naive_bayes_classifier, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())

Training Accuracy: 0.8377266267938848
Training F1 Score: 0.8374772218794979
Training Recall: 0.8377266267938848

Testing Accuracy: 0.8342503188194571
Testing F1 Score: 0.834058511343236
Testing Recall: 0.8342503188194571
Cross-Validation Scores: [0.85603804 0.83392999 0.83425792 0.83474982 0.83119261]
Mean Accuracy: 0.8380336741595956


In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


# Create a Random Forest classifier with 100 trees (you can adjust this)
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the training data
random_forest_classifier.fit(X_train, y_train)

# Make predictions on the training data
y_train_pred = random_forest_classifier.predict(X_train)

# Make predictions on the test data
y_test_pred = random_forest_classifier.predict(X_test)

# Calculate accuracy, F1 score, and recall for training data
accuracy_train = accuracy_score(y_train, y_train_pred)
f1_score_train = f1_score(y_train, y_train_pred, average='weighted')
recall_train = recall_score(y_train, y_train_pred, average='weighted')

# Calculate accuracy, F1 score, and recall for testing data
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_score_test = f1_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')

# Print the results
print("Training Accuracy:", accuracy_train)
print("Training F1 Score:", f1_score_train)
print("Training Recall:", recall_train)
print()
print("Testing Accuracy:", accuracy_test)
print("Testing F1 Score:", f1_score_test)
print("Testing Recall:", recall_test)

scores = cross_val_score(random_forest_classifier, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())

Training Accuracy: 0.9754673079625842
Training F1 Score: 0.9754659246119887
Training Recall: 0.9754673079625842

Testing Accuracy: 0.944598287484059
Testing F1 Score: 0.9445871120514933
Testing Recall: 0.944598287484059
Cross-Validation Scores: [0.9351242  0.95105621 0.94977181 0.95201268 0.95182007]
Mean Accuracy: 0.947956996256497


In [88]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Create an ExtraTreesClassifier
extra_trees = ExtraTreesClassifier(random_state=42)

# Train the classifier on the training data
extra_trees.fit(X_train, y_train)

# Make predictions on the training and test data
y_train_pred = extra_trees.predict(X_train)
y_test_pred = extra_trees.predict(X_test)

# Calculate and print metrics for the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='weighted')
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1-score: {train_f1:.2f}")

# Calculate and print metrics for the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print("\nTesting Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1-score: {test_f1:.2f}")

scores = cross_val_score(extra_trees, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())


Training Metrics:
Accuracy: 0.98
Recall: 0.98
F1-score: 0.98

Testing Metrics:
Accuracy: 0.94
Recall: 0.94
F1-score: 0.94
Cross-Validation Scores: [0.93490558 0.95100156 0.9506463  0.95187604 0.95182007]
Mean Accuracy: 0.948049910201787


Por simplicidade e menor tendência a overfitting, escolhemos salvar o logistic regression model. Ele foi o meio termo entre o Naive Bayes e os modelos de árvore que tiveram um desempenho duvidoso, por terem alcançados métricas boas demais.

In [89]:
import pickle

# Save the trained logistic regression model to a pickle file
with open('logistic_regression_model.pkl', 'wb') as model_file:
    pickle.dump(logistic_regression_classifier, model_file)