### Configuração inicial:


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [19]:
# Single seed reused by the train/test split and by every seeded estimator below.
SEED_VALUE = 202407

# Load the dataset and discard the 'num' column so it is not used as a feature.
diabetes = pd.read_csv("data/10 - Diabetes - Dados.csv").drop(columns=['num'])

# Target is the 'diabetes' column; every remaining column is a feature.
y = diabetes['diabetes']
X = diabetes.drop(columns=['diabetes'])

# Hold out 20% of the rows for the final evaluation of each model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED_VALUE,
)

def space():
    """Print blank lines to visually separate consecutive result reports."""
    gap = "\n\n"
    print(gap)

### KNN:


In [12]:
# k-nearest-neighbours classifier with scikit-learn's default settings,
# trained on the training split (fit() returns the estimator itself).
knn_model = KNeighborsClassifier().fit(X_train, y_train)

# Score the fitted model on the held-out test split.
knn_predictions = knn_model.predict(X_test)

print("KNN confusion matrix:")
print(confusion_matrix(y_test, knn_predictions))
print(
    "KNN Accuracy Score:",
    accuracy_score(y_test, knn_predictions),
)

KNN confusion matrix:
[[77 24]
 [25 28]]
KNN Accuracy Score: 0.6818181818181818


### RNA:


In [13]:
## HOLD OUT
# Baseline MLP: scikit-learn defaults apart from a larger iteration budget,
# trained once on the training split and scored on the held-out test split.
mlp_model = MLPClassifier(max_iter=2000, random_state=SEED_VALUE)
mlp_model.fit(X_train, y_train)
mlp_predictions = mlp_model.predict(X_test)
print("MLP Hold-out Confusion Matrix:")
print(confusion_matrix(y_test, mlp_predictions))
print("MLP Hold-out Accuracy Score:", accuracy_score(y_test, mlp_predictions))

space()

## CV
# 10-fold grid search over the hidden-layer width and the regularisation
# strength (alpha).
# The alpha candidates are listed explicitly: the earlier
# np.arange(0.0001, 0.001, 0.01) used a step larger than its interval, so it
# produced the single value 0.0001 and alpha was never actually tuned.
param_grid = {
    'hidden_layer_sizes': [(i,) for i in range(1, 46, 10)],  # widths 1, 11, 21, 31, 41
    'alpha': [0.0001, 0.001, 0.01]
}
grid_search = GridSearchCV(MLPClassifier(max_iter=2000, random_state=SEED_VALUE), param_grid, cv=10, n_jobs=1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the whole training
# split (GridSearchCV's default refit=True); it is then scored on the test split.
mlp_cv_model = grid_search.best_estimator_
mlp_cv_predictions = mlp_cv_model.predict(X_test)
print("MLP Cross-Validation Confusion Matrix:")
print(confusion_matrix(y_test, mlp_cv_predictions))
print("MLP Cross-Validation Accuracy Score:", accuracy_score(y_test, mlp_cv_predictions))



MLP Hold-out Confusion Matrix:
[[81 20]
 [29 24]]
MLP Hold-out Accuracy Score: 0.6818181818181818



Best Parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (31,)}
MLP Cross-Validation Confusion Matrix:
[[75 26]
 [23 30]]
MLP Cross-Validation Accuracy Score: 0.6818181818181818


### SVM:

In [14]:
## HOLD OUT
# RBF-kernel SVM with default C and gamma, trained once on the training split.
# NOTE(review): the features are passed in unscaled; RBF SVMs are sensitive to
# feature scale, so a scaling step may be worth evaluating — confirm with author.
svm_model = SVC(kernel='rbf', random_state=SEED_VALUE).fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

print("SVM Hold-out Confusion Matrix:")
print(confusion_matrix(y_test, svm_predictions))
print(
    "SVM Hold-out Accuracy Score:",
    accuracy_score(y_test, svm_predictions),
)

space()

## CV
# 10-fold grid search over C (1 to 91 in steps of 10) and gamma (starting at
# 0.01 in steps of 0.05, upper bound 0.21 exclusive).
param_grid_svm = dict(
    C=np.arange(1, 101, 10),
    gamma=np.arange(0.01, 0.21, 0.05),
)
grid_search_svm = GridSearchCV(
    SVC(kernel='rbf', random_state=SEED_VALUE),
    param_grid_svm,
    cv=10,
    n_jobs=1,
)
grid_search_svm.fit(X_train, y_train)
print("Best Parameters:", grid_search_svm.best_params_)

# Score the best configuration found by the search on the held-out test split.
cv_model_svm = grid_search_svm.best_estimator_
cv_svm_predictions = cv_model_svm.predict(X_test)

print("SVM Cross-Validation Confusion Matrix:")
print(confusion_matrix(y_test, cv_svm_predictions))
print(
    "SVM Cross-Validation Accuracy Score:",
    accuracy_score(y_test, cv_svm_predictions),
)



SVM Hold-out Confusion Matrix:
[[86 15]
 [27 26]]
SVM Hold-out Accuracy Score: 0.7272727272727273



Best Parameters: {'C': 1, 'gamma': 0.01}
SVM Cross-Validation Confusion Matrix:
[[98  3]
 [47  6]]
SVM Cross-Validation Accuracy Score: 0.6753246753246753


### RF:

In [15]:
# HOLD OUT
# Random forest with default hyper-parameters, seeded for reproducibility.
rf_model = RandomForestClassifier(random_state=SEED_VALUE).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

print("RF Hold-out Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))
print(
    "RF Hold-out Accuracy Score:",
    accuracy_score(y_test, rf_predictions),
)

space()

# CV
# 10-fold grid search over the number of trees (100, 200, 300) and the number
# of features considered per split (2, 5, 8), using all available cores.
param_grid_rf = dict(
    n_estimators=np.arange(100, 301, 100),
    max_features=np.arange(2, 10, 3),
)
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=SEED_VALUE),
    param_grid_rf,
    cv=10,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)
print("Best RF Parameters:", grid_search_rf.best_params_)

# Score the best configuration found by the search on the held-out test split.
rf_cv_model = grid_search_rf.best_estimator_
rf_cv_predictions = rf_cv_model.predict(X_test)

print("RF Cross-Validation Confusion Matrix:")
print(confusion_matrix(y_test, rf_cv_predictions))
print(
    "RF Cross-Validation Accuracy Score:",
    accuracy_score(y_test, rf_cv_predictions),
)



RF Hold-out Confusion Matrix:
[[83 18]
 [19 34]]
RF Hold-out Accuracy Score: 0.7597402597402597



Best RF Parameters: {'max_features': 2, 'n_estimators': 300}
RF Cross-Validation Confusion Matrix:
[[88 13]
 [19 34]]
RF Cross-Validation Accuracy Score: 0.7922077922077922


### TESTE DE NOVOS CASOS COM O MELHOR MODELO: RF - CROSS VALIDATION

In [20]:
# Apply the grid-searched random forest to the new, previously unseen cases.
diabetes_new_case = pd.read_csv("data/10 - Diabetes - Novos_Dados.csv")

# Remove the label column before predicting so only features reach the model;
# errors='ignore' lets the cell also run on files that carry no 'diabetes' label.
diabetes_new_case_features = diabetes_new_case.drop(columns='diabetes', errors='ignore')
diabetes_new_case_predictions = rf_cv_model.predict(diabetes_new_case_features)

# Store the predictions under an explicit name (the previous column name was the
# misspelling 'diabates') so they sit beside, and are not confused with, the
# 'diabetes' column that comes from the file.
diabetes_new_case['diabetes_predicted'] = diabetes_new_case_predictions
print(diabetes_new_case)

   preg0nt  glucose  pressure  triceps  insulin  mass  pedigree  age diabetes  \
0        1      139        62       41      480  40.7       536   21      neg   
1        1       97        68       21        0  27.2      1095   22      neg   
2        0      109        88       30        0  32.5       855   38      pos   
3        9      171       110       24      240  45.4       721   54      pos   

  diabates  
0      neg  
1      neg  
2      pos  
3      pos  
