In [10]:
import joblib
import pandas as pd
import seaborn as sns

from sklearn.metrics import (accuracy_score,
                             classification_report,
                             precision_score)

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from src.config import DADOS_TRATADOS
from src.models import (RANDOM_STATE,
                        construir_pipeline_modelo_classificacao,
                        treinar_e_validar_modelo_classificacao,
                        grid_search_cv_classificador,
                        organiza_resultados)
  
sns.set_theme(palette='bright')

In [2]:
# Carregamento da base de dados
df = pd.read_csv(DADOS_TRATADOS)
df.head(2)

Unnamed: 0,area_mean,area_se,area_worst,compactness_mean,compactness_se,compactness_worst,concave points_mean,concave points_se,concave points_worst,concavity_mean,...,radius_mean,radius_se,radius_worst,smoothness_mean,smoothness_worst,symmetry_mean,symmetry_worst,texture_mean,texture_worst,target
0,1001.0,153.4,2019.0,0.2776,0.04904,0.6656,0.1471,0.01587,0.2654,0.3001,...,17.99,1.095,25.38,0.1184,0.1622,0.2419,0.4601,10.38,17.33,M
1,1326.0,74.08,1956.0,0.07864,0.01308,0.1866,0.07017,0.0134,0.186,0.0869,...,20.57,0.5435,24.99,0.08474,0.1238,0.1812,0.275,17.77,23.41,M


In [3]:
le = LabelEncoder()

# Divisão entre features e target
X = df.drop(columns='target')
y = le.fit_transform(df['target'])

# Divisão entre conjunto de treino e de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Colunas numéricas
numerical_cols = X.columns.to_list()

In [4]:
# Pipeline para tratamento de colunas numéricas
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean') ),
    ('scaler', MinMaxScaler(feature_range=(-1, 1)))
])

# Transformador de colunas
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols)
])

# Validação cruzada
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [7]:
# Dicionário com pipelines
pipes = {
    'RandomForest': Pipeline([
        ('prep', preprocessor),
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
    ]),
    'NaiveBayes': Pipeline([
        ('prep', preprocessor),
        ('clf', GaussianNB())
    ]),
    'LogisticRegression': Pipeline([
        ('prep', preprocessor),
        ('clf', LogisticRegression(solver='liblinear', random_state=RANDOM_STATE))
    ]),
    'KNN': Pipeline([
        ('prep', preprocessor),
        ('clf', KNeighborsClassifier())
    ])
}

# Dicionário com grids de hiperparâmetros
param_grids = {
    'RandomForest': {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 10, 20, 30]
    },
    'NaiveBayes': {
        'clf__var_smoothing': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]
    },
    'LogisticRegression': {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.01, 0.1, 1, 10],
        'clf__solver': ['liblinear']
    },
    'KNN': {
        'clf__n_neighbors': [3, 5, 7, 9],
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean', 'manhattan']
    }
}

# Loop para busca em grade
best_pipes = {}

for nome_modelo, pipe in pipes.items():
    grid_search = GridSearchCV(pipe, param_grids[nome_modelo], cv=kf, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)
    
    best_pipes[nome_modelo] = grid_search.best_estimator_
    print(f"Melhores parâmetros para {nome_modelo}: {grid_search.best_params_}\n")

Melhores parâmetros para RandomForest: {'clf__max_depth': None, 'clf__n_estimators': 100}

Melhores parâmetros para NaiveBayes: {'clf__var_smoothing': 1e-11}

Melhores parâmetros para LogisticRegression: {'clf__C': 10, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}

Melhores parâmetros para KNN: {'clf__metric': 'manhattan', 'clf__n_neighbors': 3, 'clf__weights': 'uniform'}



In [8]:
# Visualização das métricas de desempenho dos modelos
metricas = {
    'Acurácia': 'accuracy',
    'F1-Ponderado': 'f1_weighted',
    'Precisão-Ponderada': 'precision_weighted',
    'Recall-Ponderado': 'recall_weighted',
    'Acurácia-Balanceada': 'balanced_accuracy'
}

df_scores = {}

for nome_metrica, scoring in metricas.items():
    for modelo, pipe in best_pipes.items():
        score = cross_val_score(pipe, X, y, cv=kf, scoring=scoring)
        df_scores.setdefault(nome_metrica, {})[modelo] = format(score.mean(), ".3f")

df_scores = pd.DataFrame(df_scores)
display(df_scores)

Unnamed: 0,Acurácia,F1-Ponderado,Precisão-Ponderada,Recall-Ponderado,Acurácia-Balanceada
RandomForest,0.961,0.961,0.963,0.961,0.956
NaiveBayes,0.93,0.929,0.932,0.93,0.923
LogisticRegression,0.967,0.967,0.967,0.967,0.962
KNN,0.972,0.972,0.973,0.972,0.966


In [11]:
# Salvando o melhor pipeline do KNN
modelo_knn = best_pipes['KNN']
joblib.dump(modelo_knn, 'modelo_knn.pkl')

['modelo_knn.pkl']