## Carregamento dos Dados e Preparação do Ambiente

### Instalação de Pacotes

In [1]:
! pip install pandas numpy matplotlib scikit-learn imbalanced-learn xgboost
! pip install mlflow --upgrade
! pip install optuna
! pip install shap

Collecting shap
  Downloading shap-0.49.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting numba>=0.54
  Downloading numba-0.62.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting slicer==0.0.8
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Collecting llvmlite<0.46,>=0.45.0dev0
  Downloading llvmlite-0.45.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (56.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: slicer, llvmlite, numba, shap
Successfully installed llvmlite-0.45.1 numba-0.62.1 shap-0.49.1 slicer-0.0.8


### Importação de Bibliotecas

In [2]:
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, f1_score,
    ConfusionMatrixDisplay
)

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import optuna
import shap

from utils.visualization import (
    plot_metrics_by_class, print_summary_metrics,
    print_summary_metrics_latex
)

### Definição de Constantes Auxiliares

In [3]:
random_state = 42

test_ratio = 0.15

train_ratio_holdout = 0.70
valid_ratio_holdout = 0.15

numeric_attrs = ["idade_paciente", "dias_sintomas_notificacao"]

### Obtenção e análise dos dados

In [4]:
df_sinan_processed = pd.read_csv("../data/3_gold/dataset-processed.csv")

X = df_sinan_processed.drop("severity", axis=1)
y = df_sinan_processed["severity"]

feature_names = X.columns.tolist()
target_names = ["low_risk", "alarm", "severe"]

y = y.map({name: idx for idx, name in enumerate(target_names)})

### Separação do Conjunto de Teste
Essa separação é realizada independentemente do algoritmo utilizado para particionar os dados entre treino e validação (holdout ou LOOCV).

In [5]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state, stratify=y)

In [6]:
# Fazendo a segunda divisão, para gerar o conjunto de treino e validação
final_test_size = valid_ratio_holdout / (train_ratio_holdout + test_ratio)
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=final_test_size, random_state=random_state, stratify=y_temp)

# Normalizando atributos numéricos
for col in numeric_attrs:
    mean = X_train[col].mean()
    std = X_train[col].std()
    X_train[col] = (X_train[col] - mean) / (std + 1e-8)
    X_valid[col] = (X_valid[col] - mean) / (std + 1e-8)
    X_test[col] = (X_test[col] - mean) / (std + 1e-8)

print(f"Conjuntos gerados com as seguintes proporções: ")
print(f"  Treinamento: {X_train.shape}")
print(f"  Validação: {X_valid.shape}")
print(f"  Teste: {X_test.shape}")

Conjuntos gerados com as seguintes proporções: 
  Treinamento: (501841, 42)
  Validação: (107538, 42)
  Teste: (107538, 42)


In [7]:
class_counts = Counter(y_train)

print(f"Distribuição original das classes no conjunto de treinamento: {class_counts}")

under_sampler = RandomUnderSampler(sampling_strategy={0: class_counts[1]}, random_state=random_state)
over_sampler = SMOTE(sampling_strategy={2: class_counts[1]}, random_state=random_state)

pipeline = Pipeline([
    ('under', under_sampler),
    ('over', over_sampler)
])

X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

print(f"Distribuição das classes após amostragem no conjunto de treinamento: {Counter(y_train_resampled)}")

Distribuição original das classes no conjunto de treinamento: Counter({0: 294594, 1: 187280, 2: 19967})
Distribuição das classes após amostragem no conjunto de treinamento: Counter({0: 187280, 1: 187280, 2: 187280})


## Hyperparameter Tuning com Optuna

### Definição da Função Objetivo

In [16]:

def suggest_rf_classifier(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 5, 30, step=5)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5)
    max_features = trial.suggest_categorical("max_features", [None, "sqrt", "log2"])
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced", "balanced_subsample"])
    
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=random_state,
        class_weight=class_weight,
        n_jobs=-1
    )
    return clf

def suggest_xgb_classifier(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 5000, step=100)
    max_depth = trial.suggest_int("max_depth", 2, 15)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, step=0.01)
    subsample = trial.suggest_float("subsample", 0.5, 1.0, step=0.1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0, step=0.1)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 50)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 50)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    gamma = trial.suggest_float("gamma", 0, 5, step=0.5)
    
    clf = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        random_state=random_state,
        use_label_encoder=False,
        eval_metric='mlogloss',
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_child_weight=min_child_weight,
        tree_model='gpu_hist',
        device='cuda:0',
        predictor='gpu_predictor',
        n_jobs=-1
    )
    return clf

def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["RandomForest", "XGBoost"])

    if classifier_name == "RandomForest":
        clf = suggest_rf_classifier(trial)
        clf.fit(X_train_resampled, y_train_resampled)
    else:
        clf = suggest_xgb_classifier(trial)
        clf.fit(X_train_resampled, y_train_resampled)

    y_pred = clf.predict(X_valid)
    f1 = f1_score(y_valid, y_pred, average='macro', zero_division=0)
    return f1

In [17]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
best_trial = study.best_trial

print("Best trial:")
print(f"  F1 Score: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2025-11-27 17:23:13,798] A new study created in memory with name: no-name-8f65b7c6-9922-4271-95a7-2ba71210453b
[I 2025-11-27 17:31:26,873] Trial 0 finished with value: 0.4909052318405324 and parameters: {'classifier': 'RandomForest', 'n_estimators': 700, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.4909052318405324.
Parameters: { "predictor", "tree_model", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-11-27 17:31:33,661] Trial 1 finished with value: 0.5352382603141815 and parameters: {'classifier': 'XGBoost', 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.23, 'subsample': 0.6, 'colsample_bytree': 0.30000000000000004, 'reg_alpha': 26, 'reg_lambda': 

KeyboardInterrupt: 

In [None]:
# Holdout set evaluation
if best_trial.params["classifier"] == "RandomForest":
    best_clf = suggest_rf_classifier(best_trial)
    best_clf.fit(X_train_resampled, y_train_resampled)
else:
    best_clf = suggest_xgb_classifier(best_trial)
    best_clf.fit(X_train_resampled, y_train_resampled, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)
    
y_pred_test = best_clf.predict(X_test)
print("Relatório de Classificação no Conjunto de Teste:")
print(classification_report(y_test, y_pred_test, target_names=target_names, zero_division=0))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_test, display_labels=target_names, cmap=plt.cm.Blues, normalize='true')

plt.savefig("../figures/optimized_clf_confusion_matrix.pdf")
plt.show()

In [None]:
fig, ax, report_dict = plot_metrics_by_class(y_test, y_pred_test, target_names=target_names)

plt.savefig("../figures/optimized_clf_metrics_by_class.pdf")
plt.show()

In [None]:
print_summary_metrics(report_dict)

In [None]:
print_summary_metrics_latex(report_dict)

## Interpretação dos Resultados com SHAP

In [None]:
# Using SHAP to explain the optimized model
explainer = shap.Explainer(best_clf, X_train_resampled)
shap_values = explainer(X_test)

shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
plt.savefig("../figures/optimized_clf_shap_summary_plot.pdf")
plt.show()