In [2]:
! pip install --upgrade -q pandas numpy scikit-learn imbalanced-learn optuna

In [2]:
from collections import Counter

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split, StratifiedKFold
)
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import optuna

In [3]:
# Seed passed to every randomized step in this notebook (hold-out split,
# fold shuffling, resamplers and the random forest).
RANDOM_STATE = 42

# Fraction of the dataset reserved as the final hold-out test split.
TEST_RATIO = 0.15

# NOTE(review): presumably the continuous columns of the dataset; this list is
# not referenced anywhere else in the visible notebook -- confirm it is needed.
NUMERIC_ATTRS = [
    "idade_paciente",
    "dias_sintomas_notificacao",
]

In [4]:
# Load the processed dataset and separate the feature matrix from the target.
df = pd.read_csv("../data/3_gold/dataset-processed.csv")

# Class labels in the order of their integer codes (index 0, 1, 2).
target_names = ["low_risk", "alarm", "severe"]

X = df.drop(columns="severity")
feature_names = X.columns.tolist()

# Encode each label as its position in `target_names`; any value of `severity`
# that is not listed there becomes NaN.
y = df["severity"].map({label: code for code, label in enumerate(target_names)})

In [5]:
# Count how many examples fall into each class and report them one per line.
class_counts = Counter(y)
for target_class in class_counts:
    count = class_counts[target_class]
    print(f"Número de exemplos na classe '{target_class}' ({target_names[target_class]}): {count}")

Número de exemplos na classe '0' (low_risk): 420850
Número de exemplos na classe '1' (alarm): 267544
Número de exemplos na classe '2' (severe): 28523


In [6]:
# Stratified hold-out split: the *_opt part feeds the hyper-parameter search,
# the *_test part is set aside.
X_opt, X_test, y_opt, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_RATIO,
    random_state=RANDOM_STATE,
)

# Underlying arrays of the optimization split, indexed positionally by the
# cross-validation folds inside `objective`.
X_cpu, y_cpu = X_opt.values, y_opt.values

In [7]:
def objective(trial):
    """Score one Optuna trial with 5-fold cross-validated macro-F1.

    Hyper-parameters for a ``RandomForestClassifier`` are suggested by
    ``trial``. For every stratified fold of ``X_cpu`` / ``y_cpu`` the training
    part is standardized, class 0 is randomly under-sampled and class 2 is
    over-sampled with SMOTE until both match the training count of class 1,
    and the forest is fit on the balanced result. The validation part is only
    standardized (with the statistics learned on the training part) before
    being predicted.

    Args:
        trial: Optuna trial used to suggest the hyper-parameters.

    Returns:
        Mean of the macro-averaged F1 scores over the 5 validation folds.
    """
    forest_kwargs = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=100),
        max_depth=trial.suggest_int("max_depth", 5, 50),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    def score_fold(fit_rows, eval_rows):
        """Fit on the rows in `fit_rows` and return macro-F1 on `eval_rows`."""
        fit_X, fit_y = X_cpu[fit_rows], y_cpu[fit_rows]

        # Class 1 sets the size that classes 0 and 2 are resampled to.
        balanced_size = Counter(fit_y)[1]

        # A single imblearn pipeline: the samplers act only while fitting, so
        # at prediction time the validation rows are just scaled and classified.
        workflow = Pipeline([
            ('scale', StandardScaler()),
            ('under', RandomUnderSampler(sampling_strategy={0: balanced_size}, random_state=RANDOM_STATE)),
            ('over', SMOTE(sampling_strategy={2: balanced_size}, random_state=RANDOM_STATE)),
            ('forest', RandomForestClassifier(**forest_kwargs)),
        ])
        workflow.fit(fit_X, fit_y)

        predicted = workflow.predict(X_cpu[eval_rows])
        return f1_score(y_cpu[eval_rows], predicted, average='macro')

    fold_scores = [
        score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(X_cpu, y_cpu)
    ]
    return np.mean(fold_scores)

In [None]:
# Seed the TPE sampler (Optuna's default for single-objective studies) so the
# sequence of suggested hyper-parameters is repeatable, like every other
# randomized step in this notebook. The wall-clock `timeout` can still change
# how many trials complete from one run to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)

# Run the trials one at a time: each RandomForestClassifier built inside
# `objective` already uses every core (n_jobs=-1), so running trials
# concurrently as well would oversubscribe the CPU, keep several resampled
# copies of the training data in memory at once, and make the seeded search
# order-dependent.
study.optimize(objective, n_trials=100, timeout=14400, show_progress_bar=True, n_jobs=1)
best_trial = study.best_trial

print("Best trial:")
print(f"  F1 Score: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2025-12-05 17:22:59,316] A new study created in memory with name: no-name-4915347d-27e0-411e-8721-958466ab0384


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
import os
import pickle

# Persist the finished study. The directory is resolved one level up, matching
# both the dataset path ("../data/...") and the cell that later reloads the
# study from "../results/random_forest/optuna_study.pkl"; the previous
# "results/random_forest" wrote the pickle where that loader does not look.
output_dir = "../results/random_forest"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

output_optuna_study_path = os.path.join(output_dir, "optuna_study.pkl")

with open(output_optuna_study_path, "wb") as f:
    pickle.dump(study, f)

In [1]:
! pip install plotly



In [3]:
! pip install nbformat



In [4]:
import pickle

import optuna.visualization as vis

study_path = '../results/random_forest/optuna_study.pkl'

# NOTE: unpickling executes code embedded in the file; only load a study
# pickle that was produced by a trusted run of this project.
with open(study_path, 'rb') as f:
    study = pickle.load(f)

# Render the hyper-parameter importances first, then the optimization history.
for build_figure in (vis.plot_param_importances, vis.plot_optimization_history):
    display(build_figure(study))