### Importações

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import f1_score, accuracy_score, fbeta_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

import os, joblib

### Carregar Dataset

In [None]:
# Map each dataset label to its CSV file, then load every file into a DataFrame.
_dataset_files = {
    "Base": '../../Data/risco_cardiovascular_base.csv',
    "Features": '../../Data/risco_cardiovascular_features.csv',
}
datasets = {label: pd.read_csv(csv_path) for label, csv_path in _dataset_files.items()}

In [None]:
# Preview the first rows of the "Base" dataset.
datasets["Base"].head()

In [None]:
# Preview the first rows of the "Features" dataset.
datasets["Features"].head()


### Parâmetros

In [None]:
def _build_svc(kernel, **extra):
    """Create an SVC with the settings shared by every configuration below."""
    return SVC(
        kernel=kernel,
        probability=True,               # enables predict_proba (AUC / curves)
        decision_function_shape="ovr",
        random_state=42,
        **extra,
    )


def _pca_choices():
    """Return the PCA options searched by the grid: no PCA, or PCA with n_components=0.95."""
    return ["passthrough", PCA(n_components=0.95, random_state=42)]


# Candidate class-balancing strategies; "passthrough" disables resampling.
resampler = [
    SMOTE(random_state=42),
    RandomOverSampler(random_state=42),
    RandomUnderSampler(random_state=42),
    "passthrough",
]

# Grid values shared by both kernels (copied per grid so the lists stay independent).
_c_values = [0.1, 1, 10, 100]
_class_weight_values = [None, "balanced"]

# One entry per SVM variant: the base estimator plus the GridSearchCV parameter grid.
# Grid keys address pipeline steps named "resampler", "pca" and "classifier".
models_config = {
    "SVM_linear": {
        # class_weight="balanced" helps when no resampler is used; the grid below
        # still searches both None and "balanced".
        "model": _build_svc("linear", class_weight="balanced"),
        "params": {
            "resampler": resampler,
            "pca": _pca_choices(),
            "classifier__C": list(_c_values),
            "classifier__class_weight": list(_class_weight_values),
        },
    },
    "SVM_rbf": {
        "model": _build_svc("rbf"),
        "params": {
            "resampler": resampler,
            "pca": _pca_choices(),
            "classifier__C": list(_c_values),
            "classifier__gamma": ["scale", "auto", 0.01, 0.1, 1],
            "classifier__class_weight": list(_class_weight_values),
        },
    },
}

### Loop de Treinamento

In [None]:
# Directory where the fitted models are saved
output_path = "../../Models/SVM_Models"
model_save_path = output_path
os.makedirs(model_save_path, exist_ok=True)

# One summary row per (model, dataset) pair
results_list = []

# --- Settings that do not depend on the dataset (built once, outside the loop) ---

# Explicit category order for the ordinal encoding of "BMI Category"
bmi_order = ["Underweight", "Normal Weight", "Overweight", "Obese"]
bmi_col = ["BMI Category"]

# CV (10 stratified folds); the integer seed makes the splits reproducible
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Scorer: weighted F-beta with beta=2
f2_weighted = make_scorer(fbeta_score, beta=2, average="weighted")

for ds_name, df in datasets.items():
    print(f"\n>>> Processando Dataset: {ds_name}")

    # Multiclass target
    y = df["BP_Category"]
    X = df.drop(columns=["BP_Category"], errors="ignore")

    # Object columns other than "BMI Category" are one-hot encoded;
    # every non-object column is standardized.
    cat_cols = [c for c in X.select_dtypes(include=['object']).columns if c != "BMI Category"]
    num_cols = X.select_dtypes(exclude=['object']).columns.tolist()

    # sparse_threshold=0 forces a dense output. OneHotEncoder produces sparse
    # matrices by default and ColumnTransformer keeps the stacked result sparse
    # when its density is below the threshold (default 0.3). PCA with
    # n_components=0.95 cannot be fitted on sparse input, so without this the
    # PCA candidates of the grid could fail instead of being evaluated.
    preprocessor = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=[bmi_order]), bmi_col),
            ('numeric', StandardScaler(), num_cols),
            ('categories', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ],
        sparse_threshold=0,
    )

    # Train/test split (stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    for model_name, config in models_config.items():
        print(f"\n--- Iniciando GridSearch para {model_name} (SVM) ---")

        pipeline = ImbPipeline(steps=[
            ("preprocessor", preprocessor),
            ("resampler", SMOTE(random_state=42)),   # placeholder: replaced by the grid (which includes passthrough)
            ("pca", "passthrough"),                  # placeholder: replaced by the grid
            ("classifier", config["model"]),
        ])

        # Multi-metric search; the best candidate is chosen (and refit) by macro F1
        grid = GridSearchCV(
            pipeline,
            config["params"],
            cv=cv,
            scoring={"f1_macro": "f1_macro", "accuracy": "accuracy", "f2_weighted": f2_weighted},
            refit="f1_macro",
            n_jobs=-1,
            verbose=1
        )

        grid.fit(X_train, y_train)

        cv_res = pd.DataFrame(grid.cv_results_)
        best_idx = grid.best_index_

        # Validation (CV) metrics of the best candidate
        val_f1 = grid.best_score_
        val_acc = cv_res.loc[best_idx, "mean_test_accuracy"]
        val_f2 = cv_res.loc[best_idx, "mean_test_f2_weighted"]

        # Metrics on the held-out test split
        y_pred = grid.predict(X_test)
        test_f1 = f1_score(y_test, y_pred, average="macro")
        test_acc = accuracy_score(y_test, y_pred)
        test_f2 = fbeta_score(y_test, y_pred, beta=2, average="weighted")

        # Persist the refit best pipeline as "<model>_<dataset>.pkl"
        model_filename = f"{model_name}_{ds_name}.pkl"
        joblib.dump(grid.best_estimator_, os.path.join(model_save_path, model_filename))

        results_list.append({
            "Model": model_name,
            "Dataset": ds_name,
            "Best_Params": str(grid.best_params_),
            "Val_F1_Macro": val_f1,
            "Val_F2_Weighted": val_f2,
            "Val_Accuracy": val_acc,
            "Test_F1_Macro": test_f1,
            "Test_F2_Weighted": test_f2,
            "Test_Accuracy": test_acc,
        })

# Final results, best validation macro F1 first
results_df = pd.DataFrame(results_list).sort_values(by="Val_F1_Macro", ascending=False)
display(results_df)


### Resultados e Avaliação

In [None]:
# Do not truncate cell contents when displaying tables (Best_Params strings are long).
pd.set_option("display.max_colwidth", None)

# Metrics are written to a "Metrics" sub-folder of the model output directory.
metrics_save_path = os.path.join(output_path, "Metrics")
os.makedirs(metrics_save_path, exist_ok=True)

df_results = pd.DataFrame(results_list)

# Column order used when displaying each per-model table.
_report_columns = [
    "Dataset",
    "Val_F1_Macro", "Val_F2_Weighted", "Val_Accuracy",
    "Test_F1_Macro", "Test_F2_Weighted", "Test_Accuracy",
    "Best_Params",
]

for model_name in models_config:
    # Keep only the rows produced by the current model.
    is_current_model = df_results["Model"] == model_name
    df_model = df_results.loc[is_current_model].copy()

    # One CSV per model, containing every result column.
    filename = f"resultado_final_{model_name}.csv"
    csv_target = os.path.join(metrics_save_path, filename)
    df_model.to_csv(csv_target, index=False)

    print(f"\n--- Tabela Final: {model_name} ---")
    display(df_model[_report_columns])

print(f"\nProcesso concluído! Os arquivos foram salvos em: {metrics_save_path}")

In [None]:
# Reload the pipeline saved by the training loop for model "SVM_rbf" on dataset "Base"
# (the training loop names files "<model>_<dataset>.pkl").
# NOTE(review): the file name is hard-coded rather than derived from the results
# table — confirm this is the model that should be evaluated below.
best_model_path = os.path.join(model_save_path, "SVM_rbf_Base.pkl")
best_model = joblib.load(best_model_path)

print("Carregado:", best_model_path)
print(best_model)

In [None]:
# Rebuild the train/test split used by the training loop for the "Base" dataset
# (same test_size, random_state and stratification), so X_test / y_test are the
# rows that were held out when the saved model was fitted.
df_base = datasets["Base"].copy()
y = df_base["BP_Category"]

# Only the target is removed, exactly as in the training loop. The saved pipeline's
# ColumnTransformer was fitted on every remaining column and expects all of them
# at prediction time; dropping an extra column here (previously "HighRisk") would
# make best_model.predict fail whenever that column is present in the data.
X = df_base.drop(columns=["BP_Category"], errors="ignore")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out test split; y_pred is reused by the cells below.
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1, printed with 4 decimal places.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)


### Matriz de Confusão

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Fixed, sorted label order so matrix rows/columns and tick labels agree.
labels = sorted(y_test.unique())

cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# Create the axes explicitly and hand them to plot(): when no `ax` is given,
# ConfusionMatrixDisplay.plot() opens its own figure, so a preceding
# plt.figure(figsize=...) would only leave an empty extra figure and the
# requested size would not apply to the matrix.
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, values_format="d")
ax.set_title("Matriz de Confusão (contagens)")
plt.show()


### Métricas resumidas

In [None]:
from sklearn.metrics import accuracy_score, f1_score, fbeta_score

# Headline test-set metrics for the loaded model.
# NOTE: this rebinds `f2_weighted`, the name the training cell uses for its scorer.
f2_weighted = fbeta_score(y_test, y_pred, average="weighted", beta=2)
f1_macro = f1_score(y_test, y_pred, average="macro")
acc = accuracy_score(y_test, y_pred)

print(
    f"Accuracy     : {acc:.4f}",
    f"F1 Macro     : {f1_macro:.4f}",
    f"F2 Weighted  : {f2_weighted:.4f}",
    sep="\n",
)


### ROC-AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Class-membership probabilities for the test split; columns follow best_model.classes_.
y_proba = best_model.predict_proba(X_test)

# Pass the label order explicitly so each probability column is matched to its
# class, instead of relying on the labels found in y_test having the same order.
class_labels = best_model.classes_

# One-vs-rest AUC, averaged across classes (support-weighted and unweighted).
auc_ovr_weighted = roc_auc_score(
    y_test, y_proba, multi_class="ovr", average="weighted", labels=class_labels
)
auc_ovr_macro = roc_auc_score(
    y_test, y_proba, multi_class="ovr", average="macro", labels=class_labels
)

print(f"AUC-ROC OvR (weighted): {auc_ovr_weighted:.4f}")
print(f"AUC-ROC OvR (macro)   : {auc_ovr_macro:.4f}")
