In [None]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from scipy.stats import norm
from variables import *
import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import numpy as np

# __Functions__

In [2]:
def excludeOutcomes(df, Outcomes):
    """Build the modelling table for one outcome column.

    Steps, in order:
      1. drop the columns listed under ``COLUMNS_TO_EXCLUDE_BY_OUTCOME[Outcomes]``;
      2. drop the bookkeeping columns ``onda``, ``dataadm`` and ``direto_cti``;
      3. discard rows whose outcome value is missing;
      4. cast the outcome column to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    Outcomes : str
        Name of the outcome (target) column, also used as the key into
        ``COLUMNS_TO_EXCLUDE_BY_OUTCOME``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns, with the outcome as int.
    """
    prepared = (
        df.drop(columns=COLUMNS_TO_EXCLUDE_BY_OUTCOME[Outcomes])
        .drop(columns=['onda', 'dataadm', 'direto_cti'])
        .dropna(subset=[Outcomes])
        .copy()
    )

    # The cast is only safe after the missing outcomes were removed above.
    prepared[Outcomes] = prepared[Outcomes].astype(int)

    return prepared

def IC_95(medidas):
    """Format a sample as ``mean(half-width)`` with four decimals.

    The half-width is the normal-approximation 95% margin:
    ``z_0.975 * s / sqrt(n)``, where ``s`` is the sample standard deviation
    (``ddof=1``) and ``n`` is ``len(medidas)``.

    Parameters
    ----------
    medidas : sequence of float
        The measurements (e.g. one score per fold).

    Returns
    -------
    str
        ``"<mean>(<half-width>)"``, both formatted with ``.4f``.
    """
    n_obs = len(medidas)
    std_error = np.std(medidas, ddof=1) / np.sqrt(n_obs)
    # 0.975 quantile of the standard normal: the two-sided 95% z-score.
    half_width = norm.ppf(0.975) * std_error
    center = np.mean(medidas)

    return f"{center:.4f}({half_width:.4f})"

def IC_95_percentage(medidas):
    """Format a sample of fractions as ``mean(half-width)`` in percent.

    Both the mean and the standard error (sample standard deviation with
    ``ddof=1`` divided by ``sqrt(n)``) are multiplied by 100; the half-width
    is the standard error times the two-sided 95% z-score.

    Parameters
    ----------
    medidas : sequence of float
        The measurements on a 0-1 scale (e.g. one score per fold).

    Returns
    -------
    str
        ``"<mean>(<half-width>)"``, both formatted with ``.1f``.
    """
    n_obs = len(medidas)
    std_error_pct = np.std(medidas, ddof=1) / np.sqrt(n_obs) * 100
    # 0.975 quantile of the standard normal: the two-sided 95% z-score.
    half_width_pct = norm.ppf(0.975) * std_error_pct
    center_pct = np.mean(medidas) * 100

    return f"{center_pct:.1f}({half_width_pct:.1f})"

def evaluateModel(y_true_folds, y_pred_folds, y_proba_folds, classes, modelo_nome):
    """Print cross-validation metrics as one tab-separated row.

    For every fold the macro F1, micro F1 and the per-class precision,
    recall and F1 (``zero_division=0``) are computed. Each metric is then
    summarised over the folds with ``IC_95_percentage`` and printed in this
    order: macro F1, micro F1, then precision / recall / F1 for each entry
    of ``classes``. The row ends with a newline.

    Parameters
    ----------
    y_true_folds : sequence
        True labels, one array-like per fold.
    y_pred_folds : sequence
        Predicted labels, one array-like per fold, aligned with
        ``y_true_folds``.
    y_proba_folds : sequence
        Not used by this function.
    classes : array-like
        Class labels; fixes the order of the per-class columns.
    modelo_nome : str
        Not used by this function.

    Returns
    -------
    None
    """
    macro_scores = []
    micro_scores = []
    per_class = {"precision": [], "recall": [], "f1": []}

    for fold_true, fold_pred in zip(y_true_folds, y_pred_folds):
        macro_scores.append(f1_score(fold_true, fold_pred, average="macro"))
        micro_scores.append(f1_score(fold_true, fold_pred, average="micro"))

        fold_prec, fold_rec, fold_f1, _support = precision_recall_fscore_support(
            fold_true, fold_pred, average=None, labels=classes, zero_division=0
        )
        per_class["precision"].append(fold_prec)
        per_class["recall"].append(fold_rec)
        per_class["f1"].append(fold_f1)

    # Rows are folds, columns follow the order of ``classes``.
    precision_matrix = np.array(per_class["precision"])
    recall_matrix = np.array(per_class["recall"])
    f1_matrix = np.array(per_class["f1"])

    print(f"{IC_95_percentage(macro_scores)}\t{IC_95_percentage(micro_scores)}",end="\t")

    for col, _label in enumerate(classes):
        print(f"{IC_95_percentage(precision_matrix[:, col])}\t{IC_95_percentage(recall_matrix[:, col])}\t{IC_95_percentage(f1_matrix[:, col])}",end='\t')

    print()

def trainModels(df, target_col, classificador):
    """Cross-validate a classifier and print its fold-averaged metrics.

    The frame is split into features (every column except ``target_col``)
    and labels, then evaluated with a 5-fold stratified split (shuffled,
    ``random_state=42``). A fresh model is fitted on each training part and
    its predictions and class probabilities for the held-out part are
    collected. The collected folds are handed to ``evaluateModel``, which
    prints the summary row.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column.
    target_col : str
        Name of the label column.
    classificador : str
        Model selector; only ``"LightGBM"`` is accepted.

    Raises
    ------
    ValueError
        If ``classificador`` is not ``"LightGBM"``.
    """
    features = df.drop(columns=[target_col])
    labels = df[target_col]
    classes = np.unique(labels)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    truths = []
    predictions = []
    probabilities = []

    for fit_rows, eval_rows in splitter.split(features, labels):
        fit_features = features.iloc[fit_rows]
        eval_features = features.iloc[eval_rows]
        fit_labels = labels.iloc[fit_rows]
        eval_labels = labels.iloc[eval_rows]

        if classificador != "LightGBM":
            raise ValueError("The classifier should be 'LightGBM'.")
        # A new estimator per fold so no state leaks between folds.
        modelo = lgb.LGBMClassifier(random_state=42, verbose=-1)

        modelo.fit(fit_features, fit_labels)

        truths.append(eval_labels)
        predictions.append(modelo.predict(eval_features))
        probabilities.append(modelo.predict_proba(eval_features))

    evaluateModel(truths, predictions, probabilities, classes, classificador)

def aurocMetric(df, target_col, classificador, onda, desfecho):
    """Print the cross-validated AUROC as ``mean(half-width)``.

    The frame is split into features (every column except ``target_col``)
    and labels, then evaluated with a 5-fold stratified split (shuffled,
    ``random_state=42``). A fresh model is fitted per fold, the held-out
    rows are scored with the second column of ``predict_proba``, and the
    area under the ROC curve is computed for that fold. The fold AUCs are
    summarised with ``IC_95`` and printed. No curve is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus the target column.
    target_col : str
        Name of the label column.
    classificador : str
        Model selector; only ``"LightGBM"`` is accepted.
    onda, desfecho
        Accepted for call compatibility; not used by this function.

    Raises
    ------
    ValueError
        If ``classificador`` is not ``"LightGBM"``.
    """
    # Validate once, before any splitting or fitting is attempted.
    if classificador != "LightGBM":
        raise ValueError("The classifier should be 'LightGBM'.")

    X = df.drop(columns=[target_col])
    y = df[target_col]

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    auc_list = []
    for train_idx, test_idx in skf.split(X, y):
        # A new estimator per fold so no state leaks between folds.
        modelo = lgb.LGBMClassifier(random_state=42, verbose=-1)
        modelo.fit(X.iloc[train_idx], y.iloc[train_idx])

        # Column 1 is taken as the positive-class score.
        # NOTE(review): assumes a binary target whose second class is the
        # positive one - confirm for any new outcome.
        y_proba = modelo.predict_proba(X.iloc[test_idx])[:, 1]

        fpr, tpr, _ = roc_curve(y.iloc[test_idx], y_proba)
        auc_list.append(auc(fpr, tpr))

    print(f"{IC_95(auc_list)}")


# __Outcome prediction__

In [3]:
# Run configuration shared by the evaluation cells below.
# Wave groups to evaluate; the evaluation cells map 'Onda 2 e 3' to the rows
# whose 'onda' column is 'Onda 2' or 'Onda 3'.
waves = ['Onda 2 e 3']
# Target column names; each one is passed to excludeOutcomes and used as the label.
outcomes = ['intercorrencia_3_5_6_13_16']
# Model selector passed to trainModels / aurocMetric, which only accept "LightGBM".
classifier = "LightGBM"

## __with sociodemographic variables__

In [4]:
# Load the processed registry table (the "sociodemographic" variant, per its
# file name) and report its dimensions.
base_covid = pd.read_parquet("datasets/banco_completo_REGISTRO_COVID_28_08_processado_cardiopatia_sociodemographic.parquet")

# shape is (rows, columns): one row per patient, one column per variable.
nPacientes, nVariaveis = base_covid.shape
print(f"Number of patients: {nPacientes}\nNumber of variables: {nVariaveis}")

Number of patients: 16957
Number of variables: 68


In [5]:

# Header of the tab-separated results table; each trainModels call below
# completes one row.
print("Model\t#pacientes\t#variaveis\tMacroF1\tMicroF1\tprecision_0\trecall_0\tF1_0\tprecision_1\trecall_1\tF1_1")
for wave in waves:
    # The wave filter does not depend on the outcome, so build it once per wave.
    if wave in ['Onda 2 e 3']:
        df_wave = base_covid[base_covid['onda'].isin(['Onda 2', 'Onda 3'])].copy()
    else:
        # Fail loudly: otherwise df_wave would be undefined or silently reused
        # from a previous iteration / cell.
        raise ValueError(f"Unsupported wave: {wave!r}")

    for outcome in outcomes:
        df_wave_outcome = excludeOutcomes(df_wave, outcome)
        # Row prefix: label, number of patients, number of columns (target included).
        print(f"sociodemographic\t{df_wave_outcome.shape[0]}\t{df_wave_outcome.shape[1]}", end='\t')

        trainModels(df_wave_outcome, outcome, classifier)


Model	#pacientes	#variaveis	MacroF1	MicroF1	precision_0	recall_0	F1_0	precision_1	recall_1	F1_1
sociodemographic	10700	60	51.2(1.4)	94.6(0.1)	94.8(0.1)	99.8(0.1)	97.2(0.1)	40.2(15.4)	2.8(1.5)	5.2(2.7)	


In [6]:

# Header of the tab-separated AUROC table; each aurocMetric call below
# completes one row.
print("Model\t#pacientes\t#variaveis\tAUROC")
for wave in waves:
    # The wave filter does not depend on the outcome, so build it once per wave.
    if wave in ['Onda 2 e 3']:
        df_wave = base_covid[base_covid['onda'].isin(['Onda 2', 'Onda 3'])].copy()
    else:
        # Fail loudly: otherwise df_wave would be undefined or silently reused
        # from a previous iteration / cell.
        raise ValueError(f"Unsupported wave: {wave!r}")

    for outcome in outcomes:
        df_wave_outcome = excludeOutcomes(df_wave, outcome)
        # Row prefix: label, number of patients, number of columns (target included).
        print(f"sociodemographic\t{df_wave_outcome.shape[0]}\t{df_wave_outcome.shape[1]}", end='\t')

        aurocMetric(df_wave_outcome, outcome, classifier, wave, outcome)

Model	#pacientes	#variaveis	AUROC
sociodemographic	10700	60	0.7601(0.0266)


## __without sociodemographic variables__

In [7]:
# Load the processed registry table (the "no_sociodemographic" variant, per
# its file name) and report its dimensions.
# NOTE: this rebinds base_covid, so every cell executed after this one reads
# this table rather than the one loaded earlier in the notebook.
base_covid = pd.read_parquet("datasets/banco_completo_REGISTRO_COVID_28_08_processado_cardiopatia_no_sociodemographic.parquet")

# shape is (rows, columns): one row per patient, one column per variable.
nPacientes, nVariaveis = base_covid.shape
print(f"Number of patients: {nPacientes}\nNumber of variables: {nVariaveis}")

Number of patients: 16957
Number of variables: 61


In [8]:

# Header of the tab-separated results table; each trainModels call below
# completes one row.
print("Model\t#pacientes\t#variaveis\tMacroF1\tMicroF1\tprecision_0\trecall_0\tF1_0\tprecision_1\trecall_1\tF1_1")
for wave in waves:
    # The wave filter does not depend on the outcome, so build it once per wave.
    if wave in ['Onda 2 e 3']:
        df_wave = base_covid[base_covid['onda'].isin(['Onda 2', 'Onda 3'])].copy()
    else:
        # Fail loudly: otherwise df_wave would be undefined or silently reused
        # from a previous iteration / cell.
        raise ValueError(f"Unsupported wave: {wave!r}")

    for outcome in outcomes:
        df_wave_outcome = excludeOutcomes(df_wave, outcome)
        # Row prefix: label, number of patients, number of columns (target included).
        print(f"No sociodemographic\t{df_wave_outcome.shape[0]}\t{df_wave_outcome.shape[1]}", end='\t')

        trainModels(df_wave_outcome, outcome, classifier)


Model	#pacientes	#variaveis	MacroF1	MicroF1	precision_0	recall_0	F1_0	precision_1	recall_1	F1_1
No sociodemographic	10700	53	50.7(1.5)	94.5(0.1)	94.8(0.1)	99.7(0.1)	97.2(0.1)	29.5(18.6)	2.3(1.6)	4.2(2.9)	


In [9]:

# Header of the tab-separated AUROC table; each aurocMetric call below
# completes one row.
print("Model\t#pacientes\t#variaveis\tAUROC")
for wave in waves:
    # The wave filter does not depend on the outcome, so build it once per wave.
    if wave in ['Onda 2 e 3']:
        df_wave = base_covid[base_covid['onda'].isin(['Onda 2', 'Onda 3'])].copy()
    else:
        # Fail loudly: otherwise df_wave would be undefined or silently reused
        # from a previous iteration / cell.
        raise ValueError(f"Unsupported wave: {wave!r}")

    for outcome in outcomes:
        df_wave_outcome = excludeOutcomes(df_wave, outcome)
        # Row prefix: label, number of patients, number of columns (target included).
        print(f"No sociodemographic\t{df_wave_outcome.shape[0]}\t{df_wave_outcome.shape[1]}", end='\t')

        aurocMetric(df_wave_outcome, outcome, classifier, wave, outcome)

Model	#pacientes	#variaveis	AUROC
No sociodemographic	10700	53	0.7527(0.0270)
