In [1]:
%matplotlib inline

### Get performance and fairness scores of all trained models

In [None]:
# get the columns names - scores list
import numpy as np

# Only the metric *names* are needed here, so a single fold's artifact is read.
# NOTE(review): presumably every model/fold artifact shares these keys - confirm.
temp = "../../src/data/artifacts/acs_income/LogisticRegression/fold_0/LogisticRegression_scores.npy"

# allow_pickle=True unpickles arbitrary objects; only load artifacts you trust.
# .item() unwraps the stored object so it can be indexed by key below.
scores = np.load(temp, allow_pickle=True).item()

scores_names, generic_metrics_names = (
    [*scores[section]] for section in ("scores", "generic_metrics")
)
generic_metrics_names

In [None]:
import os

import numpy as np
import pandas as pd

artifacts_dir = "../../src/data/artifacts/acs_income"

# Collect one plain dict per scores file and build each DataFrame once at the
# end: appending with `df.loc[len(df)] = ...` copies the frame on every row and
# leaves every column as object dtype.
scores_records, generic_metrics_records = [], []

# sorted() makes the row order reproducible (os.listdir order is arbitrary).
for model in sorted(os.listdir(artifacts_dir)):
    model_dir = os.path.join(artifacts_dir, model)
    if not os.path.isdir(model_dir):
        # Stray files next to the model folders would make os.listdir raise.
        continue

    for kfolds in sorted(os.listdir(model_dir)):
        kfold_path = os.path.join(model_dir, kfolds)
        if not os.path.isdir(kfold_path):
            continue

        for files in sorted(os.listdir(kfold_path)):
            file_path = os.path.join(kfold_path, files)
            if not file_path.endswith("_scores.npy"):
                continue

            # allow_pickle=True unpickles arbitrary objects; only load trusted artifacts.
            res = np.load(file_path, allow_pickle=True).item()
            run_id = {"model_name": res["model_name"], "kfold": res["kfold"]}

            scores_records.append({**run_id, **res["scores"]})
            generic_metrics_records.append({**run_id, **res["generic_metrics"]})

# `columns=` keeps the same column set and order as before, even when no file is found.
df_scores = pd.DataFrame(scores_records, columns=["model_name", "kfold", *scores_names])
df_generic_metrics = pd.DataFrame(
    generic_metrics_records, columns=["model_name", "kfold", *generic_metrics_names]
)

df_scores.head()

In [None]:
# Preview the generic-metrics table (one row per loaded "*_scores.npy" file).
df_generic_metrics.head()

In [None]:
# List the columns of the scores table; the metric columns are selected by name in the cells below.
df_scores.columns

### Get performance metrics values of all models

In [6]:
def get_confidence_interval(scores):
    """Return the 95% Student-t confidence interval for the mean of ``scores``.

    :param scores: 1-D array-like of numeric values (pandas Series, NumPy array or list)
    :return: ``(lower, upper)`` bounds of the interval around the sample mean.
        ``(nan, nan)`` when fewer than two values are given (the standard error is
        undefined), and ``(mean, mean)`` when all values are identical.
    """
    from scipy import stats

    values = np.asarray(scores, dtype=float)
    n = values.size
    if n < 2:
        # A single observation has no sample variance, hence no interval.
        return (np.nan, np.nan)

    mean = values.mean()
    sem = stats.sem(values)
    if sem == 0:
        # stats.t.interval returns (nan, nan) for scale=0; with zero spread the
        # interval collapses onto the mean instead.
        return (mean, mean)

    return stats.t.interval(0.95, n - 1, loc=mean, scale=sem)

### From Scores Dataframe

In [None]:
# Select the metric to summarise
metric = "BAL_ACC"

# Group by a scalar key rather than a one-element list: with a list key,
# get_group("<name>") warns on recent pandas and is slated to require a tuple.
# `scores` stays a DataFrameGroupBy because the p-value cell below indexes it by column.
scores = df_scores.groupby("model_name")[[metric]]
means, stds, upper_ci, lower_ci, models = [], [], [], [], []

for model in scores.groups.keys():
    accs = scores.get_group(model)[metric]
    mean_acc = accs.mean()
    ci = get_confidence_interval(accs)

    models.append(model)
    means.append(mean_acc)
    stds.append(accs.std(ddof=1))

    # Distance from the mean to each bound of the confidence interval
    lower_ci.append(np.abs(mean_acc - ci[0]))
    upper_ci.append(np.abs(mean_acc - ci[1]))

# Create a dataframe with the per-model results
df = pd.DataFrame(
    {
        "model": models,
        "metric": metric,
        "mean": means,
        "std": stds,
        "lower_ci": lower_ci,
        "upper_ci": upper_ci,
    }
)

df

## Mann-Whitney U p-values for BAL_ACC

In [None]:
from itertools import combinations

from scipy.stats import mannwhitneyu

# Mann-Whitney U test (pairwise comparison between two models).
# The grouping is rebuilt here so the cell does not depend on whichever metric
# the previous cell last stored in `scores`.
scores = df_scores.groupby("model_name")[["BAL_ACC"]]
acc_0 = scores.get_group("DecisionTreeClassifier")["BAL_ACC"].tolist()
acc_1 = scores.get_group("LogisticRegression")["BAL_ACC"].tolist()
acc_2 = scores.get_group("RandomForestClassifier")["BAL_ACC"].tolist()
acc_3 = scores.get_group("XGBClassifier")["BAL_ACC"].tolist()
acc_4 = scores.get_group("MLPClassifier")["BAL_ACC"].tolist()

# combinations() yields (0,1), (0,2), (0,3), (0,4), (1,2), ... (3,4), i.e. the
# same order as the U1..U10 labels.
acc_samples = [acc_0, acc_1, acc_2, acc_3, acc_4]
for pair_no, (first, second) in enumerate(combinations(acc_samples, 2), start=1):
    u_stat, p = mannwhitneyu(first, second, method="exact")
    print(f"U{pair_no}: {u_stat}, p-value: {p}")

In [None]:
# Compare XGBClassifier (acc_3) against each other model on BAL_ACC.
# The U statistic is bound to a regular name instead of `_`, which IPython uses
# for the last cell output.
for rival_name, rival_acc in (
    ("DecisionTreeClassifier", acc_0),
    ("LogisticRegression", acc_1),
    ("RandomForestClassifier", acc_2),
):
    u_stat, p = mannwhitneyu(acc_3, rival_acc, method="exact")
    print(f"XGBClassifier x {rival_name} p-value: {p.round(5)}")

# NOTE(review): this p-value is printed unrounded, unlike the three above -
# presumably on purpose; confirm.
u_stat, p = mannwhitneyu(acc_3, acc_4, method="exact")
print(f"XGBClassifier x MLPClassifier p-value: {p}")

## Mann-Whitney U p-values for PPV

In [None]:
from itertools import combinations

from scipy.stats import mannwhitneyu

# Mann-Whitney U test (pairwise comparison between two models).
# Scalar groupby key: a one-element list key makes get_group("<name>") warn on
# recent pandas.
scores = df_scores.groupby("model_name")[["PPV"]]
ppv_0 = scores.get_group("DecisionTreeClassifier")["PPV"].tolist()
ppv_1 = scores.get_group("LogisticRegression")["PPV"].tolist()
ppv_2 = scores.get_group("RandomForestClassifier")["PPV"].tolist()
ppv_3 = scores.get_group("XGBClassifier")["PPV"].tolist()
ppv_4 = scores.get_group("MLPClassifier")["PPV"].tolist()

# combinations() yields (0,1), (0,2), (0,3), (0,4), (1,2), ... (3,4), i.e. the
# same order as the U1..U10 labels.
ppv_samples = [ppv_0, ppv_1, ppv_2, ppv_3, ppv_4]
for pair_no, (first, second) in enumerate(combinations(ppv_samples, 2), start=1):
    u_stat, p = mannwhitneyu(first, second, method="exact")
    print(f"U{pair_no}: {u_stat}, p-value: {p}")

In [None]:
# Scalar groupby key: a one-element list key makes get_group("<name>") warn on
# recent pandas.
scores = df_scores.groupby("model_name")[["PPV"]]
ppv_0 = scores.get_group("DecisionTreeClassifier")["PPV"].tolist()
ppv_1 = scores.get_group("LogisticRegression")["PPV"].tolist()
ppv_2 = scores.get_group("RandomForestClassifier")["PPV"].tolist()
ppv_3 = scores.get_group("XGBClassifier")["PPV"].tolist()
ppv_4 = scores.get_group("MLPClassifier")["PPV"].tolist()

# Compare XGBClassifier (ppv_3) against each other model on PPV.
# The U statistic is bound to a regular name instead of `_`, which IPython uses
# for the last cell output.
for rival_name, rival_ppv in (
    ("DecisionTreeClassifier", ppv_0),
    ("LogisticRegression", ppv_1),
    ("RandomForestClassifier", ppv_2),
):
    u_stat, p = mannwhitneyu(ppv_3, rival_ppv, method="exact")
    print(f"XGBClassifier x {rival_name} p-value: {p.round(5)}")

# NOTE(review): this p-value is printed unrounded, unlike the three above -
# presumably on purpose; confirm.
u_stat, p = mannwhitneyu(ppv_3, ppv_4, method="exact")
print(f"XGBClassifier x MLPClassifier p-value: {p}")

## Mann-Whitney U p-values for TPR

In [None]:
# Scalar groupby key: a one-element list key makes get_group("<name>") warn on
# recent pandas.
scores = df_scores.groupby("model_name")[["TPR"]]
TPR_0 = scores.get_group("DecisionTreeClassifier")["TPR"].tolist()
TPR_1 = scores.get_group("LogisticRegression")["TPR"].tolist()
TPR_2 = scores.get_group("RandomForestClassifier")["TPR"].tolist()
TPR_3 = scores.get_group("XGBClassifier")["TPR"].tolist()
TPR_4 = scores.get_group("MLPClassifier")["TPR"].tolist()

# Compare XGBClassifier (TPR_3) against each other model on TPR.
# The U statistic is bound to a regular name instead of `_`, which IPython uses
# for the last cell output.
for rival_name, rival_tpr in (
    ("DecisionTreeClassifier", TPR_0),
    ("LogisticRegression", TPR_1),
    ("RandomForestClassifier", TPR_2),
):
    u_stat, p = mannwhitneyu(TPR_3, rival_tpr, method="exact")
    print(f"XGBClassifier x {rival_name} p-value: {p.round(5)}")

# NOTE(review): this p-value is printed unrounded, unlike the three above -
# presumably on purpose; confirm.
u_stat, p = mannwhitneyu(TPR_3, TPR_4, method="exact")
print(f"XGBClassifier x MLPClassifier p-value: {p}")

In [13]:
# Select the metrics to tabulate and plot
metrics = ["BAL_ACC", "PPV", "TPR"]
latex_acc, latex_ppv, latex_tpr, latex_f1 = [], [], [], []
plot_acc, plot_ppv, plot_tpr, plot_f1 = [], [], [], []
plot_acc_ci, plot_ppv_ci, plot_tpr_ci, plot_f1_ci = [], [], [], []

# Where each metric's results go: (LaTeX cells, rounded means, rounded CI half-widths).
# The *_f1 lists are only initialised in this cell; none of the metrics above fills them.
outputs = {
    "BAL_ACC": (latex_acc, plot_acc, plot_acc_ci),
    "PPV": (latex_ppv, plot_ppv, plot_ppv_ci),
    "TPR": (latex_tpr, plot_tpr, plot_tpr_ci),
}

for metric in metrics:
    # Scalar groupby key: a one-element list key makes get_group("<name>") warn
    # on recent pandas.
    scores = df_scores.groupby("model_name")[[metric]]
    latex_cells, plot_means, plot_cis = outputs[metric]

    for model in scores.groups.keys():
        accs = scores.get_group(model)[metric]
        mean_acc = accs.mean()
        ci = get_confidence_interval(accs)
        # Distance from the mean to the lower bound, reported as the "+/-" value
        half_width = np.abs(mean_acc - ci[0])

        # "\\pm" produces the same text as the former "\pm" without relying on
        # an invalid escape sequence.
        latex_cells.append(f"{mean_acc:.3f} $\\pm$ {half_width:.3f}")
        plot_means.append(mean_acc.round(3))
        plot_cis.append(half_width.round(3))

### From Generic Metrics Dataframe

In [None]:
# List the columns of the generic-metrics table; the cell below selects "F1_MACRO" from it.
df_generic_metrics.columns

In [None]:
# Select the metric to see the values
metric = "F1_MACRO"

# Scalar groupby key: a one-element list key makes get_group("<name>") warn on
# recent pandas.
generic_performance_scores = df_generic_metrics.groupby("model_name")[[metric]]
means, lower_ci, models = [], [], []

# Reset every F1 output list in this cell (latex_f1 included) so that running
# the cell again does not append a second copy of the rows.
latex_f1, plot_f1, plot_f1_ci = [], [], []

for model in generic_performance_scores.groups.keys():
    accs = generic_performance_scores.get_group(model)[metric]
    mean_acc = accs.mean()
    ci = get_confidence_interval(accs)

    models.append(model)
    means.append(mean_acc)
    # Distance from the mean to the lower bound, reported as the "+/-" value
    lower_ci.append(np.abs(mean_acc - ci[0]))


for a, b in zip(means, lower_ci):
    # "\\pm" produces the same text as the former "\pm" without relying on an
    # invalid escape sequence.
    latex_f1.append(f"{a:.3f} $\\pm$ {b:.3f}")
    plot_f1.append(a.round(3))
    plot_f1_ci.append(b.round(3))

df = pd.DataFrame({"model": models, "metric": metric, "mean": means, "pm_ci": lower_ci})
df

In [None]:
# One column of pre-formatted "mean $\pm$ ci" strings per metric.
latex_columns = ("BAL_ACC", "PPV", "TPR", "F1_MACRO")
latex_cells = (latex_acc, latex_ppv, latex_tpr, latex_f1)
data = dict(zip(latex_columns, latex_cells))

# Row labels are fixed by hand.
# NOTE(review): assumes the lists hold exactly these five models in this order - confirm.
model_abbreviations = ["DT", "LG", "NN", "RF", "XGB"]
df_latex = pd.DataFrame(data, index=model_abbreviations)

# escape=False keeps the "$\pm$" markup intact in the generated table
latex_table = df_latex.to_latex(escape=False)
print(latex_table)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl

# Global figure styling for the plots in this notebook
mpl.rcParams["figure.dpi"] = 100
for rc_key, rc_value in (
    ("axes.labelsize", 8),
    ("lines.markersize", 4),
    ("ytick.labelsize", 8),
):
    plt.rcParams[rc_key] = rc_value

# Colormap that plot_model_metrics indexes by integer to colour each metric's bars
colors = plt.get_cmap("Set1")


def plot_model_metrics(
    models,
    models_means,
    models_erros,
    title="Training performance metrics over 10 cross-validation folds",
    output_path="../assets/training_performance.png",
):
    """
    Create a grouped bar chart of model metrics with error bars, save it and show it.

    :param models: List of model names (one group of bars per model)
    :param models_means: Dictionary with metrics as keys and lists of values for each model as values
    :param models_erros: Dictionary with the same keys as ``models_means``; each value is the list of
        error-bar sizes (passed to ``yerr``) for that metric
    :param title: Title drawn above the chart
    :param output_path: File the figure is saved to; a missing parent directory is created
    """
    import os

    fig, ax = plt.subplots(figsize=(8, 4))

    # Number of metrics and models
    num_metrics = len(models_means)
    num_models = len(models)

    # Width of each bar and left-most bar position of every model group
    bar_width = 0.15
    group_starts = np.arange(num_models)

    # One bar series per metric, shifted by one bar width per metric
    for i, (metric, values) in enumerate(models_means.items()):
        ax.bar(
            group_starts + bar_width * i,
            values,
            width=bar_width,
            label=metric,
            color=colors(i + 1),
            yerr=models_erros[metric],
            alpha=0.9,
        )

    ax.set_xlabel("Models")
    ax.set_ylabel("Scores")
    ax.set_title(title, fontsize=10)

    # Centre each tick under its group of bars
    ax.set_xticks(group_starts + bar_width * (num_metrics - 1) / 2)
    ax.set_xticklabels(models)

    # Horizontal grid lines only
    ax.grid(axis="y", linestyle="--", alpha=0.6)
    ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.2), ncol=4, fontsize=9)

    fig.tight_layout()

    # savefig does not create directories, so make sure the target folder exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.show()


# Display names, listed in the same order as the values in the plot_* lists.
# NOTE(review): assumes those lists are in alphabetical model_name order - confirm.
models = ["Decision Tree", "Log. Regression", "Neural Network", "Random Forest", "XGBoost"]

# The dict keys become the legend labels; "tpr" was previously misspelled "trp".
# Both dicts must use identical keys because the errors are looked up by metric name.
models_means = {"bal_acc": plot_acc, "ppv": plot_ppv, "tpr": plot_tpr, "f1_macro": plot_f1}
models_erros = {"bal_acc": plot_acc_ci, "ppv": plot_ppv_ci, "tpr": plot_tpr_ci, "f1_macro": plot_f1_ci}
plot_model_metrics(models, models_means, models_erros)