# Statistical Tests

In [None]:
import warnings
from scipy.stats import ttest_ind, mannwhitneyu
import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the statistical tests
# below - confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Task whose evaluation artefacts are loaded in this cell.
task_data = "income"

# Root of the evaluation outputs and location of the processed test split.
DATA_PATH = "../../src/data/evaluation"
TEST_PATH = f"../../src/data/acs_{task_data}/processed/acs_{task_data}_test.csv"

# One results directory per scenario.
BASELINE = f"{DATA_PATH}/baseline/{task_data}"
SEPARATION = f"{DATA_PATH}/hardt2016/{task_data}"
INDENPENDENCE = f"{DATA_PATH}/kamiran_calders2012/{task_data}"
SUFFICIENCY = f"{DATA_PATH}/pleiss2017/{task_data}/calib_weighted"


def _read_score_dict(npy_path):
    """Load ``npy_path`` with pickling allowed and return the object it holds via ``.item()``."""
    # allow_pickle=True unpickles the file contents, so only point this at trusted files.
    return np.load(npy_path, allow_pickle=True).item()


# Prediction tables, one per scenario.
base_pred = pd.read_csv(f"{BASELINE}/XGBClassifier_predictions.csv")
sep_pred = pd.read_csv(f"{SEPARATION}/XGBClassifier_separation_predictions.csv")
ind_pred = pd.read_csv(f"{INDENPENDENCE}/XGBClassifier_independence_predictions.csv")
suf_pred = pd.read_csv(f"{SUFFICIENCY}/XGBClassifier_sufficiency_predictions.csv")

# Overall and conditional score objects, one pair per scenario.
base_scores = _read_score_dict(f"{BASELINE}/XGBClassifier_scores.npy")
base_scores_cond = _read_score_dict(f"{BASELINE}/XGBClassifier_conditional_scores.npy")

sep_scores = _read_score_dict(f"{SEPARATION}/XGBClassifier_scores_separation.npy")
sep_scores_cond = _read_score_dict(f"{SEPARATION}/XGBClassifier_conditional_scores_separation.npy")

ind_scores = _read_score_dict(f"{INDENPENDENCE}/XGBClassifier_scores_independence.npy")
ind_scores_cond = _read_score_dict(f"{INDENPENDENCE}/XGBClassifier_conditional_scores_independence.npy")

suf_scores = _read_score_dict(f"{SUFFICIENCY}/XGBClassifier_scores_sufficiency.npy")
suf_scores_cond = _read_score_dict(f"{SUFFICIENCY}/XGBClassifier_conditional_scores_sufficiency.npy")

df_test = pd.read_csv(TEST_PATH)

In [None]:
# Convert every loaded score dictionary into a DataFrame.
def _frame_from_scores(score_dict):
    """Build a DataFrame from ``score_dict`` using its keys as the row index."""
    return pd.DataFrame.from_dict(score_dict, orient="index")


# Each scenario yields an overall table and a conditional (``*_cond``) table.
df_base, df_base_cond = _frame_from_scores(base_scores), _frame_from_scores(base_scores_cond)
df_sep, df_sep_cond = _frame_from_scores(sep_scores), _frame_from_scores(sep_scores_cond)
df_ind, df_ind_cond = _frame_from_scores(ind_scores), _frame_from_scores(ind_scores_cond)
df_suf, df_suf_cond = _frame_from_scores(suf_scores), _frame_from_scores(suf_scores_cond)

In [None]:
# Display the column labels available in the baseline scores table.
df_base.columns

In [None]:
def get_confidence_interval(scores, confidence=0.95):
    """Return the Student-t confidence interval for the mean of ``scores``.

    Args:
        scores: Sample of values supporting ``.mean()`` and ``len()``
            (callers in this file pass pandas Series).
        confidence: Coverage probability of the interval. Defaults to 0.95,
            the value that used to be hard-coded.

    Returns:
        The ``(lower, upper)`` bounds produced by ``scipy.stats.t.interval``
        with ``len(scores) - 1`` degrees of freedom, located at the sample
        mean and scaled by the standard error of the mean.
    """
    from scipy import stats

    mean = scores.mean()
    sem = stats.sem(scores)
    # The coverage level is passed positionally, exactly as the original call did.
    return stats.t.interval(confidence, len(scores) - 1, loc=mean, scale=sem)

In [None]:
# For every fairness metric, print the mean and the half-width of its
# confidence interval for each of the four scenarios.
fairness_metrics = ["AVG_ODDS_DIFF", "STAT_PAR_DIFF", "AVG_PRED_VALUE_DIFF", "EQ_OPP_DIFF", "FDR"]

# Scenario label -> scores table, in the order the rows are printed.
scenario_scores = {
    "Baseline": df_base,
    "Separation": df_sep,
    "Independence": df_ind,
    "Sufficiency": df_suf,
}

for metric in fairness_metrics:
    print(metric)

    for scenario, scores in scenario_scores.items():
        metric_mean = scores[metric].mean()
        ci = get_confidence_interval(scores[metric])
        # The backslash is doubled so the LaTeX "\pm" is emitted literally
        # instead of relying on an unrecognised "\p" escape sequence.
        print(f"{scenario}: {round(metric_mean, 3)} $\\pm$ {round(np.abs(metric_mean - ci[0]), 4)}")
    print("\n")

In [None]:
# Performance of the baseline evaluation: mean and confidence-interval
# half-width for each selected metric.
# Previously listed candidates: ["BAL_ACC", "PPV", "TPR", "F1_MACRO"]
performance = ["BAL_ACC", "F1_MACRO"]
for metric in performance:
    metric_mean = df_base[metric].mean()
    ci = get_confidence_interval(df_base[metric])
    # Distance from the mean to each bound; only the lower one is reported.
    lower_ci = np.abs(metric_mean - ci[0])
    upper_ci = np.abs(metric_mean - ci[1])

    # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
    print(f"Baseline {metric}: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}")

In [None]:
# Performance of the separation evaluation (post-processing): mean and
# confidence-interval half-width for each selected metric.
# Previously listed candidates: ["BAL_ACC", "PPV", "TPR", "F1_MACRO"]
performance = ["BAL_ACC", "F1_MACRO"]

for metric in performance:
    metric_mean = df_sep[metric].mean()
    ci = get_confidence_interval(df_sep[metric])
    # Distance from the mean to each bound; only the lower one is reported.
    lower_ci = np.abs(metric_mean - ci[0])
    upper_ci = np.abs(metric_mean - ci[1])

    # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
    print(
        "SEPARATION - hardt2016 - PostProcess",
        f"{metric},  mean: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}",
    )

In [None]:
# Performance of the independence evaluation (pre-processing): mean and
# confidence-interval half-width for each selected metric.
# Previously listed candidates: ["BAL_ACC", "PPV", "TPR", "F1_MACRO"]
performance = ["BAL_ACC", "F1_MACRO"]

for metric in performance:
    metric_mean = df_ind[metric].mean()
    ci = get_confidence_interval(df_ind[metric])
    # Distance from the mean to each bound; only the lower one is reported.
    lower_ci = np.abs(metric_mean - ci[0])
    upper_ci = np.abs(metric_mean - ci[1])

    # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
    print(
        "INDEPENDENCE - kamiran_calders2012 - PreProcess",
        f"{metric},  mean: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}",
    )

In [None]:
# Performance of the sufficiency evaluation (post-processing): mean and
# confidence-interval half-width for each selected metric.
# Previously listed candidates: ["BAL_ACC", "PPV", "TPR", "F1_MACRO"]
performance = ["BAL_ACC", "F1_MACRO"]

for metric in performance:
    metric_mean = df_suf[metric].mean()
    ci = get_confidence_interval(df_suf[metric])
    # Distance from the mean to each bound; only the lower one is reported.
    lower_ci = np.abs(metric_mean - ci[0])
    upper_ci = np.abs(metric_mean - ci[1])

    # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
    print(
        "SUFFICIENCY - pleiss2017 - PostProcess",
        f"{metric},  mean: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}",
    )

# U-test, p-values

In [None]:
# Significance helpers used by the comparison cells.
def p_value_test(x, y):
    """Compare two samples with ``ttest_ind`` and ``mannwhitneyu``.

    Both p-values are printed with 30 decimal places and then returned as
    the pair ``(t_test_pvalue, mannwhitney_pvalue)``.
    """
    # Run both tests before reporting anything.
    t_pvalue = ttest_ind(x, y).pvalue
    u_pvalue = mannwhitneyu(x, y).pvalue

    print(f"T-test p-value: {t_pvalue:.30f}")
    print(f"Mann-Whitney U-test p-value: {u_pvalue:.30f}")
    return t_pvalue, u_pvalue


def significance_level(p_value):
    """Translate a p-value into a star-coded significance label.

    Each threshold is an inclusive upper bound and the first one that
    ``p_value`` does not exceed wins. Values above 0.05, and values for which
    every comparison is false (such as NaN), yield "n.s. Not significant".
    """
    star_labels = (
        (0.0001, "**** Highly significant difference"),
        (0.001, "*** Highly significant difference"),
        (0.01, "** Moderately significant difference"),
        (0.05, "* Significant difference"),
    )
    return next(
        (label for upper_bound, label in star_labels if p_value <= upper_bound),
        "n.s. Not significant",
    )


def _significance_level(x, y):
    """Run ``mannwhitneyu`` on ``x`` and ``y`` and label the resulting p-value.

    The U statistic and the p-value are printed; the returned string is the
    label of the smallest threshold (0.0001, 0.001, 0.01, 0.05) that the
    p-value does not exceed, or "Not significant" when it exceeds all of them.
    """
    u_stat, p = mannwhitneyu(x, y)
    print(f"U: {u_stat}, p-value: {p}")

    # Thresholds are checked from strictest to loosest.
    if p <= 0.0001:
        return "**** Extremely significant difference"
    if p <= 0.001:
        return "*** Highly significant difference"
    if p <= 0.01:
        return "** More significant difference"
    if p <= 0.05:
        return "* Significant difference"
    return "Not significant"

## Overall test for all scenarios

In [None]:
print("ACCURACY SCORES: MALES X FEMALES")

# Heading printed before each scenario, paired with that scenario's
# conditional scores table.
accuracy_scenarios = (
    ("BASELINE", df_base_cond),
    ("\nSEPARATION", df_sep_cond),
    ("\nINDEPENDENCE", df_ind_cond),
    ("\nSUFFICIENCY", df_suf_cond),
)

for heading, cond_scores in accuracy_scenarios:
    print(heading)
    # Compare the UNP_ACC column against the PRIV_ACC column within the scenario.
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_ACC"], cond_scores["PRIV_ACC"])
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# F1_MACRO: baseline against the separation, independence and sufficiency
# tables, in that order, with a blank separator between comparisons.
for position, mitigated in enumerate((df_sep, df_ind, df_suf)):
    if position > 0:
        print("\n")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base["F1_MACRO"], mitigated["F1_MACRO"])
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

### SEPARATION

In [None]:
# AVG_ODDS_DIFF: baseline vs. the separation scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base["AVG_ODDS_DIFF"], df_sep["AVG_ODDS_DIFF"])

# The label now names the separation table that is actually compared
# (it previously said "Ind").
print("Baseline X Sep AVG_ODDS_DIFF")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# EQ_OPP_DIFF: baseline vs. the separation scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base["EQ_OPP_DIFF"], df_sep["EQ_OPP_DIFF"])

# The label now matches the compared table (separation, not "Ind") and the
# compared column (EQ_OPP_DIFF, not "EQ_ODDS_DIFF").
print("Baseline X Sep EQ_OPP_DIFF")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# FNR between groups (UNP_ vs. PRIV_ columns), first in the baseline and
# then in the separation scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base_cond["UNP_FNR"], df_base_cond["PRIV_FNR"])

print("Baseline: Males x Female FNR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

print("\n##############################################")
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_sep_cond["UNP_FNR"], df_sep_cond["PRIV_FNR"])

# This comparison uses the separation table, so it is labelled accordingly
# (it previously repeated the "Baseline" label).
print("Separation (Thre.Opt): Males x Female FNR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# FPR between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# separation scenario, with a separator line between the two reports.
fpr_reports = (
    ("Baseline: Males x Female FPR", df_base_cond),
    ("Separation (Thre.Opt): Males x Female FPR", df_sep_cond),
)

for position, (title, cond_scores) in enumerate(fpr_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_FPR"], cond_scores["PRIV_FPR"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# TPR between groups (UNP_ vs. PRIV_ columns), first in the baseline and
# then in the separation scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base_cond["UNP_TPR"], df_base_cond["PRIV_TPR"])

# This comparison uses the baseline table, so it is labelled accordingly
# (it previously carried the "Separation" label).
print("Baseline: Males x Female TPR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

print("\n##############################################")
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_sep_cond["UNP_TPR"], df_sep_cond["PRIV_TPR"])

print("Separation (Thre.Opt): Males x Female TPR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# TNR between groups (UNP_ vs. PRIV_ columns), first in the baseline and
# then in the separation scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base_cond["UNP_TNR"], df_base_cond["PRIV_TNR"])

# This comparison uses the baseline table, so it is labelled accordingly
# (it previously carried the "Separation" label).
print("Baseline: Males x Female TNR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

print("\n##############################################")
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_sep_cond["UNP_TNR"], df_sep_cond["PRIV_TNR"])

print("Separation (Thre.Opt): Males x Female TNR")
print("t-test significance level:", significance_level(t_test_pvalue))
print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# Per-group error rates for the baseline and the separation (post-processing)
# scenario: mean and confidence-interval half-width of each rate.
# fmt: off
conditional_rates_per_group = [
    # "UNP_TPR", "PRIV_TPR",
    # "UNP_TNR", "PRIV_TNR",
    "UNP_FPR", "PRIV_FPR",
    "UNP_FNR", "PRIV_FNR"
]
# fmt: on

for metric in conditional_rates_per_group:
    # Columns prefixed "UNP_" are reported as females, all others as males.
    group = "females" if metric.startswith("UNP_") else "males"

    for scenario, cond_scores in (
        ("BASELINE", df_base_cond),
        ("SEPARATION - Threshold Opt.", df_sep_cond),
    ):
        metric_mean = cond_scores[metric].mean()
        ci = get_confidence_interval(cond_scores[metric])
        # Distance from the mean to each bound; only the lower one is reported.
        lower_ci = np.abs(metric_mean - ci[0])
        upper_ci = np.abs(metric_mean - ci[1])
        # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
        print(f"{scenario}: {metric} - {group}: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}")

    print("\n")

### INDEPENDENCE

In [None]:
# STAT_PAR_DIFF: baseline vs. the independence scenario.
t_test_pvalue, mannwhitney_pvalue = p_value_test(df_base["STAT_PAR_DIFF"], df_ind["STAT_PAR_DIFF"])

print("Baseline X Ind STAT_PAR_DIFF")
# Report the label of each test's p-value, t-test first.
for test_label, p in (
    ("t-test significance level:", t_test_pvalue),
    ("mannwhitney significance level:", mannwhitney_pvalue),
):
    print(test_label, significance_level(p))

In [None]:
# TPR between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# independence scenario, with a separator line between the two reports.
tpr_reports = (
    ("Baseline: Males x Female TPR", df_base_cond),
    ("Indenpendece: Males x Female TPR", df_ind_cond),
)

for position, (title, cond_scores) in enumerate(tpr_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_TPR"], cond_scores["PRIV_TPR"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# INDEPENDENCE: per-group TPR for the baseline and the reweighing scenario,
# reported as mean and confidence-interval half-width.
conditional_rates_per_group = ["UNP_TPR", "PRIV_TPR"]

for metric in conditional_rates_per_group:
    # Columns prefixed "UNP_" are reported as females, all others as males.
    group = "females" if metric.startswith("UNP_") else "males"

    for scenario, cond_scores in (
        ("BASELINE", df_base_cond),
        ("INDEPENDENCE - Reweighing", df_ind_cond),
    ):
        metric_mean = cond_scores[metric].mean()
        ci = get_confidence_interval(cond_scores[metric])
        # Distance from the mean to each bound; only the lower one is reported.
        lower_ci = np.abs(metric_mean - ci[0])
        upper_ci = np.abs(metric_mean - ci[1])
        # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
        print(f"{scenario}: {metric} - {group}: {metric_mean:.3f} $\\pm$ {lower_ci:.3f}")

    print("\n")

### SUFFICIENCY

In [None]:
# Per-group predictive values for the baseline and the sufficiency
# (calibration) scenario: mean and confidence-interval half-width.
# fmt: off
conditional_rates_per_group = [
    "UNP_PPV", "PRIV_PPV",
    "UNP_NPV", "PRIV_NPV"
]
# fmt: on

for metric in conditional_rates_per_group:
    # Columns prefixed "UNP_" are reported as females, all others as males.
    group = "females" if metric.startswith("UNP_") else "males"

    for scenario, cond_scores in (
        ("BASELINE", df_base_cond),
        ("SUFFICIENCY - Calibration.", df_suf_cond),
    ):
        metric_mean = cond_scores[metric].mean()
        ci = get_confidence_interval(cond_scores[metric])
        # Distance from the mean to each bound; only the lower one is reported.
        lower_ci = np.abs(metric_mean - ci[0])
        upper_ci = np.abs(metric_mean - ci[1])
        # "\\pm" emits the literal LaTeX "\pm" without an invalid "\p" escape.
        print(f"{scenario}: {metric} - {group}: {metric_mean:.4f} $\\pm$ {lower_ci:.3f}")

    print("\n")

In [None]:
# The difference is not significant: the intervention did not significantly change the model's false positives and false negatives,
# so the model's existing calibration is preserved.

In [None]:
# PPV between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# sufficiency scenario, with a separator line between the two reports.
ppv_reports = (
    ("Baseline: Males x Female", df_base_cond),
    ("Suffiency: Males x Female", df_suf_cond),
)

for position, (title, cond_scores) in enumerate(ppv_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_PPV"], cond_scores["PRIV_PPV"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# FOR between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# sufficiency scenario, with a separator line between the two reports.
for_reports = (
    ("Baseline: Males x Female", df_base_cond),
    ("Suffiency: Males x Female", df_suf_cond),
)

for position, (title, cond_scores) in enumerate(for_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_FOR"], cond_scores["PRIV_FOR"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# FDR between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# sufficiency scenario, with a separator line between the two reports.
fdr_reports = (
    ("Baseline: Males x Female", df_base_cond),
    ("Suffiency: Males x Female", df_suf_cond),
)

for position, (title, cond_scores) in enumerate(fdr_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_FDR"], cond_scores["PRIV_FDR"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))

In [None]:
# NPV between groups (UNP_ vs. PRIV_ columns), baseline first and then the
# sufficiency scenario, with a separator line between the two reports.
npv_reports = (
    ("Baseline: Males x Female", df_base_cond),
    ("Suffiency: Males x Female", df_suf_cond),
)

for position, (title, cond_scores) in enumerate(npv_reports):
    if position > 0:
        print("\n##############################################")
    t_test_pvalue, mannwhitney_pvalue = p_value_test(cond_scores["UNP_NPV"], cond_scores["PRIV_NPV"])

    print(title)
    print("t-test significance level:", significance_level(t_test_pvalue))
    print("mannwhitney significance level:", significance_level(mannwhitney_pvalue))