In [15]:
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cbook
from matplotlib.lines import Line2D

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# loading, statistics and plotting code below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Task identifier; it is interpolated into the data paths built in the loading cell.
task_data = "income"

In [16]:
# scores loading
import numpy as np
import pandas as pd

# Root folder of the evaluation artefacts and location of the test CSV.
DATA_PATH = "../../src/data/evaluation"
TEST_PATH = f"../../src/data/acs_{task_data}/processed/acs_{task_data}_test.csv"

# One result directory per setting (baseline plus three method folders).
BASELINE = f"{DATA_PATH}/baseline/{task_data}"
SEPARATION = f"{DATA_PATH}/hardt2016/{task_data}"
INDENPENDENCE = f"{DATA_PATH}/kamiran_calders2012/{task_data}"
SUFFICIENCY = f"{DATA_PATH}/pleiss2017/{task_data}/calib_weighted"


def _load_score_dict(path):
    """Load a ``.npy`` file holding a single pickled object and unwrap it with ``.item()``."""
    # NOTE(review): allow_pickle=True unpickles arbitrary objects — only point
    # this at files produced by this project.
    return np.load(path, allow_pickle=True).item()


# Prediction tables (CSV), read in the order baseline, separation, independence, sufficiency.
base_pred = pd.read_csv(f"{BASELINE}/XGBClassifier_predictions.csv")
sep_pred = pd.read_csv(f"{SEPARATION}/XGBClassifier_separation_predictions.csv")
ind_pred = pd.read_csv(f"{INDENPENDENCE}/XGBClassifier_independence_predictions.csv")
suf_pred = pd.read_csv(f"{SUFFICIENCY}/XGBClassifier_sufficiency_predictions.csv")

# Score objects: an overall and a "conditional" file per setting.
base_scores = _load_score_dict(f"{BASELINE}/XGBClassifier_scores.npy")
base_scores_cond = _load_score_dict(f"{BASELINE}/XGBClassifier_conditional_scores.npy")

sep_scores = _load_score_dict(f"{SEPARATION}/XGBClassifier_scores_separation.npy")
sep_scores_cond = _load_score_dict(f"{SEPARATION}/XGBClassifier_conditional_scores_separation.npy")

ind_scores = _load_score_dict(f"{INDENPENDENCE}/XGBClassifier_scores_independence.npy")
ind_scores_cond = _load_score_dict(f"{INDENPENDENCE}/XGBClassifier_conditional_scores_independence.npy")

suf_scores = _load_score_dict(f"{SUFFICIENCY}/XGBClassifier_scores_sufficiency.npy")
suf_scores_cond = _load_score_dict(f"{SUFFICIENCY}/XGBClassifier_conditional_scores_sufficiency.npy")

df_test = pd.read_csv(TEST_PATH)

In [17]:
# Turn the baseline and independence score dicts into DataFrames
# (dict keys become the row index because of orient="index").
df_base, df_base_cond, df_ind, df_ind_cond = (
    pd.DataFrame.from_dict(score_dict, orient="index")
    for score_dict in (base_scores, base_scores_cond, ind_scores, ind_scores_cond)
)


def get_confidence_interval(scores, confidence=0.95):
    """Return a Student-t confidence interval for the mean of ``scores``.

    Parameters
    ----------
    scores : numpy.ndarray, pandas.Series or pandas.DataFrame
        Sample values. The object must provide ``.mean()`` and ``len()``;
        a plain Python list is not supported.
    confidence : float, optional
        Confidence level of the interval, strictly between 0 and 1.
        Defaults to 0.95, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds as returned by ``scipy.stats.t.interval``,
        centred on the sample mean and scaled by the standard error of the mean.
    """
    # Kept as a function-scope import: scipy is not imported at the top of the notebook.
    from scipy import stats

    mean = scores.mean()
    sem = stats.sem(scores)
    # The level is passed positionally because the keyword name differs between
    # SciPy releases (`alpha` in older ones, `confidence` in newer ones).
    ci = stats.t.interval(confidence, len(scores) - 1, loc=mean, scale=sem)
    return ci

In [None]:
# Default figure resolution for every plot created after this cell.
mpl.rcParams.update({"figure.dpi": 100})

# "Dark2" is used for the bar/SPD plots, "Set3" for the grouped box fills.
colors, box_colors = (plt.get_cmap(cmap_name) for cmap_name in ("Dark2", "Set3"))

# Bare expression so the notebook displays the colormap.
box_colors

In [None]:
labels = ["baseline", "reweighing"]
legend_labels = ["Females", "Males"]

# One (UNP_*, PRIV_*) pair of value arrays per x-axis group, in the order of `labels`.
ax1_data_lists = [
    (df_base_cond["UNP_TPR"].values, df_base_cond["PRIV_TPR"].values),
    (df_ind_cond["UNP_TPR"].values, df_ind_cond["PRIV_TPR"].values),
]

ax2_data_lists = [
    (df_base_cond["UNP_ACC"].values, df_base_cond["PRIV_ACC"].values),
    (df_ind_cond["UNP_ACC"].values, df_ind_cond["PRIV_ACC"].values),
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4.3))

# Layout: one slot of width 1 per group, two boxes of width 1/4 centred in it.
num_groups = len(ax1_data_lists)
group_width = 1
box_width = group_width / 4
positions = np.arange(num_groups)

box_properties = {
    "patch_artist": True,
    "showfliers": False,
    "medianprops": {"color": "black"},
    "whiskerprops": {"color": "black"},
    "capprops": {"color": "black"},
    "flierprops": {"markeredgecolor": "black"},
}


def _draw_paired_boxes(ax, data_lists, title):
    """Draw one pair of adjacent boxes per entry of ``data_lists`` and style ``ax``.

    Reads the layout settings defined above in this cell (``labels``,
    ``positions``, ``num_groups``, ``box_width``, ``box_properties``) and the
    ``box_colors`` colormap.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    data_lists : list of (array-like, array-like)
        For each x-axis group, the values of the left and of the right box.
    title : str
        Title of the axes.

    Returns
    -------
    list
        The left and right box patches of the first group, to be used as
        legend handles (empty if ``data_lists`` is empty).
    """
    legend_handles = []
    for i, (data1, data2) in enumerate(data_lists):
        # generate stats for the boxplot
        stats1 = cbook.boxplot_stats([data1], labels=[labels[0]])
        stats2 = cbook.boxplot_stats([data2], labels=[labels[1]])

        # Plot the boxplot statistics using bxp, left and right of the group centre
        bp1 = ax.bxp(stats1, positions=[positions[i] - box_width / 2], widths=box_width, **box_properties)
        bp2 = ax.bxp(stats2, positions=[positions[i] + box_width / 2], widths=box_width, **box_properties)

        for patch in bp1["boxes"]:
            patch.set_facecolor(box_colors(3))
        for patch in bp2["boxes"]:
            patch.set_facecolor(box_colors(4))

        if not legend_handles:
            legend_handles = [bp1["boxes"][0], bp2["boxes"][0]]

        # Add separator line between pairs, except after the last pair
        if i < num_groups - 1:
            ax.axvline(x=positions[i] + 0.5, color="gray", linestyle="--", alpha=0.7)

    # bxp installs its own ticks; replace them with one labelled tick per group.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.yaxis.grid(True)
    ax.set_ylabel("error rates scale")
    ax.set_title(title, fontsize=10)
    return legend_handles


_draw_paired_boxes(ax1, ax1_data_lists, "True positive rates across groups")
group_handles = _draw_paired_boxes(ax2, ax2_data_lists, "Accuracy across groups")

# Pass the box patches explicitly so the legend swatches do not depend on the
# order in which matplotlib stores the axes' child artists.
ax2.legend(
    handles=group_handles,
    labels=legend_labels,
    bbox_to_anchor=(1.04, 1),
    title="sensitive groups",
    loc="upper left",
)
plt.tight_layout()
# plt.savefig("../assets/boxplot_ind_true_rates.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
labels = ["baseline", "reweighing"]
legend_labels = ["Females", "Males"]

# One (UNP_TPR, PRIV_TPR) pair of value arrays per x-axis group, in the order of `labels`.
ax1_tpr_data_lists = [
    (df_base_cond["UNP_TPR"].values, df_base_cond["PRIV_TPR"].values),
    (df_ind_cond["UNP_TPR"].values, df_ind_cond["PRIV_TPR"].values),
]

fig, ax1 = plt.subplots(figsize=(5, 4.5))

# Layout: one slot of width 1 per group, two boxes of width 1/4 centred in it.
num_groups = len(ax1_tpr_data_lists)
group_width = 1
box_width = group_width / 4
positions = np.arange(num_groups)

box_properties = {
    "patch_artist": True,
    "showfliers": False,
    "medianprops": {"color": "black"},
    "whiskerprops": {"color": "black"},
    "capprops": {"color": "black"},
    "flierprops": {"markeredgecolor": "black"},
}

# Significance bracket: baseline height `y`, tick length `h` (both in data
# units) and line colour. Hand-tuned — adjust to the plot scale.
y, h, color = 0.77, 0.002, "#3D3D3D"

group_handles = []
for i, (data1, data2) in enumerate(ax1_tpr_data_lists):
    # generate stats for the boxplot
    stats1 = cbook.boxplot_stats([data1], labels=[labels[0]])
    stats2 = cbook.boxplot_stats([data2], labels=[labels[1]])

    # Plot the boxplot statistics using bxp, left and right of the group centre
    bp1 = ax1.bxp(stats1, positions=[positions[i] - box_width / 2], widths=box_width, **box_properties)
    bp2 = ax1.bxp(stats2, positions=[positions[i] + box_width / 2], widths=box_width, **box_properties)

    for patch in bp1["boxes"]:
        patch.set_facecolor(box_colors(3))
    for patch in bp2["boxes"]:
        patch.set_facecolor(box_colors(4))

    if not group_handles:
        group_handles = [bp1["boxes"][0], bp2["boxes"][0]]

    # Bracket spanning both boxes of the pair, with the significance stars on top.
    # Both groups are annotated with "***" (the former if/else had identical branches).
    x1 = positions[i] - box_width / 2 - 0.02
    x2 = positions[i] + box_width / 2 + 0.02
    ax1.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.0, c=color)
    ax1.text((x1 + x2) * 0.5, y + h, "***", ha="center", va="bottom", color="black")

    # Add separator line between pairs, except after the last pair
    if i < num_groups - 1:
        ax1.axvline(x=positions[i] + 0.5, color="gray", linestyle="--", alpha=0.7)

# bxp installs its own ticks; replace them with one labelled tick per group.
ax1.set_xticks(positions)
ax1.set_xticklabels(labels)
ax1.yaxis.grid(True)
ax1.set_ylabel("error rates scale")
ax1.set_title("True positive rates across groups", fontsize=10)

# Invisible proxy lines: only their labels show up in the significance legend.
legend_elements = [
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="* p < 0.05"),
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="** p < 0.01"),
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="*** p < 0.001"),
]

# Group legend: pass the box patches explicitly so the swatches do not depend
# on the order in which matplotlib stores the axes' child artists.
first_legend = ax1.legend(
    handles=group_handles,
    labels=legend_labels,
    bbox_to_anchor=(1.04, 1),
    title="sensitive groups",
    loc="upper left",
)
# A second ax.legend() call replaces the first legend on the axes, so the first
# one is re-attached afterwards with add_artist.
second_legend = ax1.legend(
    handles=legend_elements,
    loc="upper left",
    bbox_to_anchor=(1.05, 0.75),
    ncol=1,
    title="significance levels",
    fontsize=8,
)
ax1.add_artist(first_legend)

plt.tight_layout()
plt.savefig("../assets/boxplot_tpr.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
mpl.rcParams["figure.dpi"] = 100
colors = plt.get_cmap("Dark2")

labels = ["baseline", "reweighing"]
spd_lists = [df_base["STAT_PAR_DIFF"].values, df_ind["STAT_PAR_DIFF"].values]

fig, ax = plt.subplots(figsize=(5, 4.5))

num_groups = len(spd_lists)
group_width = 1
box_width = group_width / 3
positions = np.arange(num_groups)

# This figure holds a single pair of boxes. Its centre is set explicitly here;
# it used to be read from `positions[i]`, where `i` was only defined as the
# leftover loop variable of an earlier cell.
pair_center = 0

box_properties = {
    "patch_artist": True,
    "showfliers": False,
    "medianprops": {"color": "black"},
    "whiskerprops": {"color": "black"},
    "capprops": {"color": "black"},
    "flierprops": {"markeredgecolor": "black"},
}

print(spd_lists[0], spd_lists[1])
# independence: baseline box on the left, reweighing box on the right
stats1 = cbook.boxplot_stats([spd_lists[0]], labels=["base"])
stats2 = cbook.boxplot_stats([spd_lists[1]], labels=["ind"])
bp1 = ax.bxp(stats1, positions=[pair_center - box_width / 2], widths=box_width, **box_properties)
bp2 = ax.bxp(stats2, positions=[pair_center + box_width / 2], widths=box_width, **box_properties)

for patch in bp1["boxes"]:
    patch.set_facecolor(colors(0))
for patch in bp2["boxes"]:
    patch.set_facecolor(colors(2))

# ax.set_xticks([])
ax.yaxis.grid(True)
# ax.set_xlabel("reweighting")
ax.set_ylabel("error rates scale")
ax.set_title("statistical parity difference", fontsize=10)

# Significance bracket spanning both boxes
x1 = pair_center - box_width / 2 - 0.02
x2 = pair_center + box_width / 2 + 0.02

# Place the bracket just above the largest observed value of either sample.
y_max = max(spd_lists[0].max(), spd_lists[1].max())
y_max = y_max + 0.003
y, h, color = y_max, 0.003, "#3D3D3D"  # Adjust these values based on your plot scale
ax.plot([x1, x1, x2, x2], [y, y + h, y + h, y], lw=1.0, c=color)
ax.text((x1 + x2) * 0.5, y + h, "***", ha="center", va="bottom", color="black")

# Intervention legend: pass the box patches explicitly so the swatches do not
# depend on the order in which matplotlib stores the axes' child artists.
first_legend = ax.legend(
    handles=[bp1["boxes"][0], bp2["boxes"][0]],
    labels=labels,
    bbox_to_anchor=(1.04, 1),
    title="intervention",
    loc="upper left",
)

text_properties = {
    "horizontalalignment": "right",
    "verticalalignment": "top",
    "color": "red",
    # "fontweight": "bold",
}
# Annotate the two ends of the y-axis (axes coordinates, left of the axis)
ax.text(-0.01, 1.02, "fair", transform=ax.transAxes, **text_properties)
ax.text(-0.01, 0.05, "unfair", transform=ax.transAxes, **text_properties)

# Invisible proxy lines: only their labels show up in the significance legend.
legend_elements = [
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="* p < 0.05"),
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="** p < 0.01"),
    Line2D([0], [0], marker="None", color="none", lw=1.0, label="*** p < 0.001"),
]

# A second ax.legend() call replaces the first legend on the axes, so the first
# one is re-attached afterwards with add_artist.
second_legend = ax.legend(
    handles=legend_elements,
    loc="upper left",
    bbox_to_anchor=(1.05, 0.75),
    ncol=1,
    title="significance levels",
    fontsize=8,
)
ax.add_artist(first_legend)
# ax.add_artist(second_legend)

plt.tight_layout()
# plt.savefig("../assets/boxplot_spd.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# INDEPENDENCE — mean statistical parity difference with its confidence interval
IND_METRIC = ["STAT_PAR_DIFF"]

# Baseline: values, their mean and the confidence interval of the mean.
bl_stpar = df_base[IND_METRIC]
bl_stpar_mean = bl_stpar.mean()
bl_stpar_ci = get_confidence_interval(bl_stpar)

# Reweighing (independence intervention): same three quantities.
ind_stpar = df_ind[IND_METRIC]
ind_stpar_mean = ind_stpar.mean()
ind_stpar_ci = get_confidence_interval(ind_stpar)

fig, ax = plt.subplots(figsize=(6, 3))
bar_width = 0.3

# Vertical placement: the two bars sit 0.4 apart so they stay close together.
y_pos_bl = 0  # y position of the baseline bar
y_pos_ind = y_pos_bl + 0.4  # y position of the reweighing bar

# Asymmetric error bars: distance from the mean to the lower / upper CI bound.
bl_ci_low, bl_ci_high = bl_stpar_ci
ind_ci_low, ind_ci_high = ind_stpar_ci
bl_error = [bl_stpar_mean - bl_ci_low, bl_ci_high - bl_stpar_mean]
ind_error = [ind_stpar_mean - ind_ci_low, ind_ci_high - ind_stpar_mean]

bar_specs = (
    (y_pos_bl, bl_stpar_mean, bl_error, colors(0), "Baseline"),
    (y_pos_ind, ind_stpar_mean, ind_error, colors(2), "Reweighing"),
)
for bar_y, bar_mean, bar_err, bar_color, bar_label in bar_specs:
    ax.barh(bar_y, bar_mean, xerr=bar_err, color=bar_color, capsize=5, label=bar_label, height=bar_width)
ax.set_yticks([])
ax.legend()

text_properties = {
    "horizontalalignment": "right",
    "verticalalignment": "top",
    "color": "red",
    "fontweight": "bold",
}

# Label the two ends of the x-axis (axes coordinates, just below the axis).
for caption_x, caption in ((0.02, "unfair"), (1.02, "fair")):
    ax.text(caption_x, -0.1, caption, transform=ax.transAxes, **text_properties)

ax.set(xlabel="fairness scale", ylabel="statistical parity difference")
ax.invert_yaxis()

# plt.savefig("../assets/independence_fairness_measure.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Mean equal opportunity difference with its confidence interval.
# NOTE: the *_stpar variable names are reused here although the metric is EQ_OPP_DIFF.
IND_METRIC = ["EQ_OPP_DIFF"]

# Baseline: values, their mean and the confidence interval of the mean.
bl_stpar = df_base[IND_METRIC]
bl_stpar_mean = bl_stpar.mean()
bl_stpar_ci = get_confidence_interval(bl_stpar)

# Reweighing (independence intervention): same three quantities.
ind_stpar = df_ind[IND_METRIC]
ind_stpar_mean = ind_stpar.mean()
ind_stpar_ci = get_confidence_interval(ind_stpar)

fig, ax = plt.subplots(figsize=(6, 3))
bar_width = 0.3

# Vertical placement: the two bars sit 0.4 apart so they stay close together.
y_pos_bl = 0  # y position of the baseline bar
y_pos_ind = y_pos_bl + 0.4  # y position of the reweighing bar

# Asymmetric error bars: distance from the mean to the lower / upper CI bound.
bl_ci_low, bl_ci_high = bl_stpar_ci
ind_ci_low, ind_ci_high = ind_stpar_ci
bl_error = [bl_stpar_mean - bl_ci_low, bl_ci_high - bl_stpar_mean]
ind_error = [ind_stpar_mean - ind_ci_low, ind_ci_high - ind_stpar_mean]

bar_specs = (
    (y_pos_bl, bl_stpar_mean, bl_error, colors(0), "Baseline"),
    (y_pos_ind, ind_stpar_mean, ind_error, colors(2), "Reweighing"),
)
for bar_y, bar_mean, bar_err, bar_color, bar_label in bar_specs:
    ax.barh(bar_y, bar_mean, xerr=bar_err, color=bar_color, capsize=5, label=bar_label, height=bar_width)
ax.set_yticks([])
ax.legend()

text_properties = {
    "horizontalalignment": "right",
    "verticalalignment": "top",
    "color": "red",
    "fontweight": "bold",
}

# Label the two ends of the x-axis (axes coordinates, just below the axis).
for caption_x, caption in ((0.02, "unfair"), (1.02, "fair")):
    ax.text(caption_x, -0.1, caption, transform=ax.transAxes, **text_properties)

ax.set(xlabel="fairness scale", ylabel="equal opportunity difference")
ax.invert_yaxis()

# plt.savefig("../assets/independence_fairness_measure_2.png", dpi=300, bbox_inches="tight")
plt.show()