Scatter plot fairness vs accuracy

In [None]:
import gc
import os
import warnings
from os import path
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Suppress every warning for the rest of the session (no category or module
# filter is given, so this applies to all warnings, not just plotting ones).
warnings.filterwarnings('ignore')

In [None]:
# Size tag of the run; it selects the `n<sample_size>` sub-directories below.
sample_size = 56

# Input directory with the pre-computed values and output directory for plots.
calculations_dir = os.path.join('out', 'calculations', f'n{sample_size}')
plots_dir = os.path.join('out', 'plots', f'n{sample_size}', 'scatter')
os.makedirs(plots_dir, exist_ok=True)

# Maps the file holding a metric's values to the name used for that metric
# in column headers, axis labels and output file names.
metrics = {
    'acc_equality_diff.bin':    'Accuracy equality',
    'equal_opp_diff.bin':       'Equal opportunity',
    'pred_equality_diff.bin':   'Predictive equality',
    'stat_parity.bin':          'Statistical parity',
    'neg_pred_parity_diff.bin': 'Negative predictive parity',
    'pos_pred_parity_diff.bin': 'Positive predictive parity',
}

In [None]:
def get_accuracy(sample_size: int = 56):
    """Compute the overall accuracy for every row of the pickled dataset.

    The dataset ``out/Set(08,<sample_size>).bin`` is unpickled into a table
    with eight confusion-matrix counts per row (TP/FP/TN/FN for the minority
    group ``i`` and the majority group ``j``).

    Args:
        sample_size: Selects the dataset file and is the denominator of the
            accuracy. Defaults to 56, the value that used to be hard-coded.

    Returns:
        pandas.Series with ``(i_tp + i_tn + j_tp + j_tn) / sample_size`` for
        each row of the dataset.
    """
    data_cols = [
        'i_tp',     # minority true positive
        'i_fp',     # minority false positive
        'i_tn',     # minority true negative
        'i_fn',     # minority false negative
        'j_tp',     # majority true positive
        'j_fp',     # majority false positive
        'j_tn',     # majority true negative
        'j_fn',     # majority false negative
    ]
    dataset_path = path.join('out', f'Set(08,{sample_size}).bin')
    # NOTE: unpickling can execute arbitrary code - only load dataset files
    # that were generated locally by this project.
    with open(dataset_path, "rb") as f:
        df = pd.DataFrame(pickle.load(f), columns=data_cols)

    correct = df['i_tp'] + df['i_tn'] + df['j_tp'] + df['j_tn']
    return correct / sample_size

def get_gmean(sample_size: int = 56):
    """Compute the G-mean for every row of the pickled dataset.

    The dataset ``out/Set(08,<sample_size>).bin`` is unpickled into a table
    with eight confusion-matrix counts per row (TP/FP/TN/FN for the minority
    group ``i`` and the majority group ``j``). Counts of both groups are
    pooled before the measure is computed.

    Args:
        sample_size: Selects the dataset file. Defaults to 56, the value that
            used to be hard-coded.

    Returns:
        pandas.Series with ``sqrt(TP * TN / (TP + FN) / (TN + FP))`` for each
        row. Rows without any positive or any negative example divide zero
        by zero and therefore yield NaN.
    """
    data_cols = [
        'i_tp',     # minority true positive
        'i_fp',     # minority false positive
        'i_tn',     # minority true negative
        'i_fn',     # minority false negative
        'j_tp',     # majority true positive
        'j_fp',     # majority false positive
        'j_tn',     # majority true negative
        'j_fn',     # majority false negative
    ]
    dataset_path = path.join('out', f'Set(08,{sample_size}).bin')
    # NOTE: unpickling can execute arbitrary code - only load dataset files
    # that were generated locally by this project.
    with open(dataset_path, "rb") as f:
        df = pd.DataFrame(pickle.load(f), columns=data_cols)

    # Pool the two groups into overall confusion-matrix counts.
    tp = df['i_tp'] + df['j_tp']
    tn = df['i_tn'] + df['j_tn']
    fn = df['i_fn'] + df['j_fn']
    fp = df['i_fp'] + df['j_fp']

    # Same evaluation order as before (multiply, then divide twice) so the
    # floating-point results are unchanged.
    return (tp * tn / (tp + fn) / (tn + fp)).pow(1/2)

In [None]:
# # uncomment if accuracy is not calculated yet

# with open(path.join(calculations_dir, "accuracy.bin"), "wb+") as f:
#     get_accuracy().to_numpy().tofile(f)

# with open(path.join(calculations_dir, "g_mean.bin"), "wb+") as f:
#     get_gmean().to_numpy().tofile(f)

In [None]:
def save_counts(accuracy, metric_file, metric_name, acc_name='accuracy'):
    """Count how often each (performance, metric) value pair occurs and save it.

    Args:
        accuracy: DataFrame with the performance values in a column named
            ``acc_name``; its rows are paired by position with the values
            stored in ``metric_file``.
        metric_file: Name of a raw binary file inside ``calculations_dir``
            that ``numpy.fromfile`` can read with its default dtype.
        metric_name: Column name given to the metric values.
        acc_name: Name of the performance column in ``accuracy``; also used
            in the name of the output file.

    Side effects:
        Writes ``counts_<acc_name>_vs_<metric_file with .csv suffix>`` into
        ``calculations_dir`` (columns: ``acc_name``, ``metric_name``,
        ``count``) and prints the path of that file.
    """
    metric_values = np.fromfile(path.join(calculations_dir, metric_file))
    df = pd.concat([
        accuracy,
        # float16 deliberately coarsens the values so that near-identical
        # metric values fall into the same group.
        pd.DataFrame(metric_values.astype(np.float16), columns=[metric_name]),
    ], axis=1)

    # The group keys end up in an index, and pandas >= 2.0 refuses to build an
    # index from float16 data. Upcasting keeps the float16-rounded values and
    # only changes their storage type.
    upcast = {col: np.float64 for col, dtype in df.dtypes.items() if dtype == np.float16}
    df = df.astype(upcast)

    counts = df.groupby([acc_name, metric_name]).size().reset_index(name='count')

    fn = path.join(calculations_dir, f'counts_{acc_name}_vs_{metric_file.replace(".bin", ".csv")}')
    # Hand pandas the path instead of a text handle: it then opens the file
    # with the newline handling the csv writer needs (no blank rows on Windows).
    counts.to_csv(fn, index=False)
    print(fn)

In [None]:
# scatter with point size

def scatter(metric_file, metric_name, acc_name='accuracy'):
    """Save a scatter plot of a metric against a performance measure.

    Reads ``counts_<acc_name>_vs_<metric_file with .csv suffix>`` from
    ``calculations_dir`` and writes
    ``scatter_<acc_name>_vs_<metric_name>.png`` into ``plots_dir``.

    Args:
        metric_file: File name of the metric (``.bin``); only used to derive
            the name of the counts CSV.
        metric_name: Column of the CSV plotted on the y axis.
        acc_name: Column of the CSV plotted on the x axis.
    """
    csv_name = metric_file.replace(".bin", ".csv")
    counts_csv = path.join(calculations_dir, f'counts_{acc_name}_vs_{csv_name}')
    with open(counts_csv, 'r') as f:
        counts = pd.read_csv(f)

    fig, ax = plt.subplots(figsize=(9, 8))
    # Marker size follows log2(count / 10).
    # NOTE(review): for counts <= 10 this size is zero or negative -
    # presumably meant to hide rare pairs; confirm that this is intended.
    marker_sizes = np.log2(counts['count'] / 10)
    ax.scatter(
        counts[acc_name],
        counts[metric_name],
        s=marker_sizes,
        alpha=.1,
        lw=0,
    )
    ax.set(xlabel=acc_name, ylabel=metric_name, title='v2')
    fig.tight_layout()

    png_path = path.join(plots_dir, f'scatter_{acc_name}_vs_{metric_name}.png')
    fig.savefig(png_path, dpi=300)
    plt.close(fig)


In [None]:
def heatmap(metric_file, metric_name, acc_name='accuracy'):
    """Save a heatmap of value-pair counts, binned to two decimals.

    Reads ``counts_<acc_name>_vs_<metric_file with .csv suffix>`` from
    ``calculations_dir``, rounds both value columns to two decimals, sums the
    counts that share a rounded pair and writes
    ``hm_v2_<acc_name>_vs_<metric_name>.png`` into ``plots_dir``.

    Args:
        metric_file: File name of the metric (``.bin``); only used to derive
            the name of the counts CSV.
        metric_name: Column of the CSV shown along the x axis.
        acc_name: Column of the CSV shown along the y axis.
    """
    csv_name = metric_file.replace(".bin", ".csv")
    counts_csv = path.join(calculations_dir, f'counts_{acc_name}_vs_{csv_name}')
    with open(counts_csv, 'r') as f:
        counts = pd.read_csv(f)

    # Bin both axes by rounding, then merge the rows that fall into one bin.
    counts = counts.round({acc_name: 2, metric_name: 2})
    binned = counts.groupby([acc_name, metric_name]).sum().reset_index()
    grid = binned.pivot(index=acc_name, columns=metric_name, values='count')

    fig, ax = plt.subplots(figsize=(9, 8))
    sns.heatmap(
        grid,
        annot=False,
        cbar_kws={'label': 'Count'},
        cmap='cividis',
        ax=ax,
    )
    # Flip the y axis so that the smallest acc_name value is at the bottom.
    ax.invert_yaxis()
    ax.set(
        xlabel=metric_name,
        ylabel=acc_name,
        title=f'{metric_name} vs {acc_name}',
    )
    fig.tight_layout()

    # NOTE(review): unlike scatter(), the figure is not closed after saving -
    # presumably so that the notebook still displays it; confirm.
    fig.savefig(path.join(plots_dir, f'hm_v2_{acc_name}_vs_{metric_name}.png'), dpi=300)

In [None]:
# Load the accuracy values from the previously saved raw binary file and
# store them as float16 in a single 'accuracy' column.
with open(path.join(calculations_dir, 'accuracy.bin'), 'rb') as f:
    acc = pd.DataFrame({'accuracy': np.fromfile(f).astype(np.float16)})

# Same for g_mean (uncomment when the g_mean plots are needed):
# with open(path.join(calculations_dir, 'g_mean.bin'), 'rb') as f:
#     g_mean = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['g_mean'])

In [None]:
# Choose the plot type that is produced for every metric.
# plot_f = scatter
plot_f = heatmap

for mf, mn in metrics.items():
    # save_counts(g_mean, mf, mn, 'g_mean')
    # save_counts(acc, mf, mn, 'accuracy')  # uncomment if the counts CSVs do not exist yet
    # The name must be 'accuracy': it is the column name of `acc` and the name
    # save_counts() puts into the CSV header and file name. The previous value
    # 'acc' pointed at counts files that nothing in this notebook produces.
    plot_f(mf, mn, 'accuracy')
    # Free the per-metric frames before the next plot is produced.
    gc.collect()

In [None]:
# g_mean vs accuracy: count the joint occurrences of accuracy and g_mean
# values and store them as a counts CSV.
save_counts(
    accuracy=acc,
    metric_file='g_mean.bin',
    metric_name='g_mean',
    acc_name='accuracy',
)