In [None]:
import gc
import os
import warnings
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Install a blanket 'ignore' filter so no warning is printed in cell output.
warnings.filterwarnings(action='ignore')
# Apply matplotlib's 'default' style sheet to all figures drawn below.
plt.style.use('default')

In [None]:
sample_size = 56        # 24, 56

# Every output location is namespaced by the sample size (n{sample_size}).
plots_dir = path.join('out', 'plots', f'n{sample_size}', 'ppf_heatmap')
calculations_dir = path.join('out', 'calculations', f'n{sample_size}')

# Make sure both directories exist; already existing ones are left as they are.
for directory in (plots_dir, calculations_dir):
    os.makedirs(directory, exist_ok=True)

# Both tables map a binary file name to the display name of the metric.
# Insertion order is the order in which the metrics are processed later on.
diff_metrics = dict([
    ('acc_equality_diff.bin', 'Accuracy equality difference'),
    ('equal_opp_diff.bin', 'Equal opportunity difference'),
    ('neg_pred_parity_diff.bin', 'Negative predictive parity difference'),
    ('pos_pred_parity_diff.bin', 'Positive predictive parity difference'),
    ('pred_equality_diff.bin', 'Predictive equality difference'),
    ('stat_parity.bin', 'Statistical parity'),
])
ratio_metrics = dict([
    ('acc_equality_ratio.bin', 'Accuracy equality ratio'),
    ('disp_impact.bin', 'Disparate impact'),
    ('equal_opp_ratio.bin', 'Equal opportunity ratio'),
    ('neg_pred_parity_ratio.bin', 'Negative predictive parity ratio'),
    ('pos_pred_parity_ratio.bin', 'Positive predictive parity ratio'),
    ('pred_equality_ratio.bin', 'Predictive equality ratio'),
])

In [None]:
def calculate_4diff(df, metric, epsilon=0):
    """Compute, per (gr, ir) group, the share of rows whose difference metric is (near) zero.

    The metric values are read from ``calculations_dir/<file name>`` with
    ``np.fromfile`` and cast to float16, placed next to the ``gr``/``ir`` columns
    of ``df`` and grouped by those two columns. The per-group result is written to
    ``calculations_dir/ppf_2d_<metric name>_e<epsilon>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``gr`` and ``ir`` columns. It is not modified.
    metric : tuple of (str, str)
        ``(file name, metric name)`` pair, e.g. one item of ``diff_metrics``.
    epsilon : float, default 0
        Tolerance. With ``0`` a row counts only when its value equals 0 exactly;
        otherwise a row counts when ``abs(value) < epsilon``.

    Raises
    ------
    ValueError
        If the metric file does not contain exactly one value per row of ``df``.
    """
    metric_f, metric_n = metric
    print(f'Calculating {metric_n} for ε={epsilon}', end='')

    def share_within_tolerance(group):
        # fraction of the group's rows that satisfy the (exact or ε-relaxed) criterion
        values = group['diff']
        hits = (values == 0) if epsilon == 0 else (np.abs(values) < epsilon)
        return np.sum(hits) / len(group)

    with open(path.join(calculations_dir, metric_f), 'rb') as f:
        diff_metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['diff'])
    if len(diff_metric) != len(df):
        # concat(axis=1) would otherwise pad the shorter side with NaN and skew the result
        raise ValueError(
            f'{metric_f} holds {len(diff_metric)} values but df has {len(df)} rows'
        )
    print('.', end='')

    # a new frame is built here, so the caller's df stays untouched
    combined = pd.concat([df, diff_metric], axis=1)
    print('.', end='')

    # group by GR and IR and count probabilities
    diff = combined.groupby(['gr', 'ir']).apply(share_within_tolerance).reset_index()
    print('.', end='')

    # save to file
    diff.to_csv(path.join(calculations_dir, f'ppf_2d_{metric_n}_e{epsilon}.csv'), index=False)
    print('.', end='')

    # release the large temporaries before returning
    del combined, diff_metric
    gc.collect()
    print(' done')

In [None]:
%%time

# load GR and IR files
with open(path.join(calculations_dir, 'gr.bin'), 'rb') as f:
    df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['gr'])
with open(path.join(calculations_dir, 'ir.bin'), 'rb') as f:
    df = pd.concat([df,
               pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['ir'])],
              axis=1)
print('Loaded GR and IR')

for metric in diff_metrics.items():
    for epsilon in [0, .01, .02]:
        calculate_4diff(df, metric, epsilon)
        gc.collect()

del df

In [None]:
def calculate_4ratio(df, metric, epsilon=0):
    """Compute, per (gr, ir) group, the share of rows whose ratio metric is (near) one.

    The metric values are read from ``calculations_dir/<file name>`` with
    ``np.fromfile`` and cast to float16, placed next to the ``gr``/``ir`` columns
    of ``df`` and grouped by those two columns. The per-group result is written to
    ``calculations_dir/ppf_2d_<metric name>_e<epsilon>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``gr`` and ``ir`` columns. It is not modified.
    metric : tuple of (str, str)
        ``(file name, metric name)`` pair, e.g. one item of ``ratio_metrics``.
    epsilon : float, default 0
        Tolerance. With ``0`` a row counts only when its value equals 1 exactly;
        otherwise a row counts when ``abs(value - 1) < epsilon``.

    Raises
    ------
    ValueError
        If the metric file does not contain exactly one value per row of ``df``.
    """
    metric_f, metric_n = metric
    print(f'Calculating {metric_n} for ε={epsilon}', end='')

    def share_within_tolerance(group):
        # fraction of the group's rows that satisfy the (exact or ε-relaxed) criterion
        values = group['r']
        hits = (values == 1) if epsilon == 0 else (np.abs(values - 1) < epsilon)
        return np.sum(hits) / len(group)

    with open(path.join(calculations_dir, metric_f), 'rb') as f:
        ratio_metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['r'])
    if len(ratio_metric) != len(df):
        # concat(axis=1) would otherwise pad the shorter side with NaN and skew the result
        raise ValueError(
            f'{metric_f} holds {len(ratio_metric)} values but df has {len(df)} rows'
        )
    print('.', end='')

    # a new frame is built here, so the caller's df stays untouched
    combined = pd.concat([df, ratio_metric], axis=1)
    print('.', end='')

    # group by GR and IR and count probabilities
    prob = combined.groupby(['gr', 'ir']).apply(share_within_tolerance).reset_index()
    print('.', end='')

    # save to file
    prob.to_csv(path.join(calculations_dir, f'ppf_2d_{metric_n}_e{epsilon}.csv'), index=False)
    print('.', end='')

    # release the large temporaries before returning
    del combined, ratio_metric
    gc.collect()
    print(' done')

In [None]:
%%time

# load GR and IR files
with open(path.join(calculations_dir, 'gr.bin'), 'rb') as f:
    df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['gr'])
with open(path.join(calculations_dir, 'ir.bin'), 'rb') as f:
    df = pd.concat([df,
                    pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['ir'])],
                   axis=1)
print('Loaded GR and IR')

for metric in ratio_metrics.items():
    for epsilon in [0, .01, .02]:
        calculate_4ratio(df, metric, epsilon)
        gc.collect()

del df

## Plotting heatmaps

In [None]:
def plot(metric, epsilon=0):
    """Draw and save the GR-by-IR heatmap for one metric and tolerance.

    Reads ``calculations_dir/ppf_2d_<metric name>_e<epsilon>.csv``, pivots it into
    a grid with ``gr`` on the rows and ``ir`` on the columns, renders the grid as a
    seaborn heatmap and saves it to
    ``plots_dir/aaa_heatmap_<metric name>_e<epsilon>.png``.

    Parameters
    ----------
    metric : tuple of (str, str)
        ``(file name, metric name)`` pair; only the metric name is used here.
    epsilon : float, default 0
        Tolerance that selects which CSV to read; it also appears in the title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was saved. It is left open, so the caller should close it.
    """
    _, metric_n = metric

    # load data
    diff = pd.read_csv(path.join(calculations_dir, f'ppf_2d_{metric_n}_e{epsilon}.csv'))

    # DataFrame.pivot accepts keyword arguments only since pandas 2.0;
    # the value column is stored under the header '0' in the CSV
    pvt = diff.pivot(index='gr', columns='ir', values='0')

    # plot
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(
        pvt,
        cmap='viridis',
        vmin=0, vmax=.2,    # fixed colour range keeps the heatmaps comparable
        square=True,
        xticklabels=[f'{x:.4f}' for x in pvt.columns],
        yticklabels=[f'{y:.4f}' for y in pvt.index],
        ax=ax,
    )

    # labels and layout
    ax.set_title(f'{metric_n}: probability of {"im" if epsilon != 0 else ""}perfect fairness (ε={epsilon})')
    ax.set_xlabel('IR')
    ax.set_ylabel('GR')
    fig.tight_layout()

    # invert y so that the smallest GR ends up at the bottom
    ax.invert_yaxis()

    fig.savefig(path.join(plots_dir, f'aaa_heatmap_{metric_n}_e{epsilon}.png'))
    return fig

In [None]:
# One heatmap per (difference metric, ε) pair. plot() writes the image to disk,
# so each figure is closed right away instead of being shown inline.
for metric, epsilon in ((entry, eps) for entry in diff_metrics.items() for eps in [0, .01, .02]):
    fig = plot(metric, epsilon)
    plt.close(fig)

In [None]:
# One heatmap per (ratio metric, ε) pair. plot() writes the image to disk,
# so each figure is closed right away instead of being shown inline.
for metric, epsilon in ((entry, eps) for entry in ratio_metrics.items() for eps in [0, .01, .02]):
    fig = plot(metric, epsilon)
    plt.close(fig)