# Probability of Perfect Fairness

Calculations of the probability of perfect fairness for different metrics, group ratios and imbalance ratios.

In [None]:
import os
from os import path

import numpy as np
import pandas as pd

## Prepare data from all combinations

In [None]:
# Tolerance on a metric difference: 0 counts only exact perfect fairness, while a
# small positive value instead gives the probability of being epsilon-close to it.
epsilon = 0
# Selects the data set to use; it is part of the directory and file names below.
sample_size = 56

dataset_path = path.join('..', 'fairness-data-generator', 'out', f'Set(08,{sample_size}).bin')

# Per-sample-size directory for the calculation files; created when missing.
calculations_dir = path.join('out', 'calculations', f'n{sample_size}')
os.makedirs(calculations_dir, exist_ok=True)

## Calculate values for visualizations

In [None]:
# Maps the file name of each difference-based fairness metric (looked up inside
# the calculations directory) to the human-readable metric name used as the
# column header in the result tables. Insertion order fixes the column order.
# NOTE(review): 'stat_parity.bin' is the only file name without the '_diff'
# suffix -- confirm it matches the file that is actually on disk.
diff_metrics = {
    'pos_pred_parity_diff.bin':
        'Positive predictive parity difference',
    'acc_equality_diff.bin':
        'Accuracy equality difference',
    'stat_parity.bin':
        'Statistical parity difference',
    'equal_opp_diff.bin':
        'Equal opportunity difference',
    'neg_pred_parity_diff.bin':
        'Negative predictive parity difference',
    'pred_equality_diff.bin':
        'Predictive equality difference',
}

In [None]:
def calculate_ppf_diff(df, metrics, ratio_type, epsilon=0, *, data_dir=None):
    """Compute, per ratio value, the probability of perfect fairness and of NaNs.

    For every metric file the raw values are read with ``np.fromfile`` (default
    dtype), cast to float16 and attached to ``df`` as a ``'diff'`` column. The
    rows are then grouped by ``ratio_type`` and for each group two fractions
    are computed:

    * perfect fairness: the share of rows whose difference is exactly 0
      (``epsilon == 0``) or whose absolute difference is below ``epsilon``.
      NaN rows stay in the denominator; a group made only of NaNs yields NaN.
    * NaNs: the share of rows whose difference is NaN.

    Results are written to ``perfect_fairness_{ratio_type}_eps{epsilon}.csv``
    and ``nans_{ratio_type}.csv`` inside ``data_dir``. Nothing is returned.

    Args:
        df: DataFrame holding the ``ratio_type`` column. It is not modified.
        metrics: Mapping ``{file name: metric name}``; must not be empty.
        ratio_type: Name of the column in ``df`` to group by.
        epsilon: Tolerance around 0 that still counts as perfectly fair.
        data_dir: Directory the metric files are read from and the CSV files
            are written to. Defaults to the module-level ``calculations_dir``.

    Raises:
        ValueError: If ``metrics`` is empty (there would be nothing to write).
    """
    if not metrics:
        raise ValueError('metrics must map at least one file name to a metric name')
    if data_dir is None:
        # keep the original behaviour: use the notebook-wide directory
        data_dir = calculations_dir

    def count_fair(diff):
        """Count the entries of ``diff`` that are (epsilon-close to) zero."""
        if epsilon == 0:
            return np.sum(diff == 0)
        return np.sum(np.abs(diff) < epsilon)

    pf_probs, nan_probs = {}, {}

    for metric_file, metric_name in metrics.items():
        print(metric_name)
        with open(path.join(data_dir, metric_file), 'rb') as f:
            diff_metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['diff'])
        # concat builds a new frame, so the caller's df never gains a 'diff' column
        with_diff = pd.concat([df, diff_metric], axis=1)
        del diff_metric

        pf_rows, nan_rows = [], []
        for gn, group in with_diff.groupby(ratio_type):
            n_rows = len(group)
            n_nans = group['diff'].isna().sum()
            if n_nans == n_rows:
                # the metric is undefined for the whole group
                pf_rows.append([gn, np.nan])
            else:
                pf_rows.append([gn, count_fair(group['diff']) / n_rows])
            nan_rows.append([gn, n_nans / n_rows])
        # release the combined frame before the next metric is loaded
        del with_diff

        pf_bygroup = pd.DataFrame(pf_rows, columns=[ratio_type, 'diff'])
        pf_probs[metric_name] = pf_bygroup['diff']

        nans_bygroup = pd.DataFrame(nan_rows, columns=[ratio_type, 'diff'])
        nan_probs[metric_name] = nans_bygroup['diff']

    # every metric is grouped by the same column, so the keys of the last one apply to all
    pf_probs[ratio_type] = pf_bygroup[ratio_type]
    pf_df = pd.DataFrame(pf_probs).reset_index()
    pf_df.to_csv(path.join(data_dir, f'perfect_fairness_{ratio_type}_eps{epsilon}.csv'), index=False)

    nan_probs[ratio_type] = nans_bygroup[ratio_type]
    nan_df = pd.DataFrame(nan_probs).reset_index()
    nan_df.to_csv(path.join(data_dir, f'nans_{ratio_type}.csv'), index=False)

In [None]:
%%time

# Run the calculation once per grouping column: each '<ratio>.bin' file in the
# calculations directory supplies the values of that column (cast to float16).
for ratio in ('ir', 'gr'):
    with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
        df = pd.DataFrame({ratio: np.fromfile(f).astype(np.float16)})
    calculate_ppf_diff(df, diff_metrics, ratio, epsilon)
# drop the reference to the last frame so its memory can be reclaimed
del df