# Probability of Perfect Fairness

Calculations of the probability of perfect fairness for different metrics, group ratios, and imbalance ratios.

In [None]:
import os
from os import path

import numpy as np
import pandas as pd

## Prepare data from all combinations

In [None]:
# Sample size of the data set to process (values used so far: 24, 56)
sample_size = 56

# epsilon == 0 gives the probability of exact perfect fairness; setting it to a
# small positive value instead gives the probability of being epsilon-close to
# perfect fairness
epsilon = 0

# Location of the generated data set for the chosen sample size
dataset_path = path.join(
    '..', 'fairness-data-generator', 'out', f'Set(08,{sample_size}).bin'
)

# Per-sample-size working directory: the metric/ratio .bin files are read from
# it and the resulting CSV files are written to it; created if missing
calculations_dir = path.join('out', 'calculations', f'n{sample_size}')
os.makedirs(calculations_dir, exist_ok=True)

## Calculate values for visualizations

In [None]:
# Difference-based fairness metrics: { metric file: metric name }.
# The files are looked up in `calculations_dir`; the metric names become the
# column headers of the result CSVs, in this insertion order.
diff_metrics = {
    'pos_pred_parity_diff.bin':  'Positive predictive parity difference',
    'acc_equality_diff.bin':     'Accuracy equality difference',
    'stat_parity.bin':           'Statistical parity difference',
    'equal_opp_diff.bin':        'Equal opportunity difference',
    'neg_pred_parity_diff.bin':  'Negative predictive parity difference',
    'pred_equality_diff.bin':    'Predictive equality difference',
}

In [None]:
def calculate_ppf_diff(df, metrics, ratio_type, epsilon=0, data_dir=None):
    """Compute per-ratio probabilities of perfect fairness for difference metrics.

    Each metric file is read as raw float64 values (down-cast to float16) and
    its rows are grouped by the value of ``df[ratio_type]``. Two fractions are
    computed for every group:

    * the fraction of rows whose difference is exactly 0 (``epsilon == 0``) or
      whose absolute difference is below ``epsilon`` (otherwise). NaN rows stay
      in the denominator; a group made up solely of NaNs yields NaN.
    * the fraction of rows whose difference is NaN.

    The first table is written to
    ``perfect_fairness_{ratio_type}_eps{epsilon}.csv`` and the second one to
    ``nans_{ratio_type}.csv``, both inside ``data_dir``. Each CSV holds an
    ``index`` column, one column per metric name and the ``ratio_type`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``ratio_type`` column. It is not modified.
    metrics : dict
        ``{metric file name: metric name}``; files are looked up in
        ``data_dir``.
    ratio_type : str
        Name of the column of ``df`` to group by.
    epsilon : float, default 0
        Tolerance around 0 within which a difference counts as perfectly fair.
    data_dir : str, optional
        Directory with the metric files, also used for the output CSVs.
        Defaults to the module-level ``calculations_dir``.

    Raises
    ------
    ValueError
        If ``metrics`` is empty, or if a metric file does not contain exactly
        one value per row of ``df``.
    """
    if not metrics:
        raise ValueError('metrics must contain at least one metric file')
    if data_dir is None:
        data_dir = calculations_dir

    # Group by position instead of concatenating on the index: this avoids
    # copying the whole frame for every metric and cannot silently misalign
    # ratio and metric values.
    group_keys = df[ratio_type].to_numpy()

    pf_probs, nan_probs = {}, {}
    group_names = []

    for metric_file, metric_name in metrics.items():
        print(metric_name)
        with open(path.join(data_dir, metric_file), 'rb') as f:
            diff = pd.Series(np.fromfile(f).astype(np.float16))
        if len(diff) != len(group_keys):
            raise ValueError(
                f'{metric_file} holds {len(diff)} values, '
                f'but df has {len(group_keys)} rows'
            )

        if epsilon == 0:
            is_fair = diff == 0
        else:
            is_fair = diff.abs() < epsilon

        # The mean of a boolean column is the fraction of True rows per group
        fractions = (
            pd.DataFrame({'fair': is_fair, 'nan': diff.isna()})
            .groupby(group_keys)
            .mean()
        )

        # A metric that is undefined for a whole group has no meaningful probability
        only_nans = fractions['nan'] == 1
        pf_probs[metric_name] = fractions['fair'].mask(only_nans).to_numpy()
        nan_probs[metric_name] = fractions['nan'].to_numpy()
        group_names = fractions.index.tolist()

    pf_df = pd.DataFrame(pf_probs)
    pf_df[ratio_type] = group_names
    pf_df = pf_df.reset_index()
    pf_df.to_csv(path.join(data_dir, f'perfect_fairness_{ratio_type}_eps{epsilon}.csv'), index=False)

    nan_df = pd.DataFrame(nan_probs)
    nan_df[ratio_type] = group_names
    nan_df = nan_df.reset_index()
    nan_df.to_csv(path.join(data_dir, f'nans_{ratio_type}.csv'), index=False)

In [None]:
%%time

# Run the difference-metric calculations once per ratio column
# ('ir' / 'gr' - presumably imbalance ratio and group ratio, cf. the notebook intro)
for ratio in ['ir', 'gr']:
    with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
        df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[ratio])
    calculate_ppf_diff(df, diff_metrics, ratio, epsilon)
    # Release the frame before the next file is loaded, so that two ratio
    # frames are never held in memory at the same time
    del df

## Ratio metrics

The ratio-based metrics below are not considered in the paper.

In [None]:
# Ratio-based fairness metrics: { metric file: metric name }.
# The files are looked up in `calculations_dir`; the metric names become the
# column headers of the result CSVs, in this insertion order.
ratio_metrics = {
    'acc_equality_ratio.bin':     'Accuracy equality ratio',
    'disp_impact.bin':            'Disparate impact',
    'equal_opp_ratio.bin':        'Equal opportunity ratio',
    'neg_pred_parity_ratio.bin':  'Negative predictive parity ratio',
    'pos_pred_parity_ratio.bin':  'Positive predictive parity ratio',
    'pred_equality_ratio.bin':    'Predictive equality ratio',
}

In [None]:
def calculate_ppf_ratio(df, metrics, ratio_type, epsilon=0, data_dir=None):
    """Compute per-ratio probabilities of perfect fairness for ratio metrics.

    Each metric file is read as raw float64 values (down-cast to float16) and
    its rows are grouped by the value of ``df[ratio_type]``. For every group
    the fraction of rows is computed whose metric value is exactly 1
    (``epsilon == 0``) or lies strictly within ``epsilon`` of 1 (otherwise).
    NaN values get no special treatment: they never count as fair but stay in
    the denominator.

    The table (the ``ratio_type`` column followed by one column per metric
    name) is written to ``ppf_{ratio_type}_ratios_eps{epsilon}.csv`` inside
    ``data_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``ratio_type`` column. It is not modified.
    metrics : dict
        ``{metric file name: metric name}``; files are looked up in
        ``data_dir``.
    ratio_type : str
        Name of the column of ``df`` to group by.
    epsilon : float, default 0
        Tolerance around 1 within which a ratio counts as perfectly fair.
    data_dir : str, optional
        Directory with the metric files, also used for the output CSV.
        Defaults to the module-level ``calculations_dir``.

    Raises
    ------
    ValueError
        If a metric file does not contain exactly one value per row of ``df``.
    """
    if data_dir is None:
        data_dir = calculations_dir

    # Group by position instead of concatenating on the index: this avoids
    # copying the whole frame for every metric and cannot silently misalign
    # ratio and metric values.
    group_keys = df[ratio_type].to_numpy()

    probs = {}
    for metric_file, metric_name in metrics.items():
        with open(path.join(data_dir, metric_file), 'rb') as f:
            values = pd.Series(np.fromfile(f).astype(np.float16))
        if len(values) != len(group_keys):
            raise ValueError(
                f'{metric_file} holds {len(values)} values, '
                f'but df has {len(group_keys)} rows'
            )

        if epsilon == 0:
            is_fair = values == 1
        else:
            is_fair = (values - 1).abs() < epsilon

        # The mean of a boolean Series is the fraction of True rows per group;
        # naming the index makes reset_index() emit a `ratio_type` column
        probs[metric_name] = is_fair.groupby(group_keys).mean().rename_axis(ratio_type)

    result = pd.DataFrame(probs)
    result.reset_index(inplace=True)
    result.to_csv(path.join(data_dir, f'ppf_{ratio_type}_ratios_eps{epsilon}.csv'), index=False)

In [None]:
%%time

# Load every ratio column once and reuse it for all tolerance values.
# The loop variable is deliberately not named `epsilon`, so the notebook-level
# `epsilon` setting is not overwritten by running this cell.
for ratio in ['gr', 'ir']:
    with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
        df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[ratio])
    for eps in [0, .01, .02]:
        calculate_ppf_ratio(df, ratio_metrics, ratio, eps)
    # Release the frame before the next ratio file is loaded
    del df

# Exploring the results

In [None]:
# Why are the extremes for PPPD/NPPD so high (prob ~.8)?
# Collect the rows where the ratio column equals 0, together with the metric values.
# NOTE: x0 is overwritten on every iteration, so after the loop it holds the
# rows of the last ratio ('ir') only.

epsilon = 0
for ratio in ['gr', 'ir']:
    with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
        df = pd.DataFrame({ratio: np.fromfile(f).astype(np.float16)})

    for metric_file, metric_name in {'pos_pred_parity_ratio.bin': 'Positive predictive parity ratio'}.items():
        with open(path.join(calculations_dir,  metric_file), 'rb') as f:
            metric_values = pd.DataFrame({metric_name: np.fromfile(f).astype(np.float16)})
        # column-wise concatenation: ratio values next to the metric values
        df = pd.concat([df, metric_values], axis=1)

        at_zero = df[ratio] == 0
        x0 = df[at_zero]
        del at_zero

    del df

x0

In [None]:
# Fraction of the selected (ratio == 0) rows whose metric value is NaN
x0['Positive predictive parity ratio'].isna().sum() / x0.shape[0]

In [None]:
# Number of the selected (ratio == 0) rows whose metric value is exactly 0
# NOTE(review): elsewhere in this notebook perfect fairness of a ratio metric is
# a value of 1, so this counts zeros rather than perfectly fair rows - confirm intent
(x0['Positive predictive parity ratio'] == 0).sum()