# Probability of Perfect Fairness

calculations for different metrics, group ratios and imbalance ratios

In [1]:
import os
from os import path

import numpy as np
import pandas as pd

## Prepare data from all combinations

In [2]:
data_cols = [
    'i_tp',     # minority true positive
    'i_fp',     # minority false positive
    'i_tn',     # minority true negative
    'i_fn',     # minority false negative
    'j_tp',     # majority true positive
    'j_fp',     # majority false positive
    'j_tn',     # majority true negative
    'j_fn',     # majority false negative
]
sample_size = 56    # 24
epsilon = 1./50.

calculations_dir = path.join('out', 'calculations', f'n{sample_size}')
os.makedirs(calculations_dir, exist_ok=True)
dataset_path = path.join('..', 'fairness-data-generator', 'out', f'Set(08,{sample_size}).bin')

## Calculate values for visualizations

In [7]:
diff_metrics = {    # { file: metric name }
    'acc_equality_diff.bin': 'Accuracy equality difference',
    'equal_opp_diff.bin': 'Equal opportunity difference',
    'neg_pred_parity_diff.bin': 'Negative predictive parity difference',
    'pos_pred_parity_diff.bin': 'Positive predictive parity difference',
    'pred_equality_diff.bin': 'Predictive equality difference',
    'stat_parity.bin': 'Statistical parity'
}

In [8]:
def calculate_ppf_diff(df, metrics, ratio_type, epsilon=0):
    diff_probs = {}

    if epsilon == 0:
        compute_diff_prob = lambda df: np.sum(df['diff'] == 0) / len(df)
    else:
        compute_diff_prob = lambda df: np.sum(np.abs(df['diff']) < epsilon) / len(df)

    for metric_file, metric_name in metrics.items():
        with open(path.join(calculations_dir,  metric_file), 'rb') as f:
            diff_metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['diff'])
        df = pd.concat([df, diff_metric], axis=1)

        diff = df.groupby(ratio_type).apply(compute_diff_prob)
        diff_probs[metric_name] = diff

        df.drop('diff', axis=1, inplace=True)

    result = pd.DataFrame(diff_probs)
    result.reset_index(inplace=True)
    result.to_csv(path.join(calculations_dir, f'ppf_{ratio_type}_eps{epsilon}.csv'), index=False)

In [9]:
%%time

for ratio in ['gr', 'ir']:
    with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
        df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[ratio])
    calculate_ppf_diff(df, diff_metrics, ratio, epsilon)
del df

CPU times: user 7min 22s, sys: 3min 46s, total: 11min 8s
Wall time: 11min 51s


## Ratio metrics

In [3]:
ratio_metrics = {    # { file: metric name }
    'acc_equality_ratio.bin': 'Accuracy equality ratio',
    'disp_impact.bin': 'Disparate impact',
    'equal_opp_ratio.bin': 'Equal opportunity ratio',
    'neg_pred_parity_ratio.bin': 'Negative predictive parity ratio',
    'pos_pred_parity_ratio.bin': 'Positive predictive parity ratio',
    # 'pred_parity_ratio.bin': 'Positive predictive parity ratio',
    'pred_equality_ratio.bin': 'Predictive equality ratio',

}

In [4]:
def calculate_ppf_ratio(df, metrics, ratio_type, epsilon=0):
    probs = {}
    if epsilon == 0:
        compute_prob = lambda df: np.sum(df['r'] == 1) / len(df)
    else:
        compute_prob = lambda df: np.sum(np.abs(df['r'] - 1) < epsilon) / len(df)

    for metric_file, metric_name in metrics.items():
        with open(path.join(calculations_dir,  metric_file), 'rb') as f:
            metric_values = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=['r'])
        df = pd.concat([df, metric_values], axis=1)

        metric_probs = df.groupby(ratio_type).apply(compute_prob)
        probs[metric_name] = metric_probs

        df.drop('r', axis=1, inplace=True)

    result = pd.DataFrame(probs)
    result.reset_index(inplace=True)
    result.to_csv(path.join(calculations_dir, f'ppf_{ratio_type}_ratios_eps{epsilon}.csv'), index=False)

In [5]:
%%time

for epsilon in [0, .01, .02]:
    for ratio in ['gr', 'ir']:
        with open(path.join(calculations_dir, f'{ratio}.bin'), 'rb') as f:
            df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns=[ratio])
        calculate_ppf_ratio(df, ratio_metrics, ratio, epsilon)
        del df

CPU times: user 19min 3s, sys: 9min 5s, total: 28min 8s
Wall time: 29min 33s
