# Shared calculations for histograms and perfect fairness, for all metrics

In this file, a slightly different naming convention is used: `i` for m _i_ nority and `j` for ma _j_ ority group.

## Setup

In [1]:
import gc
import os
import pickle
from os import path
from time import perf_counter, perf_counter_ns

import numpy as np
import pandas as pd

from utils import *  # metrics functions

In [2]:
data_cols = [
    'i_tp',     # minority true positive
    'i_fp',     # minority false positive
    'i_tn',     # minority true negative
    'i_fn',     # minority false negative
    'j_tp',     # majority true positive
    'j_fp',     # majority false positive
    'j_tn',     # majority true negative
    'j_fn',     # majority false negative
]

# TODO update to 56
sample_size = 24

calculations_dir = path.join('out', 'calculations', f'n{sample_size}')
timer_dir = path.join('out', 'time')
os.makedirs(calculations_dir, exist_ok=True)
os.makedirs(timer_dir, exist_ok=True)

# reset the (default) timer file
try:
    os.remove(path.join(timer_dir, 'metrics_calculations.csv'))
except FileNotFoundError:
    pass

dataset_path = path.join('out', f'Set(08,{sample_size}).bin')

In [3]:
with open(dataset_path, "rb") as f:
    df = pd.DataFrame(pickle.load(f), columns=data_cols)

df.head()

# Part 1: ratios and basic metrics

For each row, its group and imbalance ratio is calculated. Then, metrics that derive directly from these ratios are calculated.

In [4]:
def timer_checkpoint(label: str):
    timer.append((
        label,
        perf_counter()
    ))

def timer_print():
    t0 = timer[0][1]
    for l, t in timer[1:]:
        print(f'{l}:\t{t - t0}')

def timer_save(fn='metrics_calculations.csv'):
    with open(path.join(timer_dir, fn), 'a+') as f:
        t0 = timer[0][1]
        for l, t in timer[1:]:
            f.write(f'{l}, {t - t0}\n')
        f.write('\n')

In [5]:
timer = [('t0', perf_counter())]
# t0 = perf_counter()

# Calculate group ratios
with open(path.join(calculations_dir, "gr.bin"), "wb+") as f:
    get_group_ratios(df).to_numpy().tofile(f)
timer_checkpoint('Calculate group ratios')

# Calculate imbalance ratios
with open(path.join(calculations_dir, "ir.bin"), "wb+") as f:
    get_imbalance_ratios(df).to_numpy().tofile(f)
timer_checkpoint('Calculate imbalance ratios')

# calculate metrics
with open(path.join(calculations_dir, "i_tpr.bin"), "wb+") as f:
    getTPR_i(df).to_numpy().tofile(f)
timer_checkpoint('getTPR_i')

with open(path.join(calculations_dir, "j_tpr.bin"), "wb+") as f:
    getTPR_j(df).to_numpy().tofile(f)
timer_checkpoint('getTPR_j')

with open(path.join(calculations_dir, "i_fpr.bin"), "wb+") as f:
    getFPR_i(df).to_numpy().tofile(f)
timer_checkpoint('getFPR_i')

with open(path.join(calculations_dir, "j_fpr.bin"), "wb+") as f:
    getFPR_j(df).to_numpy().tofile(f)
timer_checkpoint('getFPR_j')

with open(path.join(calculations_dir, "i_ppv.bin"), "wb+") as f:
    get_positive_predictive_value_i(df).to_numpy().tofile(f)
timer_checkpoint('get_positive_predictive_value_i')

with open(path.join(calculations_dir, "j_ppv.bin"), "wb+") as f:
    get_positive_predictive_value_j(df).to_numpy().tofile(f)
timer_checkpoint('get_positive_predictive_value_j')

with open(path.join(calculations_dir, "i_npv.bin"), "wb+") as f:
    get_negative_predictive_value_i(df).to_numpy().tofile(f)
timer_checkpoint('get_negative_predictive_value_i')

with open(path.join(calculations_dir, "j_npv.bin"), "wb+") as f:
    get_negative_predictive_value_j(df).to_numpy().tofile(f)
timer_checkpoint('get_negative_predictive_value_j')
    
with open(path.join(calculations_dir, "stat_parity.bin"), "wb+") as f:
    get_statistical_parity(df).to_numpy().tofile(f)
timer_checkpoint('get_statistical_parity')

# with open(path.join(calculations_dir, "disp_impact.bin"), "wb+") as f:
#     get_disparate_impact(df).to_numpy().tofile(f)
# timer_checkpoint('get_disparate_impact')

# with open(path.join(calculations_dir, "acc_equality_ratio.bin"), "wb+") as f:
#     get_acc_equality_ratio(df).to_numpy().tofile(f)
# timer_checkpoint('get_acc_equality_ratio')

with open(path.join(calculations_dir, "acc_equality_diff.bin"), "wb+") as f:
    get_acc_equality_diff(df).to_numpy().tofile(f)
timer_checkpoint('get_acc_equality_diff')

timer_print()
timer_save()

# Free the memory
# del df
# gc.collect()
# gc.get_stats()

Calculate group ratios:	0.08875179500319064
Calculate imbalance ratios:	0.16748862899839878
getTPR_i:	0.20173623994924128
getTPR_j:	0.2356706679565832
getFPR_i:	0.2713941929396242
getFPR_j:	0.30812228994909674
get_positive_predictive_value_i:	0.34176465997006744
get_positive_predictive_value_j:	0.3753490230301395
get_negative_predictive_value_i:	0.4091764659387991
get_negative_predictive_value_j:	0.44317842891905457
get_statistical_parity:	0.5310801429441199
get_acc_equality_diff:	0.6276743049966171


# Part 2: Get additional calculations

Calculations that are based on the previous ones. Some files from the previous part are used here, and new ones are created.

In [6]:
timer = [('t0', perf_counter())]

with open(path.join(calculations_dir, "i_tpr.bin"), "rb") as f:
    i_tpr = pd.DataFrame(np.fromfile(f), columns=["i_tpr"])
timer_checkpoint('i_tpr')

with open(path.join(calculations_dir, "j_tpr.bin"), "rb") as f:
    j_tpr = pd.DataFrame(np.fromfile(f), columns=["j_tpr"])
timer_checkpoint('j_tpr')

# with open(path.join(calculations_dir, "equal_opp_ratio.bin"), "wb+") as f:
#     get_equal_opp_ratio(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)
# timer_checkpoint('get_equal_opp_ratio')
    
with open(path.join(calculations_dir, "equal_opp_diff.bin"), "wb+") as f:
    get_equal_opp_diff(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)
timer_checkpoint('get_equal_opp_diff')

timer_print()
timer_save()

del j_tpr
del i_tpr
gc.collect()

i_tpr:	0.012874041101895273
j_tpr:	0.02495126600842923
get_equal_opp_diff:	0.0607953310245648


7

In [7]:
timer = [('t0', perf_counter())]

with open(path.join(calculations_dir, "i_fpr.bin"), "rb") as f:
    i_fpr = pd.DataFrame(np.fromfile(f), columns=["i_fpr"])
timer_checkpoint('i_fpr')

with open(path.join(calculations_dir, "j_fpr.bin"), "rb") as f:
    j_fpr = pd.DataFrame(np.fromfile(f), columns=["j_fpr"])
timer_checkpoint('j_fpr')

# with open(path.join(calculations_dir, "pred_equality_ratio.bin"), "wb+") as f:
#     get_pred_equality_ratio(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)

with open(path.join(calculations_dir, "pred_equality_diff.bin"), "wb+") as f:
    get_pred_equality_diff(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)
timer_checkpoint('get_pred_equality_diff')

timer_print()
timer_save()

del j_fpr
del i_fpr
gc.collect()

i_fpr:	0.007984917028807104
j_fpr:	0.020318961003795266
get_pred_equality_diff:	0.05109178100246936


0

In [8]:
with open(path.join(calculations_dir, "i_ppv.bin"), "rb") as f:
    i_ppv = pd.DataFrame(np.fromfile(f), columns=["i_ppv"])
timer_checkpoint('i_ppv')

with open(path.join(calculations_dir, "j_ppv.bin"), "rb") as f:
    j_ppv = pd.DataFrame(np.fromfile(f), columns=["j_ppv"])
timer_checkpoint('j_ppv')

# with open(path.join(calculations_dir, "pos_pred_parity_ratio.bin"), "wb+") as f:
#     get_pos_pred_parity_ratio(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

with open(path.join(calculations_dir, "pos_pred_parity_diff.bin"), "wb+") as f:
    get_pos_pred_parity_diff(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)
timer_checkpoint('get_pos_pred_parity_diff')

timer_print()
timer_save()

del j_ppv
del i_ppv
gc.collect()

i_fpr:	0.007984917028807104
j_fpr:	0.020318961003795266
get_pred_equality_diff:	0.05109178100246936
i_ppv:	0.12837609101552516
j_ppv:	0.14048844808712602
get_pos_pred_parity_diff:	0.17137813405133784


0

In [9]:
with open(path.join(calculations_dir, "i_npv.bin"), "rb") as f:
    i_npv = pd.DataFrame(np.fromfile(f), columns=["i_npv"])
timer_checkpoint('i_npv')

with open(path.join(calculations_dir, "j_npv.bin"), "rb") as f:
    j_npv = pd.DataFrame(np.fromfile(f), columns=["j_npv"])
timer_checkpoint('j_npv')

with open(path.join(calculations_dir, "neg_pred_parity_ratio.bin"), "wb+") as f:
    get_neg_pred_parity_ratio(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)
timer_checkpoint('get_neg_pred_parity_ratio')

with open(path.join(calculations_dir, "neg_pred_parity_diff.bin"), "wb+") as f:
    get_neg_pred_parity_diff(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)
timer_checkpoint('get_neg_pred_parity_diff')

timer_print()
timer_save()

del j_npv
del i_npv
gc.collect()

i_fpr:	0.007984917028807104
j_fpr:	0.020318961003795266
get_pred_equality_diff:	0.05109178100246936
i_ppv:	0.12837609101552516
j_ppv:	0.14048844808712602
get_pos_pred_parity_diff:	0.17137813405133784
i_npv:	0.24896213004831225
j_npv:	0.2614579671062529
get_neg_pred_parity_ratio:	0.2941047390922904
get_neg_pred_parity_diff:	0.31933507102075964


7