# Shared calculations for histograms and perfect fairness, for all metrics

## Import libraries and define variables

In [1]:
import numpy as np
import pandas as pd
import pickle
import gc
import glob
import os
from os import path
from utils import *     # metric functions

In [2]:
# Confusion-matrix count columns, one set per group:
#   prefix `i_` = minority group, prefix `j_` = majority group;
#   suffix tp/fp/tn/fn = true positive / false positive / true negative / false negative.
data_cols = [
    f'{group}_{outcome}'
    for group in ('i', 'j')
    for outcome in ('tp', 'fp', 'tn', 'fn')
]

sample_size = 24        # 56

# Results for each sample size go to their own sub-directory, created on demand.
calculations_dir = os.path.join('out', 'calculations', f'n{sample_size}')
os.makedirs(calculations_dir, exist_ok=True)

# Pickled input dataset for the chosen sample size.
dataset_path = os.path.join(
    '..', 'fairness-data-generator', 'out', f'Set(08,{sample_size}).bin'
)

# Get calculations
As the dataset is quite large (4.2 GB), we write the calculations to separate files in 2 stages.

## Write calculations of the 1st half of the dataset

In [5]:
# Load the pickled dataset and keep only the rows before the midpoint
with open(dataset_path, "rb") as f:
    df = pd.DataFrame(pickle.load(f), columns=data_cols)

halfIdx = len(df) // 2
df = df.head(halfIdx)
df.head()

Unnamed: 0,i_tp,i_fp,i_tn,i_fn,j_tp,j_fp,j_tn,j_fn
0,24,0,0,0,0,0,0,0
1,23,1,0,0,0,0,0,0
2,23,0,1,0,0,0,0,0
3,23,0,0,1,0,0,0,0
4,23,0,0,0,1,0,0,0


In [6]:
# Print column dtypes, non-null counts and the deep memory usage of the loaded half
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314787 entries, 0 to 1314786
Data columns (total 8 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   i_tp    1314787 non-null  int8 
 1   i_fp    1314787 non-null  int8 
 2   i_tn    1314787 non-null  int8 
 3   i_fn    1314787 non-null  int8 
 4   j_tp    1314787 non-null  int8 
 5   j_fp    1314787 non-null  int8 
 6   j_tn    1314787 non-null  int8 
 7   j_fn    1314787 non-null  int8 
dtypes: int8(8)
memory usage: 10.0 MB


In [7]:
# Output file -> function computing that metric from the currently loaded half.
# Mode "wb+" truncates, so every file starts afresh with the first-half values.
first_half_outputs = (
    ("gr.bin", get_group_ratios),                       # half of GRs
    ("ir.bin", get_imbalance_ratios),                   # half of IRs
    ("i_tpr.bin", getTPR_i),
    ("j_tpr.bin", getTPR_j),
    ("i_fpr.bin", getFPR_i),
    ("j_fpr.bin", getFPR_j),
    ("i_ppv.bin", get_positive_predictive_value_i),
    ("j_ppv.bin", get_positive_predictive_value_j),
    ("i_npv.bin", get_negative_predictive_value_i),
    ("j_npv.bin", get_negative_predictive_value_j),
    ("stat_parity.bin", get_statistical_parity),
    ("disp_impact.bin", get_disparate_impact),
    ("acc_equality_ratio.bin", get_acc_equality_ratio),
    ("acc_equality_diff.bin", get_acc_equality_diff),
)

for out_name, compute_metric in first_half_outputs:
    with open(path.join(calculations_dir, out_name), "wb+") as f:
        compute_metric(df).to_numpy().tofile(f)

# Free the memory
del df
gc.collect()
gc.get_stats()

[{'collections': 418, 'collected': 2146, 'uncollectable': 0},
 {'collections': 38, 'collected': 1113, 'uncollectable': 0},
 {'collections': 4, 'collected': 575, 'uncollectable': 0}]

## Append calculations of the 2nd half of the dataset

In [8]:
# Reload the pickled dataset and keep only the rows from the midpoint onwards
with open(dataset_path, "rb") as f:
    df = pd.DataFrame(pickle.load(f), columns=data_cols)

halfIdx = len(df) // 2
df = df.iloc[halfIdx:]

In [9]:
# Output file -> function computing that metric from the currently loaded half.
# Mode "ab+" appends, so these values land after what the files already hold.
second_half_outputs = (
    ("gr.bin", get_group_ratios),
    ("ir.bin", get_imbalance_ratios),
    ("i_tpr.bin", getTPR_i),
    ("j_tpr.bin", getTPR_j),
    ("i_fpr.bin", getFPR_i),
    ("j_fpr.bin", getFPR_j),
    ("i_ppv.bin", get_positive_predictive_value_i),
    ("j_ppv.bin", get_positive_predictive_value_j),
    ("i_npv.bin", get_negative_predictive_value_i),
    ("j_npv.bin", get_negative_predictive_value_j),
    ("stat_parity.bin", get_statistical_parity),
    ("disp_impact.bin", get_disparate_impact),
    ("acc_equality_ratio.bin", get_acc_equality_ratio),
    ("acc_equality_diff.bin", get_acc_equality_diff),
)

for out_name, compute_metric in second_half_outputs:
    with open(path.join(calculations_dir, out_name), "ab+") as f:
        compute_metric(df).to_numpy().tofile(f)

# Free the memory
del df
gc.collect()
gc.get_stats()

[{'collections': 421, 'collected': 2146, 'uncollectable': 0},
 {'collections': 38, 'collected': 1113, 'uncollectable': 0},
 {'collections': 5, 'collected': 575, 'uncollectable': 0}]

In [10]:
# Portable listing of the generated files with their sizes in bytes.
# The `!ls` shell escape is not available in the Windows command shell, and the
# files are written to `calculations_dir`, not to a directory named `calculations`.
for entry in sorted(os.scandir(calculations_dir), key=lambda e: e.name):
    print(f"{entry.stat().st_size:>14,d}  {entry.name}")


The files are written as `float64`, while the dataset has dtype `int8`. Each file is therefore the same size as the whole dataset: it stores a single column, but every value takes 8 bytes, as much as one row of the dataset's 8 `int8` columns.

# Get additional calculations
These calculations will be based on the previous ones.

## Write 1st part
Here the story is even worse as we need to open 2 files of the same size, so we will do it the same way: in 2 stages.

In [11]:
# First half of the stored minority-group (i) TPR values
i_tpr = pd.DataFrame({"i_tpr": np.fromfile(path.join(calculations_dir, "i_tpr.bin"))})
halfIdx = len(i_tpr) // 2
i_tpr = i_tpr.iloc[:halfIdx]

# First half of the stored majority-group (j) TPR values
j_tpr = pd.DataFrame({"j_tpr": np.fromfile(path.join(calculations_dir, "j_tpr.bin"))})
halfIdx = len(j_tpr) // 2
j_tpr = j_tpr.iloc[:halfIdx]

# "wb+" truncates: both derived files start afresh with the first-half results
for out_name, metric_fn in (
    ("equal_opp_ratio.bin", get_equal_opp_ratio),
    ("equal_opp_diff.bin", get_equal_opp_diff),
):
    with open(path.join(calculations_dir, out_name), "wb+") as f:
        metric_fn(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)

del j_tpr, i_tpr
gc.collect()

0

In [12]:
# First half of the stored minority-group (i) FPR values
i_fpr = pd.DataFrame({"i_fpr": np.fromfile(path.join(calculations_dir, "i_fpr.bin"))})
halfIdx = len(i_fpr) // 2
i_fpr = i_fpr.iloc[:halfIdx]

# First half of the stored majority-group (j) FPR values
j_fpr = pd.DataFrame({"j_fpr": np.fromfile(path.join(calculations_dir, "j_fpr.bin"))})
halfIdx = len(j_fpr) // 2
j_fpr = j_fpr.iloc[:halfIdx]

# "wb+" truncates: both derived files start afresh with the first-half results
for out_name, metric_fn in (
    ("pred_equality_ratio.bin", get_pred_equality_ratio),
    ("pred_equality_diff.bin", get_pred_equality_diff),
):
    with open(path.join(calculations_dir, out_name), "wb+") as f:
        metric_fn(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)

del j_fpr, i_fpr
gc.collect()

0

In [13]:
# First half of the stored minority-group (i) PPV values
i_ppv = pd.DataFrame({"i_ppv": np.fromfile(path.join(calculations_dir, "i_ppv.bin"))})
halfIdx = len(i_ppv) // 2
i_ppv = i_ppv.iloc[:halfIdx]

# First half of the stored majority-group (j) PPV values
j_ppv = pd.DataFrame({"j_ppv": np.fromfile(path.join(calculations_dir, "j_ppv.bin"))})
halfIdx = len(j_ppv) // 2
j_ppv = j_ppv.iloc[:halfIdx]

# "wb+" truncates: both derived files start afresh with the first-half results
for out_name, metric_fn in (
    ("pred_parity_ratio.bin", get_pred_parity_ratio),
    ("pos_pred_parity_diff.bin", get_pos_pred_parity_diff),
):
    with open(path.join(calculations_dir, out_name), "wb+") as f:
        metric_fn(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

del j_ppv, i_ppv
gc.collect()

0

In [14]:
# First half of the stored minority-group (i) NPV values
i_npv = pd.DataFrame({"i_npv": np.fromfile(path.join(calculations_dir, "i_npv.bin"))})
halfIdx = len(i_npv) // 2
i_npv = i_npv.iloc[:halfIdx]

# First half of the stored majority-group (j) NPV values
j_npv = pd.DataFrame({"j_npv": np.fromfile(path.join(calculations_dir, "j_npv.bin"))})
halfIdx = len(j_npv) // 2
j_npv = j_npv.iloc[:halfIdx]

# "wb+" truncates: both derived files start afresh with the first-half results
for out_name, metric_fn in (
    ("neg_pred_parity_ratio.bin", get_neg_pred_parity_ratio),
    ("neg_pred_parity_diff.bin", get_neg_pred_parity_diff),
):
    with open(path.join(calculations_dir, out_name), "wb+") as f:
        metric_fn(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

del j_npv, i_npv
gc.collect()

0

## Append 2nd part

In [15]:
# Second half (midpoint onwards) of the stored minority-group (i) TPR values
i_tpr = pd.DataFrame({"i_tpr": np.fromfile(path.join(calculations_dir, "i_tpr.bin"))})
halfIdx = len(i_tpr) // 2
i_tpr = i_tpr.iloc[halfIdx:]

# Second half (midpoint onwards) of the stored majority-group (j) TPR values
j_tpr = pd.DataFrame({"j_tpr": np.fromfile(path.join(calculations_dir, "j_tpr.bin"))})
halfIdx = len(j_tpr) // 2
j_tpr = j_tpr.iloc[halfIdx:]

# "ab+" appends: the results land after what the derived files already hold
for out_name, metric_fn in (
    ("equal_opp_ratio.bin", get_equal_opp_ratio),
    ("equal_opp_diff.bin", get_equal_opp_diff),
):
    with open(path.join(calculations_dir, out_name), "ab+") as f:
        metric_fn(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)

del j_tpr, i_tpr
gc.collect()

0

In [16]:
# Second half (midpoint onwards) of the stored minority-group (i) FPR values
i_fpr = pd.DataFrame({"i_fpr": np.fromfile(path.join(calculations_dir, "i_fpr.bin"))})
halfIdx = len(i_fpr) // 2
i_fpr = i_fpr.iloc[halfIdx:]

# Second half (midpoint onwards) of the stored majority-group (j) FPR values
j_fpr = pd.DataFrame({"j_fpr": np.fromfile(path.join(calculations_dir, "j_fpr.bin"))})
halfIdx = len(j_fpr) // 2
j_fpr = j_fpr.iloc[halfIdx:]

# "ab+" appends: the results land after what the derived files already hold
for out_name, metric_fn in (
    ("pred_equality_ratio.bin", get_pred_equality_ratio),
    ("pred_equality_diff.bin", get_pred_equality_diff),
):
    with open(path.join(calculations_dir, out_name), "ab+") as f:
        metric_fn(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)

del j_fpr, i_fpr
gc.collect()

0

In [17]:
# Second half (midpoint onwards) of the stored minority-group (i) PPV values
i_ppv = pd.DataFrame({"i_ppv": np.fromfile(path.join(calculations_dir, "i_ppv.bin"))})
halfIdx = len(i_ppv) // 2
i_ppv = i_ppv.iloc[halfIdx:]

# Second half (midpoint onwards) of the stored majority-group (j) PPV values
j_ppv = pd.DataFrame({"j_ppv": np.fromfile(path.join(calculations_dir, "j_ppv.bin"))})
halfIdx = len(j_ppv) // 2
j_ppv = j_ppv.iloc[halfIdx:]

# "ab+" appends: the results land after what the derived files already hold
for out_name, metric_fn in (
    ("pred_parity_ratio.bin", get_pred_parity_ratio),
    ("pos_pred_parity_diff.bin", get_pos_pred_parity_diff),
):
    with open(path.join(calculations_dir, out_name), "ab+") as f:
        metric_fn(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

del j_ppv, i_ppv
gc.collect()

0

In [18]:
# Second half (midpoint onwards) of the stored minority-group (i) NPV values.
# This is the "append" stage, so the slice must be `[halfIdx:]`; taking
# `[:halfIdx]` here would append the first half a second time and never
# write the second half.
with open(path.join(calculations_dir,  "i_npv.bin"), "rb") as f:
    i_npv = pd.DataFrame(np.fromfile(f), columns=["i_npv"])
    halfIdx = int(i_npv.shape[0] / 2)
    i_npv = i_npv.iloc[halfIdx:]

# Second half (midpoint onwards) of the stored majority-group (j) NPV values
with open(path.join(calculations_dir,  "j_npv.bin"), "rb") as f:
    j_npv = pd.DataFrame(np.fromfile(f), columns=["j_npv"])
    halfIdx = int(j_npv.shape[0] / 2)
    j_npv = j_npv.iloc[halfIdx:]

# "ab+" appends: the results land after what the derived files already hold
with open(path.join(calculations_dir,  "neg_pred_parity_ratio.bin"), "ab+") as f:
    get_neg_pred_parity_ratio(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

with open(path.join(calculations_dir,  "neg_pred_parity_diff.bin"), "ab+") as f:
    get_neg_pred_parity_diff(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

# Free the memory
del j_npv
del i_npv
gc.collect()

0

## Calculations for sonya plot

(extended in `ppf_calculations.ipynb`)

In [19]:
def calculate_sonya(df, ratio_type):
    """Write, per ratio value, the share of rows whose difference metric is 0.

    For each difference-metric file in ``calculations_dir`` the stored values
    are loaded, placed next to ``df`` as a ``diff`` column and grouped by
    ``df[ratio_type]``. For every group the fraction of rows with
    ``diff == 0`` is recorded. The result has one row per distinct ratio
    value and one column per metric, and is saved as
    ``ppf_<ratio_type>.csv`` in ``calculations_dir``.

    Args:
        df: DataFrame containing the column ``ratio_type``. Rows are matched
            to the metric values by index label; the metric values get a
            default 0..n-1 index. ``df`` itself is not modified.
        ratio_type: Name of the grouping column in ``df``; also used in the
            output file name.

    Returns:
        None. The table is written to disk.
    """
    diff_metrics = {
        'acc_equality_diff.bin': 'Accuracy equality difference',
        'equal_opp_diff.bin': 'Equal opportunity difference',
        'neg_pred_parity_diff.bin': 'Negative predictive parity difference',
        'pos_pred_parity_diff.bin': 'Positive predictive parity difference',
        'pred_equality_diff.bin': 'Predictive equality difference',
        'stat_parity.bin': 'Statistical parity'
    }

    diff_probs = {}
    for metric_file, metric_label in diff_metrics.items():
        # float16 takes a quarter of the memory of the stored float64 values.
        # NOTE: the cast also rounds magnitudes below float16's smallest
        # subnormal (~6e-8) to 0, so such values count as "equal to 0" below.
        values = np.fromfile(path.join(calculations_dir, metric_file)).astype(np.float16)

        # Same outer, index-based alignment as before; rows without a metric
        # value get NaN, which never compares equal to 0.
        joined = pd.concat([df, pd.DataFrame({"diff": values})], axis=1)

        # Mean of a boolean mask == (number of zeros) / (group size).
        # Vectorised replacement for groupby(...).apply(<python lambda>).
        is_zero = joined["diff"].eq(0)
        diff_probs[metric_label] = is_zero.groupby(joined[ratio_type]).mean()

    sonya = pd.DataFrame(diff_probs).reset_index()
    sonya.to_csv(path.join(calculations_dir, f"ppf_{ratio_type}.csv"), index=False)

In [5]:
# Build the per-ratio summary once for group ratios and once for imbalance ratios.
# NOTE(review): the grouping column is cast to float16; recent pandas versions
# restrict float16 indexes, so confirm the group-by still works on the pandas in use.
for ratio in ('gr', 'ir'):
    ratio_file = path.join(calculations_dir, f"{ratio}.bin")
    df = pd.DataFrame({ratio: np.fromfile(ratio_file).astype(np.float16)})
    calculate_sonya(df, ratio)