## Import libraries

In [7]:
import numpy as np
import pandas as pd
import pickle
import gc

## Define functions

In [2]:
#each group has the same probability of being classified with positive outcome
def get_statistical_parity(df):
    """Return the statistical parity difference between groups j and i.

    For each group the positive-prediction rate is
    ``(tp + fp) / (tp + fp + tn + fn)``; the result is group j's rate minus
    group i's rate.

    ``df`` must expose the confusion-matrix counts as attributes
    ``i_tp, i_fp, i_tn, i_fn, j_tp, j_fp, j_tn, j_fn`` (e.g. a DataFrame with
    those columns). Sums are taken in the operands' own dtype, so with narrow
    integer columns the caller must ensure each group total fits. Empty groups
    are not guarded against.
    """
    def positive_rate(tp, fp, tn, fn):
        predicted_positive = tp + fp
        return predicted_positive / (predicted_positive + tn + fn)

    rate_j = positive_rate(df.j_tp, df.j_fp, df.j_tn, df.j_fn)
    rate_i = positive_rate(df.i_tp, df.i_fp, df.i_tn, df.i_fn)
    return rate_j - rate_i


#similar to statistical parity, but using a ratio instead of a difference
def get_disparate_impact(df):
    """Return the disparate impact ratio of group j relative to group i.

    For each group the positive-prediction rate is
    ``(tp + fp) / (tp + fp + tn + fn)``; the result is group j's rate divided
    by group i's rate.

    ``df`` must expose the confusion-matrix counts as attributes
    ``i_tp, i_fp, i_tn, i_fn, j_tp, j_fp, j_tn, j_fn``. Sums are taken in the
    operands' own dtype, so with narrow integer columns the caller must ensure
    each group total fits. Zero denominators are not guarded against.
    """
    def positive_rate(tp, fp, tn, fn):
        predicted_positive = tp + fp
        return predicted_positive / (predicted_positive + tn + fn)

    rate_j = positive_rate(df.j_tp, df.j_fp, df.j_tn, df.j_fn)
    rate_i = positive_rate(df.i_tp, df.i_fp, df.i_tn, df.i_fn)
    return rate_j / rate_i


#accuracy equality ratio
def get_acc_equality_ratio(df):
    """Return the accuracy equality ratio of group j relative to group i.

    For each group the accuracy is ``(tp + tn) / (tp + fp + tn + fn)``; the
    result is group j's accuracy divided by group i's accuracy.

    ``df`` must expose the confusion-matrix counts as attributes
    ``i_tp, i_fp, i_tn, i_fn, j_tp, j_fp, j_tn, j_fn``. Sums are taken in the
    operands' own dtype, so with narrow integer columns the caller must ensure
    each group total fits. Zero denominators are not guarded against.
    """
    def accuracy(tp, fp, tn, fn):
        correct = tp + tn
        total = tp + fp + tn + fn
        return correct / total

    accuracy_j = accuracy(df.j_tp, df.j_fp, df.j_tn, df.j_fn)
    accuracy_i = accuracy(df.i_tp, df.i_fp, df.i_tn, df.i_fn)
    return accuracy_j / accuracy_i


#accuracy equality difference
def get_acc_equality_diff(df):
    """Return the accuracy equality difference between groups j and i.

    For each group the accuracy is ``(tp + tn) / (tp + fp + tn + fn)``; the
    result is group j's accuracy minus group i's accuracy.

    ``df`` must expose the confusion-matrix counts as attributes
    ``i_tp, i_fp, i_tn, i_fn, j_tp, j_fp, j_tn, j_fn``. Sums are taken in the
    operands' own dtype, so with narrow integer columns the caller must ensure
    each group total fits. Empty groups are not guarded against.
    """
    def accuracy(tp, fp, tn, fn):
        correct = tp + tn
        total = tp + fp + tn + fn
        return correct / total

    accuracy_j = accuracy(df.j_tp, df.j_fp, df.j_tn, df.j_fn)
    accuracy_i = accuracy(df.i_tp, df.i_fp, df.i_tn, df.i_fn)
    return accuracy_j - accuracy_i


#equal opportunity
#ratio of true positive rates
def get_equal_opp_ratio(j_tpr, i_tpr):
    """Return the equal-opportunity ratio: ``j_tpr`` divided by ``i_tpr``.

    Both arguments are true positive rates (scalars or element-wise
    divisible containers such as pandas Series). Zero values in ``i_tpr``
    are not guarded against.
    """
    tpr_ratio = j_tpr / i_tpr
    return tpr_ratio


#equal opportunity
#difference of true positive rates
def get_equal_opp_diff(j_tpr, i_tpr):
    """Return the equal-opportunity difference: ``j_tpr`` minus ``i_tpr``.

    Both arguments are true positive rates (scalars or containers that
    support element-wise subtraction, such as pandas Series).
    """
    tpr_gap = j_tpr - i_tpr
    return tpr_gap


#predictive equality ratio
def get_pred_equality_ratio(j_fpr, i_fpr):
    """Return the predictive equality ratio: ``j_fpr`` divided by ``i_fpr``.

    Both arguments are false positive rates (scalars or element-wise
    divisible containers such as pandas Series). Zero values in ``i_fpr``
    are not guarded against.
    """
    fpr_ratio = j_fpr / i_fpr
    return fpr_ratio


#predictive equality difference - false positive rate
def get_pred_equality_diff(j_fpr, i_fpr):
    """Return the predictive equality difference: ``j_fpr`` minus ``i_fpr``.

    Both arguments are false positive rates (scalars or containers that
    support element-wise subtraction, such as pandas Series).
    """
    fpr_gap = j_fpr - i_fpr
    return fpr_gap


#positive predictive parity ratio
def get_pred_parity_ratio(j_ppv, i_ppv):
    """Return the positive predictive parity ratio: ``j_ppv / i_ppv``.

    Both arguments are positive predictive values (scalars or element-wise
    divisible containers such as pandas Series). Zero values in ``i_ppv``
    are not guarded against.
    """
    ppv_ratio = j_ppv / i_ppv
    return ppv_ratio


#positive predictive parity difference
def get_pos_pred_parity_diff(j_ppv, i_ppv):
    """Return the positive predictive parity difference: ``j_ppv - i_ppv``.

    Both arguments are positive predictive values (scalars or containers
    that support element-wise subtraction, such as pandas Series).
    """
    ppv_gap = j_ppv - i_ppv
    return ppv_gap


#negative predictive parity ratio
def get_neg_pred_parity_ratio(j_npv, i_npv):
    """Return the negative predictive parity ratio: ``j_npv / i_npv``.

    Both arguments are negative predictive values (scalars or element-wise
    divisible containers such as pandas Series). Zero values in ``i_npv``
    are not guarded against.
    """
    npv_ratio = j_npv / i_npv
    return npv_ratio


#negative predictive parity difference
def get_neg_pred_parity_diff(j_npv, i_npv):
    """Return the negative predictive parity difference: ``j_npv - i_npv``.

    Both arguments are negative predictive values (scalars or containers
    that support element-wise subtraction, such as pandas Series).
    """
    npv_gap = j_npv - i_npv
    return npv_gap

## 1st half of the dataset

##### (assumes that the dataset and the results of the previous calculations are already in the same directory as this file)

In [3]:
# Column names for the loaded table: the four confusion-matrix counts
# (tp, fp, tn, fn) of group i, followed by the same four counts of group j.
dataCols = [
    'i_tp', 'i_fp', 'i_tn', 'i_fn',
    'j_tp', 'j_fp', 'j_tn', 'j_fn',
]

In [4]:
# Read the whole pickled table, then keep only the first half of its rows.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('Set(08,56).bin', "rb") as pickled:
    df = pd.DataFrame(pickle.load(pickled), columns=dataCols)

# Floor of half the row count: with an odd total the first half is the
# shorter one.
halfIdx = df.shape[0] // 2
df = df.iloc[:halfIdx]
df.head()

Unnamed: 0,i_tp,i_fp,i_tn,i_fn,j_tp,j_fp,j_tn,j_fn
0,56,0,0,0,0,0,0,0
1,55,1,0,0,0,0,0,0
2,55,0,1,0,0,0,0,0
3,55,0,0,1,0,0,0,0
4,55,0,0,0,1,0,0,0


In [6]:
# Show row count, column dtypes and the deep memory footprint of the first half.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276635335 entries, 0 to 276635334
Data columns (total 8 columns):
 #   Column  Dtype
---  ------  -----
 0   i_tp    int8 
 1   i_fp    int8 
 2   i_tn    int8 
 3   i_fn    int8 
 4   j_tp    int8 
 5   j_fp    int8 
 6   j_tn    int8 
 7   j_fn    int8 
dtypes: int8(8)
memory usage: 2.1 GB


In [7]:
# Compute each frame-level fairness metric on the current (first) half and
# dump the values in raw binary form. Mode "wb+" creates or truncates the
# file, so these writes start every output file from scratch.
for out_name, metric in (
    ("stat_parity.bin", get_statistical_parity),
    ("disp_impact.bin", get_disparate_impact),
    ("acc_equality_ratio.bin", get_acc_equality_ratio),
    ("acc_equality_diff.bin", get_acc_equality_diff),
):
    with open(out_name, "wb+") as sink:
        metric(df).to_numpy().tofile(sink)

In [None]:
# Free the memory: unbind the `df` name, force a garbage-collection pass, and
# show the collector's per-generation statistics as the cell output.
del df
gc.collect()
gc.get_stats()

## 2nd half of the dataset

In [9]:
# Read the whole pickled table again, this time keeping only the second half
# of its rows (everything from the midpoint onwards).
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("Set(08,56).bin", "rb") as pickled:
    df = pd.DataFrame(pickle.load(pickled), columns=dataCols)

# Same midpoint as used for the first half, so the two halves do not overlap.
halfIdx = df.shape[0] // 2
df = df.iloc[halfIdx:]

In [11]:
# Show row count, column dtypes and the deep memory footprint of the second half.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276635336 entries, 276635335 to 553270670
Data columns (total 8 columns):
 #   Column  Dtype
---  ------  -----
 0   i_tp    int8 
 1   i_fp    int8 
 2   i_tn    int8 
 3   i_fn    int8 
 4   j_tp    int8 
 5   j_fp    int8 
 6   j_tn    int8 
 7   j_fn    int8 
dtypes: int8(8)
memory usage: 2.1 GB


In [12]:
# Compute the same frame-level metrics on the second half and append them
# ("ab+") to the files written for the first half. Re-running this cell
# appends the values again, so run it exactly once after the first-half cell.
for out_name, metric in (
    ("stat_parity.bin", get_statistical_parity),
    ("disp_impact.bin", get_disparate_impact),
    ("acc_equality_ratio.bin", get_acc_equality_ratio),
    ("acc_equality_diff.bin", get_acc_equality_diff),
):
    with open(out_name, "ab+") as sink:
        metric(df).to_numpy().tofile(sink)

In [None]:
# Free the memory: unbind the `df` name, force a garbage-collection pass, and
# show the collector's per-generation statistics as the cell output.
del df
gc.collect()
gc.get_stats()

## Additional calculations based on previous ones

##### (the files are read as float64 rather than float16, because float16 causes errors when the calculations are written; the values could be cast to float64 before writing, but it is better to read them as float64 from the start)

In [4]:
# Load group j's true positive rates from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("j_tpr.bin", "rb") as raw:
    j_tpr = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["j_tpr"])

In [5]:
# Load group i's true positive rates from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("i_tpr.bin", "rb") as raw:
    i_tpr = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["i_tpr"])

In [8]:
# Derive both equal-opportunity metrics from the loaded true positive rates
# and dump each one as raw binary values ("wb+" creates or truncates the file).
for out_name, metric in (
    ("equal_opp_ratio.bin", get_equal_opp_ratio),
    ("equal_opp_diff.bin", get_equal_opp_diff),
):
    with open(out_name, "wb+") as sink:
        metric(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(sink)

In [9]:
# Unbind both true-positive-rate frames, force a collection pass, and show the
# collector's per-generation statistics as the cell output.
del j_tpr, i_tpr
gc.collect()
gc.get_stats()

[{'collections': 311, 'collected': 2771, 'uncollectable': 0},
 {'collections': 28, 'collected': 537, 'uncollectable': 0},
 {'collections': 3, 'collected': 54, 'uncollectable': 0}]

In [None]:
# Load group j's false positive rates from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("j_fpr.bin", "rb") as raw:
    j_fpr = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["j_fpr"])

In [11]:
# Load group i's false positive rates from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("i_fpr.bin", "rb") as raw:
    i_fpr = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["i_fpr"])

In [14]:
# Derive both predictive-equality metrics from the loaded false positive rates
# and dump each one as raw binary values ("wb+" creates or truncates the file).
for out_name, metric in (
    ("pred_equality_ratio.bin", get_pred_equality_ratio),
    ("pred_equality_diff.bin", get_pred_equality_diff),
):
    with open(out_name, "wb+") as sink:
        metric(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(sink)

In [15]:
# Unbind both false-positive-rate frames, force a collection pass, and show
# the collector's per-generation statistics as the cell output.
del j_fpr, i_fpr
gc.collect()
gc.get_stats()

[{'collections': 311, 'collected': 2771, 'uncollectable': 0},
 {'collections': 28, 'collected': 537, 'uncollectable': 0},
 {'collections': 4, 'collected': 54, 'uncollectable': 0}]

In [16]:
# Load group j's positive predictive values from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("j_ppv.bin", "rb") as raw:
    j_ppv = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["j_ppv"])

In [17]:
# Load group i's positive predictive values from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("i_ppv.bin", "rb") as raw:
    i_ppv = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["i_ppv"])

In [20]:
# Derive both positive-predictive-parity metrics from the loaded PPV columns
# and dump each one as raw binary values ("wb+" creates or truncates the file).
for out_name, metric in (
    ("pred_parity_ratio.bin", get_pred_parity_ratio),
    ("pos_pred_parity_diff.bin", get_pos_pred_parity_diff),
):
    with open(out_name, "wb+") as sink:
        metric(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(sink)

In [21]:
# Unbind both positive-predictive-value frames, force a collection pass, and
# show the collector's per-generation statistics as the cell output.
del j_ppv, i_ppv
gc.collect()
gc.get_stats()

[{'collections': 311, 'collected': 2771, 'uncollectable': 0},
 {'collections': 28, 'collected': 537, 'uncollectable': 0},
 {'collections': 5, 'collected': 54, 'uncollectable': 0}]

In [3]:
# Load group j's negative predictive values from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("j_npv.bin", "rb") as raw:
    j_npv = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["j_npv"])

In [4]:
# Load group i's negative predictive values from the raw binary dump.
# float64 is np.fromfile's default dtype; it is spelled out here on purpose.
with open("i_npv.bin", "rb") as raw:
    i_npv = pd.DataFrame(np.fromfile(raw, dtype=np.float64), columns=["i_npv"])

In [5]:
# Derive both negative-predictive-parity metrics from the loaded NPV columns
# and dump each one as raw binary values ("wb+" creates or truncates the file).
for out_name, metric in (
    ("neg_pred_parity_ratio.bin", get_neg_pred_parity_ratio),
    ("neg_pred_parity_diff.bin", get_neg_pred_parity_diff),
):
    with open(out_name, "wb+") as sink:
        metric(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(sink)

In [6]:
# Unbind both negative-predictive-value frames, force a collection pass, and
# show the collector's per-generation statistics as the cell output.
del j_npv, i_npv
gc.collect()
gc.get_stats()

[{'collections': 310, 'collected': 2771, 'uncollectable': 0},
 {'collections': 28, 'collected': 537, 'uncollectable': 0},
 {'collections': 3, 'collected': 54, 'uncollectable': 0}]