# Fairness Metrics Coding Exercise

Download the Diabetes [train](https://evijit.io/materials/diabetes_train.csv) and [test](https://evijit.io/materials/diabetes_test.csv) csvs and place it in the working folder.

In [None]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Column names of the dataset

In [5]:
columns =['race', 'sex', 'age', 'admissiontypeid', 'dischargedispositionid',
'admissionsourceid', 'timeinhospital', 'numlabprocedures',
'numprocedures', 'nummedications', 'numberoutpatient',
'numberemergency', 'numberinpatient', 'diag1', 'diag2', 'diag3',
'numberdiagnoses', 'maxgluserum', 'A1Cresult', 'metformin',
'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted']

In [6]:
train_df = pd.read_csv('diabetes_train.csv',header=None)
train_df.columns = columns
# train_df.index = train_df['sex']
train_df = train_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

test_df = pd.read_csv('diabetes_test.csv',header=None)
test_df.columns = columns
# test_df.index = test_df['sex']
test_df = test_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [7]:
train_df.head()

Unnamed: 0,race,sex,age,admissiontypeid,dischargedispositionid,admissionsourceid,timeinhospital,numlabprocedures,numprocedures,nummedications,...,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Caucasian,Female,75,2,1,2,2,4,0,9,...,0,0,2,0,0,0,0,0,1,0
1,Caucasian,Male,85,1,3,7,4,43,2,12,...,0,0,0,0,0,0,0,0,0,0
2,Caucasian,Female,65,1,18,7,8,63,5,17,...,0,0,2,0,0,0,0,0,1,1
3,AfricanAmerican,Female,55,6,1,7,4,38,0,8,...,0,0,0,2,0,0,1,1,1,0
4,Caucasian,Male,45,1,1,7,1,30,3,7,...,0,0,0,0,0,0,0,0,0,1


### Converting dataset to One-Hot vectors so that we can train a model

In [8]:
X_train = train_df.drop(['readmitted'],axis=1)
X_test = test_df.drop(['readmitted'],axis=1)

y_train = pd.Series(train_df['readmitted'])
y_test = pd.Series(test_df['readmitted'])

y_train = pd.Series(y_train.factorize(sort=True)[0])
y_test = pd.Series(y_test.factorize(sort=True)[0])

X_merged = pd.concat([X_train,X_test])

ohe = make_column_transformer(
    (OneHotEncoder(sparse=False), X_merged.dtypes == 'object'),
    remainder='passthrough', verbose_feature_names_out=False)

X_merged_temp  = pd.DataFrame(ohe.fit_transform(X_merged), columns=ohe.get_feature_names_out(), index=X_merged.index)

X_train  = pd.DataFrame(ohe.transform(X_train), columns=ohe.get_feature_names_out(), index=X_train.index)
X_test = pd.DataFrame(ohe.transform(X_test), columns=ohe.get_feature_names_out(), index=X_test.index)

In [9]:
X_train.head()

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,sex_Female,sex_Male,age,admissiontypeid,dischargedispositionid,...,A1Cresult,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,75.0,2.0,1.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,85.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,65.0,1.0,18.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,55.0,6.0,1.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,45.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
y_train.head()

0    0
1    0
2    1
3    0
4    1
dtype: int64

### Train a model

In [11]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

## Accuracy

### Measure Overall Accuracy

In [12]:
accuracy_score(y_test, y_pred)

0.6273070469798657

### Per gender accuracy

In [13]:
X_test_male = X_test[X_test['sex_Male']==1]
y_test_male = y_test[X_test['sex_Male']==1]

y_pred_male = rf.predict(X_test_male)

X_test_female = X_test[X_test['sex_Female']==1]
y_test_female = y_test[X_test['sex_Female']==1]

y_pred_female = rf.predict(X_test_female)

print(accuracy_score(y_test_male, y_pred_male), accuracy_score(y_test_female, y_pred_female))

0.6382850241545893 0.6178385416666666


### Per race accuracy

In [14]:
race_cols = ['race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other']

for race in race_cols:
    X_test_race = X_test[X_test[race]==1]
    y_test_race = y_test[X_test[race]==1]
    
    y_pred_race = rf.predict(X_test_race)
    
    print(race, accuracy_score(y_test_race, y_pred_race))

race_AfricanAmerican 0.6073198627525734
race_Asian 0.7010309278350515
race_Caucasian 0.625925578779642
race_Hispanic 0.6655629139072847
race_Other 0.7063621533442088


### Let's calculate Disparate Impact 
### selection rates: P(readmitted)/P(total)

In [15]:
total_male = len(y_pred_male)
total_female = len(y_pred_female)
passrate_male = sum(y_pred_male)/total_male
passrate_female = sum(y_pred_female)/total_female

print(min(passrate_male,passrate_female)/max(passrate_male,passrate_female))

0.9174209140598115


#### This value is greater than 80%, so that is good! (According to EEOC's 4/5th rule)

### What about race?

In [54]:
# Do in class

#### Let's look at intersectionality

In [55]:
# Do in class

#### As soon as we introduced intersectionality, we noticed that DI got worse! This is often an issue. Fairness becomes harder with smaller groups

## Repeat the above exercise but for false positive rate difference.

#### For Gender

In [18]:
gender_cols = ['sex_Male', 'sex_Female']
fprates = []
for gender in gender_cols:
    X_test_gender = X_test[X_test[gender]==1]
    y_test_gender = y_test[X_test[gender]==1]
    
    y_pred_gender = rf.predict(X_test_gender)

    total_gender = len(y_pred_gender)
    
    fpcount = 0
    for y,yhat in zip(y_test_gender,y_pred_gender):
        if y==0 and yhat ==1:
            fpcount +=1
    fprate = fpcount/len(y_pred_gender)
    
    print('False positive rate for', gender, fprate)
    fprates.append(fprate)
        
print('Difference:', fprates[0]-fprates[1])

False positive rate for sex_Male 0.09510869565217392
False positive rate for sex_Female 0.10260416666666666
Difference: -0.007495471014492744


#### Homework: FPR For Race

#### Homework: FPR For Intersectional (Race x Gender)

## Let's look at Aequitas

In [43]:
! pip install aequitas



In [32]:
from aequitas.preprocessing import preprocess_input_df
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
import aequitas.plot as ap

In [33]:
aqdf = test_df.copy()
aqdf['score'] = y_pred
aqdf['label_value'] = aqdf['readmitted']

In [34]:
aqdf

Unnamed: 0,race,sex,age,admissiontypeid,dischargedispositionid,admissionsourceid,timeinhospital,numlabprocedures,numprocedures,nummedications,...,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted,score,label_value
0,Caucasian,Male,45,1,1,7,6,41,0,27,...,2,0,0,0,1,1,1,1,0,1
1,AfricanAmerican,Female,65,3,3,1,4,62,2,16,...,0,0,0,0,2,0,1,1,0,1
2,Caucasian,Male,55,1,1,7,3,39,0,18,...,0,0,0,0,1,1,1,1,0,1
3,Other,Female,85,2,3,7,10,70,1,26,...,0,2,0,0,2,1,1,1,1,1
4,AfricanAmerican,Male,65,1,1,7,6,43,3,25,...,2,0,0,0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14299,Caucasian,Female,25,3,1,1,2,39,2,23,...,0,0,0,0,2,0,1,0,0,0
14300,AfricanAmerican,Female,75,3,1,1,3,43,0,5,...,0,0,0,0,2,1,1,0,1,0
14301,Caucasian,Female,55,1,5,6,3,44,1,23,...,2,0,0,0,0,0,1,1,1,1
14302,AfricanAmerican,Male,75,1,1,7,3,35,0,11,...,0,0,2,0,0,0,1,0,1,0


In [35]:
attributes_and_reference_groups={'race':'Caucasian', 'sex':'Male'}
attributes_to_audit = list(attributes_and_reference_groups.keys())

In [49]:
metrics = ['precision','fpr']
disparity_tolerance = 1.25

In [50]:
# Initialize Aequitas
g = Group()
b = Bias()

# get_crosstabs returns a dataframe of the group counts and group value bias metrics.
xtab, _ = g.get_crosstabs(aqdf, attr_cols=attributes_to_audit)
bdf = b.get_disparity_predefined_groups(xtab, original_df=aqdf, ref_groups_dict=attributes_and_reference_groups)

get_disparity_predefined_group()


In [53]:
ap.summary(bdf, metrics, fairness_threshold = disparity_tolerance)

In [51]:
ap.disparity(bdf, metrics, 'race', fairness_threshold = disparity_tolerance)

In [52]:
ap.absolute(bdf, metrics, 'race', fairness_threshold = disparity_tolerance)

### Disparities for all metrics

#### Disparities Calcuated:

| Metric | Column Name |
| --- | --- |
| True Positive Rate Disparity | 'tpr_disprity' |
| True Negative Rate | 'tnr_disparity' |
| False Omission Rate | 'for_disparity' |
| False Discovery Rate | 'fdr_disparity' |
| False Positive Rate | 'fpr_disparity' |
| False NegativeRate | 'fnr_disparity' |
| Negative Predictive Value | 'npv_disparity' |
| Precision Disparity | 'precision_disparity' |
| Predicted Positive Ratio$_k$ Disparity | 'ppr_disparity' |
| Predicted Positive Ratio$_g$ Disparity | 'pprev_disparity' |


Columns for each disparity are appended to the crosstab dataframe, along with a column indicating the reference group for each calculated metric (denoted by `[METRIC NAME]_ref_group_value`). We see a slice of the dataframe with calculated metrics in the next section.

In [46]:
bdf[['attribute_name', 'attribute_value'] + b.list_disparities(bdf)]

Unnamed: 0,attribute_name,attribute_value,ppr_disparity,pprev_disparity,precision_disparity,fdr_disparity,for_disparity,fpr_disparity,fnr_disparity,tpr_disparity,tnr_disparity,npv_disparity
0,race,AfricanAmerican,0.177535,0.72212,0.866285,1.181615,1.031953,0.825819,1.183012,0.656734,1.038049,0.982218
1,race,Asian,0.002639,0.29021,0.992147,1.010667,0.808002,0.249751,1.328728,0.383421,1.163889,1.106848
2,race,Caucasian,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,race,Hispanic,0.016585,0.585912,0.907589,1.125515,0.867265,0.58861,1.190794,0.642136,1.089867,1.073868
4,race,Other,0.026008,0.452663,0.905873,1.127845,0.755788,0.428062,1.231545,0.565703,1.124938,1.135905
5,sex,Female,1.263782,1.090012,1.008063,0.989722,1.075297,1.124605,0.985249,1.03244,0.97708,0.961176
6,sex,Male,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Actual Values for all metrics

In [47]:
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[['attribute_name', 'attribute_value'] + absolute_metrics]

Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
0,race,AfricanAmerican,0.228377,0.851945,0.368959,0.501062,0.148055,0.771623,0.631041,0.498938,0.145191,0.179565,0.392299
1,race,Asian,0.133333,0.955224,0.288889,0.428571,0.044776,0.866667,0.711111,0.571429,0.002158,0.072165,0.309278
2,race,Caucasian,0.347747,0.820717,0.357535,0.424048,0.179283,0.652253,0.642465,0.575952,0.817818,0.248664,0.411847
3,race,Hispanic,0.223301,0.894472,0.310078,0.477273,0.105528,0.776699,0.689922,0.522727,0.013564,0.145695,0.34106
4,race,Other,0.196721,0.923256,0.270221,0.478261,0.076744,0.803279,0.729779,0.521739,0.02127,0.112561,0.298532
5,sex,Female,0.322713,0.825277,0.36582,0.435119,0.174723,0.677287,0.63418,0.564881,0.558261,0.235807,0.41276
6,sex,Male,0.312573,0.844636,0.340204,0.439637,0.155364,0.687427,0.659796,0.560363,0.441739,0.216335,0.387832


### Homework: Repeat the above exercise (using Aequitas) to plot False Omission Rates for gender and race. If you find noticeable disparities, discuss what false omission rate disparity means in a medical context.

### Bonus homework question: How would you modify the above code to show plots for intersectional (race x gender) groups? Pick any one metric of your choice for the answer