In [75]:
%matplotlib inline
# Load all necessary packages
import sys
import pandas as pd
sys.path.insert(1, "../")  

from sklearn.preprocessing import StandardScaler, MaxAbsScaler

import numpy as np
np.random.seed(0)

from aif360.datasets.compas_dataset import CompasDataset
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_compas
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric


from IPython.display import Markdown, display

#### Load dataset and set options

In [33]:
# Load the COMPAS dataset through the preprocessing helper, using the helper's
# default arguments (none are passed here).
original_compas = load_preproc_data_compas()

In [28]:
# Convert the loaded COMPAS dataset into a DataFrame plus its accompanying
# attributes object. This must use `original_compas`, the only dataset the
# notebook loads; the previous name `original_dataset` is never defined, so
# the cell failed on a fresh kernel (Restart & Run All).
df, attributes = original_compas.convert_to_dataframe()

In [27]:
# Preview the first rows of the converted DataFrame.
df.head()

Unnamed: 0,sex,race,age_cat=25 to 45,age_cat=Greater than 45,age_cat=Less than 25,priors_count=0,priors_count=1 to 3,priors_count=More than 3,c_charge_degree=F,c_charge_degree=M,two_year_recid
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
10,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
14,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [36]:
# Summarise key properties of the dataset: every entry pairs a Markdown
# heading with the value(s) printed on a single line beneath it.
_summary_sections = [
    ("#### Compas Dataset shape",
     (original_compas.features.shape,)),
    ("#### Favorable and unfavorable labels",
     (original_compas.favorable_label,
      original_compas.unfavorable_label)),
    ("#### Protected attribute names",
     (original_compas.protected_attribute_names,)),
    ("#### Privileged and unprivileged protected attribute values",
     (original_compas.privileged_protected_attributes,
      original_compas.unprivileged_protected_attributes)),
    ("#### Dataset feature names",
     (original_compas.feature_names,)),
]

for _heading, _values in _summary_sections:
    display(Markdown(_heading))
    # Unpacking keeps print's default single-space separator between values.
    print(*_values)

#### Compas Dataset shape

(5278, 10)


#### Favorable and unfavorable labels

0.0 1.0


#### Protected attribute names

['sex', 'race']


#### Privileged and unprivileged protected attribute values

[array([1.]), array([1.])] [array([0.]), array([0.])]


#### Dataset feature names

['sex', 'race', 'age_cat=25 to 45', 'age_cat=Greater than 45', 'age_cat=Less than 25', 'priors_count=0', 'priors_count=1 to 3', 'priors_count=More than 3', 'c_charge_degree=F', 'c_charge_degree=M']


In [37]:
# Each group constrains BOTH protected attributes (race and sex), so the
# comparison is between two intersectional subgroups, not a race-only split.
# NOTE(review): presumably race 1 = Caucasian and sex 1 = Female in AIF360's
# preprocessed COMPAS encoding -- confirm against the dataset metadata.
privileged_groups = [{'race': 1, 'sex': 1}] #  race == 1 and sex == 1
unprivileged_groups = [{'race': 0, 'sex': 0}] #  race == 0 and sex == 0

#### Metric for the original dataset

In [73]:
# Render the definition of the metric that is reported in this cell.
display(Markdown("#### Statistical Parity Difference or Mean Difference:\
                 $$Pr(Y = 1 | D = unprivileged) - Pr(Y = 1 | D = privileged)$$"))

# Group label statistics for the full original dataset (no train/test split
# is made in the cells shown here, despite the `_train` suffix in the name).
metric_orig_train = BinaryLabelDatasetMetric(
    original_compas,
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)
orig_mean_difference = metric_orig_train.mean_difference()

display(Markdown("#### Mean Difference - Original dataset"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % orig_mean_difference)

# NOTE: a matching test-set metric would need a `dataset_orig_test` split,
# which is not created in the cells shown here.

#### Statistical Parity Difference or Mean Difference:                 $$Pr(Y = 1 | D = unprivileged) - Pr(Y = 1 | D = privileged)$$

#### Mean Difference - Original dataset

Difference in mean outcomes between unprivileged and privileged groups = -0.202520


In [78]:
# Rescale the feature matrix and re-check the group label statistics.
# NOTE(review): the variable is named `min_max_scaler` but holds a
# MaxAbsScaler; the name is kept in case later cells refer to it.
min_max_scaler = MaxAbsScaler()
scaled_features = min_max_scaler.fit_transform(original_compas.features)

# This overwrites the features stored on `original_compas` in place, so the
# unscaled feature matrix is no longer available from this object afterwards.
original_compas.features = scaled_features

metric_scaled = BinaryLabelDatasetMetric(
    original_compas,
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)
scaled_mean_difference = metric_scaled.mean_difference()

display(Markdown("#### Scaled dataset - Verify that the scaling does not affect the group label statistics"))
print("Difference in mean outcomes between unprivileged and privileged groups = %f" % scaled_mean_difference)

#### Scaled dataset - Verify that the scaling does not affect the group label statistics

Difference in mean outcomes between unprivileged and privileged groups = -0.202520
