# Pre-processing techniques

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing, DisparateImpactRemover
from aif360.metrics import BinaryLabelDatasetMetric

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
df_0 = df.fillna(0)

In [4]:
privileged_groups = [{'Gender_M': 1}]
unprivileged_groups = [{'Gender_M': 0}]

In [5]:
df_0['Rating_bin'].value_counts()

0    31279
1     4841
Name: Rating_bin, dtype: int64

In [6]:
df_0.columns

Index(['Unnamed: 0', 'Age', 'Income', 'faves_pca0', 'faves_pca1',
       'unfaves_pca0', 'unfaves_pca1', 'accessories', 'alcohol', 'animamted',
       ...
       'Drama.2', 'Entertainment (Variety Shows)', 'Factual', 'Learning',
       'Music', 'News', 'Religion &amp; Ethics', 'Sport.1', 'Weather',
       'Rating_bin'],
      dtype='object', length=515)

In [7]:
aif360_df = StandardDataset(
    df = df_0.drop(['Gender_F', 'Unnamed: 0'], axis = 1),
    label_name = 'Rating_bin',
    protected_attribute_names = ['Gender_M'],
    favorable_classes = [0],
    privileged_classes = [df_0['Gender_M'], lambda x: x == 1]
)

In [8]:
df_train, df_test = aif360_df.split([0.85], shuffle=True, seed = 42)

In [9]:
X_train = df_train.features
y_train = df_train.labels

X_test = df_test.features
y_test = df_test.labels

In [10]:
metric_orig = BinaryLabelDatasetMetric(aif360_df, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups,
                                       )

print("Statistical Parity Difference between unprivileged and privileged groups in original dataset = %f" % metric_orig.statistical_parity_difference())

Statistical Parity Difference between unprivileged and privileged groups in original dataset = 0.018700


In [11]:
df_conv = aif360_df.convert_to_dataframe()[0]

In [12]:
df_train.convert_to_dataframe()[0].shape

(30702, 513)

In [13]:
X_train.shape

(30702, 512)

In [14]:
from aif360.sklearn.metrics import statistical_parity_difference
statistical_parity_difference(y_test, y_test, prot_attr= df_test.convert_to_dataframe()[0]['Gender_M'] == 1)

-0.0154855124915005

In [15]:
def statistical_parity(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([np.mean([y_i for y_i, zi in zip(y_, Z) if zi == unp]) - np.mean([y_i for y_i, zi in zip(y_, Z) if zi == priv])
                   for unp in unpriv])


In [16]:
statistical_parity(y_test, y_test, df_test.convert_to_dataframe()[0]['Gender_M'])

Automatic priviledged value is 1.0


array([-0.01548551])

It's interesting to note the differences between the ways to calculate Statistical Parity Difference. This notebook is left with all the previous cells intentionally.

## Baseline model: Decision Tree

In [17]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.88      1.00      0.93      4738
         1.0       0.58      0.04      0.08       680

    accuracy                           0.88      5418
   macro avg       0.73      0.52      0.51      5418
weighted avg       0.84      0.88      0.83      5418



array([[4717,   21],
       [ 651,   29]])

In [18]:
f1_score(y_test, y_pred)

0.07945205479452055

## Reweight

In [19]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)

In [20]:
RW.fit(aif360_df)
train_transf = RW.transform(df_train)
test_transf = RW.transform(df_test)

In [21]:
X_train = train_transf.features
y_train = train_transf.labels

X_test = test_transf.features
y_test = test_transf.labels

In [22]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.88      1.00      0.93      4738
         1.0       0.58      0.04      0.08       680

    accuracy                           0.88      5418
   macro avg       0.73      0.52      0.51      5418
weighted avg       0.84      0.88      0.83      5418



array([[4717,   21],
       [ 651,   29]])

In [23]:
f1_score(y_test, y_pred)

0.07945205479452055

In [24]:
statistical_parity(y_test, y_pred, test_transf.convert_to_dataframe()[0]['Gender_M'])

Automatic priviledged value is 1.0


array([0.0142572])

## Disparate Impact Remover

In [25]:
di = DisparateImpactRemover(repair_level = 1.0)
# dataset_transf_train = di.fit(aif360_df)

train_transf = di.fit_transform(df_train)
test_transf = di.fit_transform(df_test)

In [26]:
X_train = train_transf.features
y_train = train_transf.labels

X_test = test_transf.features
y_test = test_transf.labels

In [27]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

         0.0       0.88      1.00      0.93      4738
         1.0       0.58      0.04      0.08       680

    accuracy                           0.88      5418
   macro avg       0.73      0.52      0.51      5418
weighted avg       0.84      0.88      0.83      5418



array([[4717,   21],
       [ 651,   29]])

In [28]:
f1_score(y_test, y_pred)

0.07945205479452055

In [29]:
statistical_parity(y_test, y_pred, df_test.convert_to_dataframe()[0]['Gender_M'])

Automatic priviledged value is 1.0


array([0.0142572])