# Pre-processing techniques

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing, DisparateImpactRemover
from aif360.metrics import BinaryLabelDatasetMetric

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
df_0 = df.fillna(0)

In [4]:
privileged_groups = [{'Gender_M': 1}]
unprivileged_groups = [{'Gender_M': 0}]

In [5]:
df_0['Rating_bin'].value_counts()

0    31279
1     4841
Name: Rating_bin, dtype: int64

In [7]:
df_0.columns

Index(['Unnamed: 0', 'Age', 'Income', 'faves_pca0', 'faves_pca1',
       'unfaves_pca0', 'unfaves_pca1', 'accessories', 'alcohol', 'animamted',
       ...
       'Drama.2', 'Entertainment (Variety Shows)', 'Factual', 'Learning',
       'Music', 'News', 'Religion &amp; Ethics', 'Sport.1', 'Weather',
       'Rating_bin'],
      dtype='object', length=515)

In [33]:
aif360_df = StandardDataset(
    df = df_0.drop(['Gender_F', 'Unnamed: 0'], axis = 1),
    label_name = 'Rating_bin',
    protected_attribute_names = ['Gender_M'],
    favorable_classes = [0],
    privileged_classes = [df_0['Gender_M'], lambda x: x == 1]
)

In [34]:
df_train, df_test = aif360_df.split([0.7], shuffle=True, seed = 42)

In [35]:
X = aif360_df.features
y = aif360_df.labels
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [36]:
metric_orig = BinaryLabelDatasetMetric(aif360_df, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups,
                                       )

print("Statistical Parity Difference between unprivileged and privileged groups in original dataset = %f" % metric_orig.statistical_parity_difference())

Statistical Parity Difference between unprivileged and privileged groups in original dataset = 0.018700


In [37]:
df_conv = aif360_df.convert_to_dataframe()[0]

In [43]:
df_train.convert_to_dataframe()[0].shape

(25284, 513)

In [45]:
X_train.shape

(27090, 512)

In [41]:
from aif360.sklearn.metrics import statistical_parity_difference
statistical_parity_difference(y_train, y_train, prot_attr= df_train.convert_to_dataframe()[0]['Gender_M'] == 1)

ValueError: Length mismatch: Expected 27090 rows, received array of length 25284

In [46]:
def statistical_parity(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([np.mean([y_i for y_i, zi in zip(y_, Z) if zi == unp]) - np.mean([y_i for y_i, zi in zip(y_, Z) if zi == priv])
                   for unp in unpriv])


In [48]:
statistical_parity(y_train, y_train, df_train.convert_to_dataframe()[0]['Gender_M'])

IndexError: boolean index did not match indexed array along dimension 0; dimension is 27090 but corresponding boolean dimension is 25284

In [None]:
aif360_df.convert_to_dataframe().Gender_M

## Baseline model: Decision Tree

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

In [None]:
f1_score(y_val, y_pred)

In [None]:
from aif360.sklearn.metrics import statistical_parity_difference
statistical_parity_difference(y_train, y_train, priv_group = 0, prot_attr = X_train['Gender_M'] == 1)

## Reweight

In [None]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)

In [None]:
RW.fit(df_train)
dataset_transf = RW.transform(df_train)

In [None]:
X = dataset_transf.features
y = dataset_transf.labels
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

In [None]:
f1_score(y_val, y_pred)

In [None]:
metric_orig = BinaryLabelDatasetMetric(dataset_transf, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups)

print("Statistical Parity Difference between unprivileged and privileged groups in transformed dataset = %f" % metric_orig.statistical_parity_difference())

## Disparate Impact Remover

In [None]:
di = DisparateImpactRemover(repair_level = 1.0)
dataset_transf_train = di.fit_transform(df_train)

In [None]:
X = dataset_transf_train.features
y = dataset_transf_train.labels
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [None]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

In [None]:
f1_score(y_val, y_pred)

In [None]:
metric_orig = BinaryLabelDatasetMetric(dataset_transf_train, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups)

print("Statistical Parity Difference between unprivileged and privileged groups in transformed dataset = %f" % metric_orig.statistical_parity_difference())