# Pre-processing techniques

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
df_0 = df.fillna(0)

In [4]:
# df = df_0.sample(frac = 1)
# train_frac = 0.8
# test_frac = 0.1

# X_train = df[[c for c in df.columns if c != 'Rating_bin']].iloc[:int(len(df) * train_frac)].values
# y_train = df.Rating_bin.iloc[:int(len(df) * train_frac)].values

# X_test = df[[c for c in df.columns if c != 'Rating_bin']].iloc[int(len(df) * train_frac):int(len(df) * (train_frac+test_frac))].values
# y_test = df.Rating_bin.iloc[int(len(df) * train_frac):int(len(df) * (train_frac+test_frac))].values

# X_valid = df[[c for c in df.columns if c != 'Rating_bin']].iloc[int(len(df) * (train_frac+test_frac)):].values
# y_valid = df.Rating_bin.iloc[int(len(df) * (train_frac+test_frac)):].values

In [5]:
# pd.DataFrame(X_train).to_csv('../../data/X_train.csv')
# pd.DataFrame(y_train).to_csv('../../data/y_train.csv')

# pd.DataFrame(X_test).to_csv('../../data/X_test.csv')
# pd.DataFrame(y_test).to_csv('../../data/y_test.csv')

# pd.DataFrame(X_valid).to_csv('../../data/X_valid.csv')
# pd.DataFrame(y_valid).to_csv('../../data/y_valid.csv')

In [6]:
X = df_0.drop(columns='Rating_bin')
y = df_0['Rating_bin']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

## Baseline model: Decision Tree

In [7]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      7775
           1       0.40      0.39      0.39      1255

    accuracy                           0.83      9030
   macro avg       0.65      0.65      0.65      9030
weighted avg       0.83      0.83      0.83      9030



array([[7049,  726],
       [ 769,  486]])

## Reweight

In [8]:
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import Reweighing
from aif360.metrics import BinaryLabelDatasetMetric

pip install 'aif360[LawSchoolGPA]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'


In [9]:
privileged_groups = [{'Gender_M': 1}]
unprivileged_groups = [{'Gender_M': 0}]

In [10]:
df_0['Rating_bin'].value_counts()

0    31279
1     4841
Name: Rating_bin, dtype: int64

In [11]:
aif360_df = StandardDataset(
    df = df_0.drop('Gender_F', axis = 1),
    label_name = 'Rating_bin',
    protected_attribute_names = ['Gender_M'],
    favorable_classes = [0],
    privileged_classes = [df_0['Gender_M'], lambda x: x == 1]
)

In [12]:
df_train, df_test = aif360_df.split([0.7], shuffle=True)

In [13]:
RW = Reweighing(unprivileged_groups=unprivileged_groups,
               privileged_groups=privileged_groups)

In [14]:
RW.fit(df_train)
dataset_transf = RW.transform(df_train)

In [15]:
X = dataset_transf.features
y = dataset_transf.labels
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [16]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

              precision    recall  f1-score   support

         0.0       0.91      0.91      0.91      5467
         1.0       0.44      0.44      0.44       854

    accuracy                           0.85      6321
   macro avg       0.68      0.68      0.68      6321
weighted avg       0.85      0.85      0.85      6321



array([[4991,  476],
       [ 477,  377]])

In [17]:
metric_orig = BinaryLabelDatasetMetric(df_train, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups in original dataset = %f" % metric_orig.mean_difference())

Difference in mean outcomes between unprivileged and privileged groups in original dataset = 0.021122


In [20]:
metric_orig = BinaryLabelDatasetMetric(dataset_transf, 
                                       unprivileged_groups=unprivileged_groups,
                                       privileged_groups=privileged_groups)

print("Difference in mean outcomes between unprivileged and privileged groups in transformed dataset = %f" % metric_orig.mean_difference())

Difference in mean outcomes between unprivileged and privileged groups in transformed dataset = 0.000000
