# In-processing

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
df_0 = df.fillna(0)

In [4]:
X = df_0.drop(columns=['Rating_bin', 'Gender_F', 'Unnamed: 0'])
y = df_0['Rating_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)

## Baseline model: Logistic Regression

In [5]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.94      0.64      0.76      4635
           1       0.26      0.76      0.39       783

    accuracy                           0.66      5418
   macro avg       0.60      0.70      0.58      5418
weighted avg       0.84      0.66      0.71      5418



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[2976, 1659],
       [ 187,  596]])

In [6]:
f1_score(y_test, y_pred)

0.39236339697169187

## Metrics

In [7]:
def statistical_parity(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([np.mean([y_i for y_i, zi in zip(y_, Z) if zi == unp]) - np.mean([y_i for y_i, zi in zip(y_, Z) if zi == priv])
                   for unp in unpriv])


In [8]:
y_test_ = clf.predict(X_test)
Z_test = X_test['Gender_M'] == 1

In [9]:
print('Statistical parity', statistical_parity(y_test, y_test_, Z_test))

Automatic priviledged value is True
Statistical parity [-0.01180838]


## Unawareness

In [10]:
X = df_0.drop(columns=['Rating_bin', 'Gender_M', 'Gender_F', 'Unnamed: 0'])
y = df_0['Rating_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = 0.85)

In [11]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.94      0.63      0.76      4635
           1       0.26      0.77      0.39       783

    accuracy                           0.65      5418
   macro avg       0.60      0.70      0.57      5418
weighted avg       0.84      0.65      0.70      5418



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[2927, 1708],
       [ 182,  601]])

In [12]:
print('Statistical parity', statistical_parity(y_test, y_pred, Z_test))

Automatic priviledged value is True
Statistical parity [0.00350642]


In [13]:
f1_score(y_test, y_pred)

0.38874514877102195

## Reweight

In [14]:
X = df_0.drop(columns=['Rating_bin', 'Gender_F', 'Unnamed: 0'])
y = df_0['Rating_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = 0.85)

In [15]:
p_male = np.mean(X_train['Gender_M']==1)
p_female = np.mean(X_train['Gender_M']==0)
p_1 = np.mean(y_train==1)
p_0 = np.mean(y_train==0)
p_male_1 = np.mean((X_train['Gender_M']==1)&(y_train==1))
p_male_0 = np.mean((X_train['Gender_M']==1)&(y_train==0))
p_female_1 = np.mean((X_train['Gender_M']==0)&(y_train==1))
p_female_0 = np.mean((X_train['Gender_M']==0)&(y_train==0))

In [16]:
w = np.ones(X_train.shape[0])

In [17]:
w[(X_train['Gender_M']==1)&(y_train==1)]=p_male*p_1/p_male_1
w[(X_train['Gender_M']==1)&(y_train==0)]=p_male*p_0/p_male_0
w[(X_train['Gender_M']==0)&(y_train==1)]=p_female*p_1/p_female_1
w[(X_train['Gender_M']==0)&(y_train==0)]=p_female*p_0/p_female_0

In [18]:
aux = pd.DataFrame(columns = ['Gender_M', 'y', 'w'])
aux['Gender_M'] = X_train['Gender_M']
aux['y'] = y_train
aux['w'] = w
aux

Unnamed: 0,Gender_M,y,w
6067,0,0,0.992184
19006,0,0,0.992184
22061,0,0,0.992184
4201,1,1,0.915770
22429,0,0,0.992184
...,...,...,...
16850,1,0,1.014208
6265,0,1,1.054546
11284,1,0,1.014208
860,1,0,1.014208


In [19]:
clf = LogisticRegression(solver = 'liblinear', penalty = 'l1', class_weight='balanced')
clf.fit(X_train, y_train, sample_weight = w)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.94      0.62      0.74      4635
           1       0.25      0.78      0.38       783

    accuracy                           0.64      5418
   macro avg       0.60      0.70      0.56      5418
weighted avg       0.84      0.64      0.69      5418



array([[2854, 1781],
       [ 174,  609]])

In [20]:
print('Statistical parity', statistical_parity(y_test, y_pred, Z_test))

Automatic priviledged value is True
Statistical parity [0.02647863]


In [21]:
f1_score(y_test, y_pred)

0.3838638512448786