# In-processing

In [11]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklego.metrics import equal_opportunity_score
from sklego.metrics import p_percent_score
from sklearn.metrics import accuracy_score
import dalex as dx

In [2]:
df = pd.read_csv('../../data/final_features_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Income,faves_pca0,faves_pca1,unfaves_pca0,unfaves_pca1,accessories,alcohol,animamted,...,Drama.2,Entertainment (Variety Shows),Factual,Learning,Music,News,Religion &amp; Ethics,Sport.1,Weather,Rating_bin
0,0,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,1,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,2,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,3,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,4,62,1,-0.321485,0.0786,-0.19967,-0.200645,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
df_0 = df.fillna(0)

In [5]:
X = df_0.drop(columns='Rating_bin')
y = df_0['Rating_bin']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

## Baseline model: Decision Tree

In [6]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      7775
           1       0.40      0.39      0.39      1255

    accuracy                           0.83      9030
   macro avg       0.65      0.65      0.65      9030
weighted avg       0.83      0.83      0.83      9030



array([[7049,  726],
       [ 769,  486]])

## Metrics

In [7]:
def statistical_parity(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([np.mean([y_i for y_i, zi in zip(y_, Z) if zi == unp]) - np.mean([y_i for y_i, zi in zip(y_, Z) if zi == priv])
                   for unp in unpriv])

def average_odds(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([1/2*(np.mean([y_i for y_i, yi, zi in zip(y_, y, Z) if zi == unp and yi == 1]) - 
                           np.mean([y_i for y_i, yi, zi in zip(y_, y, Z) if zi == priv  and yi == 1]))+\
                   1/2*(np.mean([y_i for y_i, yi, zi in zip(y_, y, Z) if zi == unp and yi == 0]) - 
                           np.mean([y_i for y_i, yi, zi in zip(y_, y, Z) if zi == priv  and yi == 0]))
                   for unp in unpriv])
  
def average_predictive_value(y, y_, Z, priv=None):
  if priv is None:
    values = np.unique(Z)
    counts = [np.mean(y[Z==z]) for z in values]
    priv = values[np.argmax(counts)]
    unpriv = [z for z in values if z != priv]
    print('Automatic priviledged value is', priv)
  else:
    unpriv = [z for z in values if z != priv]
  
  return np.array([1/2*(np.mean([yi for y_i, yi, zi in zip(y_, y, Z) if zi == unp and y_i == 1]) - 
                           np.mean([yi for y_i, yi, zi in zip(y_, y, Z) if zi == priv  and y_i == 1]))+\
                   1/2*(np.mean([yi for y_i, yi, zi in zip(y_, y, Z) if zi == unp and y_i == 0]) - 
                           np.mean([yi for y_i, yi, zi in zip(y_, y, Z) if zi == priv  and y_i == 0]))
                   for unp in unpriv])
  
def consistency(X, y_, k, distance=lambda x: np.linalg.norm(x, 1)):
  D_matrix = np.array([[distance(xi-xj) for xj in X] for xi in X])
  N = np.argsort(D_matrix+np.eye(D_matrix.shape[0])*10**10, axis=0)[:, :k]
  i_consist = [abs(y_[i]-np.mean([y_[N[i,j]] for j in range(k)])) for i in range(y_.shape[0])]
  return 1 - np.mean(i_consist)

def theil_index(y, y_):
  b = (1-y+y_)/2
  b_ = np.mean(b)
  return np.mean(b/b_*np.log(b/b_+10**-10))

In [8]:
Z_train = X_train['Gender_M']
print('Statistical parity from data', statistical_parity(y_train, y_train, Z_train))

Automatic priviledged value is 1
Statistical parity from data [-0.02097122]


In [13]:
y_train_ = clf.predict(X_train)

In [14]:
print('Accuracy', accuracy_score(y_train, y_train_))
print('Statistical parity', statistical_parity(y_train, y_train_, Z_train))
print('Average odds', average_odds(y_train, y_train_, Z_train))
print('Average predictive value',average_predictive_value(y_train, y_train_, Z_train))
print('Consistency', consistency(X_train, y_train_, 5))
print('Theil index', theil_index(y_train, y_train_))

Accuracy 1.0
Automatic priviledged value is 1
Statistical parity [-0.02097122]
Automatic priviledged value is 1
Average odds [0.]
Automatic priviledged value is 1
Average predictive value [0.]


TypeError: unsupported operand type(s) for -: 'str' and 'str'