In [214]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
from hessians_calc import *
from demo_parity_calc import *
import copy


In [215]:
cols = ['status', 'duration', 'credit_hist', 'purpose', 'credit_amt', 'savings', 'employment', 'installment_rate', 'personal_status', 'debtors', 'residencesince', 'property', 'age', 'install_plans', 'housing', 'existing_credits', 'job', 'maintenance_paying_people', 'telephone', 'foreign_worker', 'result']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
df = pd.read_csv(url, delimiter=' ', names=cols, index_col=False)
df['result'] = df['result'].replace(2,0)

In [216]:
df.head()

Unnamed: 0,status,duration,credit_hist,purpose,credit_amt,savings,employment,installment_rate,personal_status,debtors,...,property,age,install_plans,housing,existing_credits,job,maintenance_paying_people,telephone,foreign_worker,result
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,0
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,0


In [217]:
def preprocess_german(df):
    df['status'] = df['status'].map({'A11': 0, 'A12': 1, 'A13': 2, 'A14': 3}).astype(int)
    df['credit_hist'] = df['credit_hist'].map({'A34': 0, 'A33': 1, 'A32': 2, 'A31': 3, 'A30': 4}).astype(int)
    df.loc[(df['credit_amt'] <= 2000), 'credit_amt'] = 0
    df.loc[(df['credit_amt'] > 2000) & (df['credit_amt'] <= 5000), 'credit_amt'] = 1
    df.loc[(df['credit_amt'] > 5000), 'credit_amt'] = 2
    df.loc[(df['duration'] <= 12), 'duration'] = 0
    df.loc[(df['duration'] > 12) & (df['duration'] <= 24), 'duration'] = 1
    df.loc[(df['duration'] > 24) & (df['duration'] <= 36), 'duration'] = 2
    df.loc[(df['duration'] > 36), 'duration'] = 3
    df['age'] = df['age'].apply(lambda x : 1 if x >= 45 else 0) # 1 if old, 0 if young
    df['savings'] = df['savings'].map({'A61': 0, 'A62': 1, 'A63': 2, 'A64': 3, 'A65': 4}).astype(int)
    df['employment'] = df['employment'].map({'A71': 0, 'A72': 1, 'A73': 2, 'A74': 3, 'A75': 4}).astype(int)
    df['gender'] = df['personal_status'].map({'A91': 1, 'A92': 0, 'A93': 1, 'A94': 1, 'A95': 0}).astype(int)
    df['debtors'] = df['debtors'].map({'A101': 0, 'A102': 1, 'A103': 2}).astype(int)
    df['property'] = df['property'].map({'A121': 3, 'A122': 2, 'A123': 1, 'A124': 0}).astype(int)
    df['install_plans'] = df['install_plans'].map({'A141': 1, 'A142': 1, 'A143': 0}).astype(int)
    df['job'] = df['job'].map({'A171': 0, 'A172': 1, 'A173': 2, 'A174': 3}).astype(int)
    df['telephone'] = df['telephone'].map({'A191': 0, 'A192': 1}).astype(int)
    df['foreign_worker'] = df['foreign_worker'].map({'A201': 1, 'A202': 0}).astype(int)
    pd.get_dummies(df, columns=['purpose', 'housing'], drop_first=True)

    return df

In [218]:
df = preprocess_german(df)

In [219]:
df = df.drop(["purpose", "housing", "personal_status"], axis=1)
x = df[['status', 'duration', 'credit_hist', 'credit_amt', 'savings',
       'employment', 'installment_rate', 'debtors', 'residencesince',
       'property', 'age', 'install_plans', 'existing_credits', 'job',
       'maintenance_paying_people', 'telephone', 'foreign_worker',
       'gender']]

y = df[['result']]

In [220]:
len(y)

1000

In [221]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=15)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [222]:
x_test_orig = copy.deepcopy(x_test)

In [223]:
def demographic_parity(x_train, y_train, x_test, y_test):
    # find the predictive probabilities for the protected and privileged groups based on 'age'
 
    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(x_train, y_train.values.ravel())

    x_test = x_test.reset_index(drop=True)
    y_pred_proba = logistic_model.predict_proba(x_test)

    privileged_gp_indices = x_test[x_test['age']==1].index
    protected_gp_indices = x_test[x_test['age']==0].index

    print(protected_gp_indices)
    print(privileged_gp_indices)

    # print(y_pred_proba[privileged_gp_indices, 1][y_pred_proba[privileged_gp_indices, 1] > 0.5])
    # print(y_pred_proba[privileged_gp_indices, 1][y_pred_proba[privileged_gp_indices, 1] < 0.5])
    # print(y_pred_proba[privileged_gp_indices])

    # print(y_pred_proba[protected_gp_indices, 1][y_pred_proba[protected_gp_indices, 1] > 0.5])
    # print(y_pred_proba[protected_gp_indices, 1][y_pred_proba[protected_gp_indices, 1] < 0.5])
    # print(y_pred_proba[protected_gp_indices])


    privileged_positive_pred = y_pred_proba[privileged_gp_indices, 1][y_pred_proba[privileged_gp_indices, 1] > 0.5].mean()
    protected_positive_pred = y_pred_proba[protected_gp_indices, 1][y_pred_proba[protected_gp_indices, 1] > 0.5].mean()

    return  privileged_positive_pred - protected_positive_pred


In [224]:
# print(demographic_parity(x_train, y_train, x_test, y_test))

In [225]:
print(x_train.shape)
print(y_train.shape)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

clf = eval('LogisticRegression')(input_size=x_train.shape[-1])
if isinstance(clf, LogisticRegression):
    loss_func = logistic_loss_torch


(800, 18)
(800, 1)


In [226]:
print(type(x_train))
print(type(y_train))
print(clf)
print(loss_func)

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
LogisticRegression(
  (lr): Linear(in_features=18, out_features=1, bias=True)
  (sm): Sigmoid()
)
<function logistic_loss_torch at 0x00000135B4FECA60>


In [227]:
print(type(x_test_orig))
print(type(x_test))
print(type(y_test))

print(x_test_orig.shape)
print(x_test.shape)
print(y_test.shape)


<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
(200, 18)
(200, 18)
(200, 1)


In [228]:
del_F_del_theta = del_predictive_parity_del_theta(clf, x_test_orig, x_test, y_test)

KeyError: 0

In [None]:
hessian_all_points = get_hessian_all_points(clf, x_train, y_train, loss_func)


[A

  0%|          | 0/800 [00:00<?, ?it/s]


KeyError: 0

In [None]:
del_L_del_theta = get_del_L_del_theta(clf, x_train, y_train, loss_func)

In [None]:
hinv_v, hinv = get_hinv_v(hessian_all_points, del_F_del_theta)