In [221]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from hessians_calc import *
from demo_parity_calc import *
import copy

In [222]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation',\
            'relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']

In [223]:
def process_adult(df):
    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace(' ?',np.nan)
    df['workclass'] = df['workclass'].replace(' ?',np.nan)
    df['occupation'] = df['occupation'].replace(' ?',np.nan)
    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['income'] = df['income'].map({' <=50K': 0, ' >50K': 1}).astype(int)
    df['age'] = df['age'].apply(lambda x : 1 if x >= 45 else 0) # 1 if old, 0 if young
    df['workclass'] = df['workclass'].map({' Never-worked': 0, ' Without-pay': 1, ' State-gov': 2, ' Local-gov': 3, ' Federal-gov': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' Private': 7}).astype(int)
    df['education'] = df['education'].map({' Preschool': 0, ' 1st-4th': 1, ' 5th-6th': 2, ' 7th-8th': 3, ' 9th': 4, ' 10th': 5, ' 11th': 6, ' 12th': 7, ' HS-grad':8, ' Some-college': 9, ' Bachelors': 10, ' Prof-school': 11, ' Assoc-acdm': 12, ' Assoc-voc': 13, ' Masters': 14, ' Doctorate': 15}).astype(int)
    df['marital'] = df['marital'].map({' Married-civ-spouse': 2, ' Divorced': 1, ' Never-married': 0, ' Separated': 1, ' Widowed': 1, ' Married-spouse-absent': 2, ' Married-AF-spouse': 2}).astype(int)
    df['relationship'] = df['relationship'].map({' Wife': 1 , ' Own-child': 0 , ' Husband': 1, ' Not-in-family': 0, ' Other-relative': 0, ' Unmarried': 0}).astype(int)
    df['race'] = df['race'].map({' White': 1, ' Asian-Pac-Islander': 0, ' Amer-Indian-Eskimo': 0, ' Other': 0, ' Black': 0}).astype(int)
    df['gender'] = df['gender'].map({' Male': 1, ' Female': 0}).astype(int)
    # process hours
    df.loc[(df['hours'] <= 40), 'hours'] = 0
    df.loc[(df['hours'] > 40), 'hours'] = 1
    df = df.drop(columns=['fnlwgt', 'education.num', 'occupation', 'country', 'capgain', 'caploss'])
    df = df.reset_index(drop=True)
    return df


In [224]:
df_train = pd.read_csv('adult.data', names = cols, sep=",")

In [225]:
df_test = pd.read_csv('adult.test', names = cols, sep = ",")

In [226]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital,occupation,relationship,race,gender,capgain,caploss,hours,country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [227]:
# Check if 'income' column has missing values (NaN)
income_missing_values = df_train['income'].isna().any()

# Display if 'income' column has missing values
if income_missing_values:
    print("The 'income' column has missing values.")
else:
    print("The 'income' column does not have any missing values.")

The 'income' column does not have any missing values.


In [228]:
unique_income_values = df_train['income'].unique()

print(unique_income_values)

[' <=50K' ' >50K']


In [229]:
unique_income_values = df_train['education'].unique()

print(unique_income_values)

[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']


In [230]:
df_train = process_adult(df_train)

In [231]:
df_train.head()

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours,income
0,0,2,10,0,0,1,1,0,0
1,1,6,10,2,1,1,1,0,0
2,0,7,8,1,0,1,1,0,0
3,1,7,6,2,1,0,1,0,0
4,0,7,10,2,1,0,0,0,0


In [232]:
x_train = df_train[['age','workclass','education','marital','relationship','race','gender','hours']]
y_train = df_train[['income']]

In [233]:
unique_income_values = df_test['income'].unique()

print(unique_income_values)

[nan ' <=50K.' ' >50K.']


In [234]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital,occupation,relationship,race,gender,capgain,caploss,hours,country,income
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [235]:
def process_adult_test(df):
    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace(' ?',np.nan)
    df['workclass'] = df['workclass'].replace(' ?',np.nan)
    df['occupation'] = df['occupation'].replace(' ?',np.nan)
    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['age'] = df['age'].astype(int)
    df['income'] = df['income'].map({' <=50K.': 0, ' >50K.': 1}).astype(int)
    df['age'] = df['age'].apply(lambda x : 1 if x >= 45 else 0) # 1 if old, 0 if young
    df['workclass'] = df['workclass'].map({' Never-worked': 0, ' Without-pay': 1, ' State-gov': 2, ' Local-gov': 3, ' Federal-gov': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' Private': 7}).astype(int)
    df['education'] = df['education'].map({' Preschool': 0, ' 1st-4th': 1, ' 5th-6th': 2, ' 7th-8th': 3, ' 9th': 4, ' 10th': 5, ' 11th': 6, ' 12th': 7, ' HS-grad':8, ' Some-college': 9, ' Bachelors': 10, ' Prof-school': 11, ' Assoc-acdm': 12, ' Assoc-voc': 13, ' Masters': 14, ' Doctorate': 15}).astype(int)
    df['marital'] = df['marital'].map({' Married-civ-spouse': 2, ' Divorced': 1, ' Never-married': 0, ' Separated': 1, ' Widowed': 1, ' Married-spouse-absent': 2, ' Married-AF-spouse': 2}).astype(int)
    df['relationship'] = df['relationship'].map({' Wife': 1 , ' Own-child': 0 , ' Husband': 1, ' Not-in-family': 0, ' Other-relative': 0, ' Unmarried': 0}).astype(int)
    df['race'] = df['race'].map({' White': 1, ' Asian-Pac-Islander': 0, ' Amer-Indian-Eskimo': 0, ' Other': 0, ' Black': 0}).astype(int)
    df['gender'] = df['gender'].map({' Male': 1, ' Female': 0}).astype(int)
    # process hours
    df.loc[(df['hours'] <= 40), 'hours'] = 0
    df.loc[(df['hours'] > 40), 'hours'] = 1
    df = df.drop(columns=['fnlwgt', 'education.num', 'occupation', 'country', 'capgain', 'caploss'])
    df = df.reset_index(drop=True)
    return df


In [236]:
df_test = process_adult_test(df_test)

In [237]:
df_test.head()

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours,income
0,0,7,6,0,0,0,1,0.0,0
1,0,7,8,2,1,1,1,1.0,0
2,0,3,12,2,1,1,1,0.0,1
3,0,7,9,2,1,0,1,0.0,1
4,0,7,5,0,0,1,1,0.0,0


In [238]:
x_test = df_test[['age','workclass','education','marital','relationship','race','gender','hours']]
y_test = df_test[['income']]

In [244]:
x_test.head()

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours
0,0,7,6,0,0,0,1,0.0
1,0,7,8,2,1,1,1,1.0
2,0,3,12,2,1,1,1,0.0
3,0,7,9,2,1,0,1,0.0
4,0,7,5,0,0,1,1,0.0


In [245]:
y_test.head()

Unnamed: 0,income
0,0
1,0
2,1
3,1
4,0


In [239]:
clf_name = 'LogisticRegression'

In [246]:
def demographic_parity(x_train_temp, y_train_temp, x_test_temp):
    # find the predictive probabilities for the protected and privileged groups based on 'age'
    sc = StandardScaler()
    x_test_scalar_temp = sc.fit_transform(x_test_temp)
    x_train_scalar_temp = sc.fit_transform(x_train_temp)
    y_train_scalar_temp = sc.fit_transform(y_train_temp)
    clf = eval(clf_name)(input_size=x_train_temp.shape[-1])
    clf.fit(x_train_scalar_temp, y_train_scalar_temp)

    y_pred_proba_temp = clf.predict_proba(x_test_scalar_temp)

    privileged_gp_indices = x_test_temp[x_test_temp['gender'] == 1].index
    protected_gp_indices = x_test_temp[x_test_temp['gender'] == 0].index  # male: 1, female: 0

    privileged_positive_pred = 0
    for x in range(len(privileged_gp_indices)):
        privileged_positive_pred += y_pred_proba_temp[privileged_gp_indices[x]]
    privileged_positive_pred /= len(privileged_gp_indices)

    protected_positive_pred = 0
    for x in range(len(protected_gp_indices)):
        protected_positive_pred += y_pred_proba_temp[protected_gp_indices[x]]
    protected_positive_pred /= len(protected_gp_indices)

    return  protected_positive_pred - privileged_positive_pred

In [243]:
dp_ori = demographic_parity(x_train, y_train, x_test)
print(dp_ori)

0.0002747911645170076
