In [None]:
import os

# Work from the parent of the notebook's directory and make sure the folder
# that receives the raw downloads exists there.
# NOTE: this cell is not idempotent - every re-run moves one more level up.
os.chdir(os.pardir)
os.makedirs('data/raw', exist_ok=True)

In [None]:
# download the raw datasets
import gdown

# Target path -> Google Drive share link of every raw dataset.
RAW_DATASET_URLS = {
    'data/raw/adult.csv':
        'https://drive.google.com/file/d/1Wm9rRmHoo_QDDmDxfTB9_binlvpVWZQE/view?usp=share_link',
    'data/raw/arrhythmia.data':
        'https://drive.google.com/file/d/1ZT9i-G6MwAxNW4BR7jbr5rzYFZEI0xvw/view?usp=share_link',
    'data/raw/compas.csv':
        'https://drive.google.com/file/d/10SCpydGfBXLs0_4a9CSl6OfssMVHA6DL/view?usp=share_link',
    'data/raw/credit.csv':
        'https://drive.google.com/file/d/1xgKwn68gNHftZ2AtRQazZ7op-2f2Zdyv/view?usp=share_link',
    'data/raw/drug.data':
        'https://drive.google.com/file/d/1bZXLqHoC0kp9QwLQKpwkFXJmAvfX1ouM/view?usp=share_link',
    'data/raw/SouthGermanCredit.asc':
        'https://drive.google.com/file/d/1g_N-7v_W0Xe4Tdxn4e6Vm3L0n6tUGJs2/view?usp=share_link',
    'data/raw/taiwan.xls':
        'https://docs.google.com/spreadsheets/d/1vX9nM6A2MC5AIt3rXcExa5K6iFLgRG3o/edit?usp=share_link&ouid=115713880809429525240&rtpof=true&sd=true',
}

for target_path, share_url in RAW_DATASET_URLS.items():
    # These are ".../view" and ".../edit" share links, not direct-download URLs.
    # gdown only extracts the file id from such links when fuzzy=True is given;
    # without it the HTML viewer page is saved under the target name.
    downloaded = gdown.download(share_url, target_path, fuzzy=True)
    # Some gdown versions signal a failed download by returning None instead of
    # raising; stop here rather than fail later with a confusing parse error.
    if downloaded is None:
        raise RuntimeError(f'download failed for {target_path} ({share_url})')


# Data Cleaning & Preparation

In [None]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

from fair_robust_classifiers.datasets.data_utils import (split_label_sensitive_and_store_data,
                                                         print_eo_ratio,
                                                         plot_distributions_sunburst)

## Arrhythmia

In [None]:
# Column names for the header-less UCI arrhythmia file:
# 15 global attributes, 12 leads x 12 wave-width/flag attributes,
# 12 leads x 10 amplitude attributes, and finally the class code.
base_columns = ['age', 'sex', 'Height_cm', 'Weight_kg', 'QRS_duration', 'PR_interval',
                'QT_interval', 'T_interval', 'P_interval', 'QRS', 'T', 'P', 'QRST', 'J', 'Heartrate_per_minute']

# ECG leads, in the order in which their attribute groups appear in the file.
channels = ['DI','DII','DIII','AVR','AVL','AVF','V1','V2','V3','V4','V5','V6']

# Per-lead attributes of the first section (widths, deflection count, wave-shape flags).
channel_info1 = ['avg_width_Q_wave', 'avg_width_R_wave', 'avg_width_S_wave', 'avg_width_R\'_wave', 'avg_width_S\'_wave',
                 'num_intrinsic_deflections',
                 'exist_ragged_R_wave', 'exist_diphasic_R_wave',
                 'exist_ragged_P_wave', 'exist_diphasic_P_wave',
                 'exist_ragged_T_wave', 'exist_diphasic_T_wave']
# Per-lead attributes of the second section (amplitudes and QRSA/QRSTA).
channel_info2 = ['ampl_JJ_wave','ampl_Q_wave', 'ampl_R_wave', 'ampl_S_wave',
                 'ampl_R\'_wave','ampl_S\'_wave','ampl_P_wave','ampl_T_wave',
                 'QRSA','QRSTA']

# The file is laid out lead by lead: all channel_info1 values of DI, then of
# DII, ...; later all channel_info2 values of DI, then of DII, ...
# The lead therefore has to be the OUTER loop of each comprehension. With the
# measurement as the outer loop, the second column of the section would be
# named 'DII_avg_width_Q_wave' although it holds DI's R-wave width.
columns = (base_columns
           + [f'{c}_{ci}' for c in channels for ci in channel_info1]
           + [f'{c}_{ci}' for c in channels for ci in channel_info2]
           + ['arrhythmiaType'])
# Map every column name to its position in the raw file.
columns = {col:i for i, col in enumerate(columns)}
len(columns)

In [None]:
# Parse the header-less raw file: '?' marks a missing value and the column
# names are the keys of the `columns` mapping.
arrhythmia_path = "data/raw/arrhythmia.data"
df = pd.read_csv(
    arrhythmia_path,
    sep=',',
    header=None,
    index_col=None,
    na_values='?',
    decimal='.',
    names=list(columns),
)
df

In [None]:
# Keep only the attributes without any missing value, then remove the rows
# whose class code is 16.
complete_columns = df.dropna(axis='columns')
class16_rows = df.index[df['arrhythmiaType'] == 16]
clean_df = complete_columns.drop(index=class16_rows)
clean_df

In [None]:
# Binarize the target: class code 1 becomes the negative label (-1), every
# other remaining class code the positive label (+1).
neg_mask = clean_df['arrhythmiaType'] == 1
clean_df.loc[neg_mask, 'hasArrhythmia'] = -1
clean_df.loc[~neg_mask, 'hasArrhythmia'] = 1
# A column created through a partial .loc assignment is float (-1.0 / 1.0);
# cast it to int so the label is stored like the labels of the other datasets.
clean_df['hasArrhythmia'] = clean_df['hasArrhythmia'].astype(int)
# The multi-class code is no longer needed once the binary label exists.
clean_df = clean_df.drop('arrhythmiaType', axis=1)
clean_df

In [None]:
# Report the "EO ratio" of the label hasArrhythmia with respect to sex
# (project helper; presumably an equal-opportunity statistic - confirm in data_utils).
print_eo_ratio(clean_df, 'hasArrhythmia', 'sex')

In [None]:
# Sunburst chart for the label hasArrhythmia and the attribute sex
# (project helper; presumably shows their joint distribution - confirm in data_utils).
plot_distributions_sunburst(clean_df, 'hasArrhythmia', 'sex')

In [None]:
# Persist the cleaned frame without the index column.
# NOTE(review): the file name is spelled 'arrythmia' (one 'h'), unlike the
# dataset name used elsewhere; left unchanged because other code may read this path.
clean_df.to_csv('data/full_clean_arrythmia.csv', index=False)

In [None]:
# Hand the cleaned data to the project helper that splits off the label and the
# sensitive attribute and stores the result under the name "arrhythmia".
# The trailing None presumably requests a variant without a sensitive attribute - confirm in data_utils.
split_label_sensitive_and_store_data(clean_df,
                                     labels = ['hasArrhythmia'],
                                     sensitives = ['sex', None],
                                     dataset_name = "arrhythmia")

## COMPAS

In [None]:
# Load the raw COMPAS table; the first row of the file is the header.
compas_read_options = dict(sep=',', header=0, index_col=None,
                           decimal='.', skipinitialspace=False)
df = pd.read_csv("data/raw/compas.csv", **compas_read_options)
df

In [None]:
# data filtering (refer to https://github.com/propublica/compas-analysis)
# `idx` is the boolean keep-mask; every filter below narrows it further.
idx = np.full(len(df), True)

# If the charge date of a defendant's COMPAS-scored crime was not within 30 days
# of the arrest, we assume that, for data quality reasons, we do not have the
# right offense. Rows with a missing value fail both comparisons and are dropped.
idx &= (df["days_b_screening_arrest"]<=30) & (df["days_b_screening_arrest"]>=-30)

# We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
idx &= df["is_recid"] != -1

# In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' --
# will not result in jail time and are removed (only two of them).
idx &= df["c_charge_degree"] != "O" # F: felony, M: misdemeanor

# Remove rows without a usable COMPAS score text.
# pd.read_csv converts the markers "NA" / "N/A" to NaN by default, and
# NaN != "NA" evaluates to True, so a plain string comparison never removes
# anything. Test for missing values explicitly; the string check is kept for
# files that were read with NA parsing disabled.
idx &= df["score_text"].notna() & ~df["score_text"].isin(["NA", "N/A"])

# we will only consider blacks and whites for this analysis
idx &= df["race"].isin(["African-American", "Caucasian"])

df = df.loc[idx]
df

In [None]:
# Raw columns that are kept (five inputs plus the two-year recidivism target).
features = ["age_cat", "race", "sex", "priors_count", "c_charge_degree", 'two_year_recid']
# snake_case -> camelCase names for the columns that get renamed.
str_map = {
    'age_cat': "ageCat",
    'priors_count': "priorsCount",
    'c_charge_degree': "cChargeDegree",
    'two_year_recid': 'twoYearRecid',
}
# Select, drop incomplete rows, rename.
selected = df.loc[:, features]
clean_df = selected.dropna(axis=0).rename(columns=str_map)
clean_df

In [None]:
# Each encoded column is rebuilt in one step with np.where instead of writing
# integers into the existing column through .loc. 'sex', 'race' and
# 'cChargeDegree' hold strings, and assigning ints into a string column only
# works while pandas stores it as object dtype (it raises with the dedicated
# string dtype that pandas 3 uses by default).

# binarize target variables: 0 (no recidivism within two years) -> -1, otherwise +1
neg_mask = clean_df['twoYearRecid'] == 0
clean_df['twoYearRecid'] = np.where(neg_mask, -1, 1)

# binarize sensitive variables
mask = clean_df['sex'] == 'Female'
clean_df['sex'] = np.where(mask, 0, 1)    # 0: female, 1: male

mask = clean_df['race'] == 'African-American'
clean_df['race'] = np.where(mask, 0, 1)   # 0: African-American, 1: Caucasian

# binarize feature variables
neg_mask = clean_df['cChargeDegree'] == 'F'
clean_df['cChargeDegree'] = np.where(neg_mask, 0, 1)  # 0: 'F' (felony), 1: any other degree

clean_df

In [None]:
# "EO ratio" of twoYearRecid with respect to sex (project helper; presumably equal opportunity - confirm).
print_eo_ratio(clean_df, 'twoYearRecid', 'sex')

In [None]:
# Sunburst chart for twoYearRecid and sex (project helper).
plot_distributions_sunburst(clean_df, 'twoYearRecid', 'sex')

In [None]:
# "EO ratio" of twoYearRecid with respect to race (project helper; presumably equal opportunity - confirm).
print_eo_ratio(clean_df, 'twoYearRecid', 'race')

In [None]:
# Sunburst chart for twoYearRecid and race (project helper).
plot_distributions_sunburst(clean_df, 'twoYearRecid', 'race')

In [None]:
# Persist the cleaned COMPAS frame without the index column.
clean_df.to_csv('data/full_clean_compas.csv', index=False)

In [None]:
# Store the label / sensitive-attribute splits via the project helper, once per
# entry of `sensitives`; None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(clean_df,
                                     labels = ['twoYearRecid'],
                                     sensitives = ['sex', 'race', None],
                                     dataset_name = "compas")

## Adult

In [None]:
# Column name -> position for the adult data. The mapping is not passed to
# read_csv below: the names come from the header row of the file (header=0).
adult_column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus',
    'occupation', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss',
    'hoursPerWeek', 'nativeCountry', 'grossIncomeGEQ50k',
]
columns = dict(zip(adult_column_names, range(len(adult_column_names))))

df = pd.read_csv(
    "data/raw/adult.csv",
    sep=',',
    header=0,
    index_col=None,
    na_values='',
    decimal='.',
)
df

In [None]:
# drop incomplete rows and unused columns:
# fnlwgt is not used for classification since it is computed externally and is
# highly predictive of the class (for details, see documentation of the adult data)
df = df.dropna(axis='rows').drop(['fnlwgt', 'relationship'], axis='columns')

# reducing dimensionality of some very sparse features
mask = df["education"].isin(["Preschool", "1st-4th", "5th-6th", "7th-8th"])
df.loc[mask, "education"] = "prim-middle-school"

mask = df["education"].isin(["9th", "10th", "11th", "12th"])
df.loc[mask, "education"] = "high-school"

# The 0/1 and -1/+1 encodings below are built with np.where and assigned as
# whole columns. Writing integers into these string columns through .loc only
# works while pandas stores them as object dtype (it raises with the dedicated
# string dtype that pandas 3 uses by default).

# binarize sensitive variables
mask = df["nativeCountry"] == "United-States"
df['nativeCountry'] = np.where(mask, 1, 0)  # 1: United-States, 0: Non-United-States

mask = df["race"] == "White"
df['race'] = np.where(mask, 1, 0)           # 1: White, 0: Non-White

mask = df["sex"] == "Male"
df['sex'] = np.where(mask, 1, 0)            # 1: male, 0: female

# binarize target variables ('<=50K.' with a trailing dot is accepted as well)
neg_mask = df['grossIncomeGEQ50k'].isin(['<=50K','<=50K.'])
df['grossIncomeGEQ50k'] = np.where(neg_mask, -1, 1)
df

In [None]:
# "EO ratio" of grossIncomeGEQ50k with respect to race (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'grossIncomeGEQ50k', 'race')

In [None]:
# Sunburst chart for grossIncomeGEQ50k and race (project helper).
plot_distributions_sunburst(df, 'grossIncomeGEQ50k', 'race')

In [None]:
# "EO ratio" of grossIncomeGEQ50k with respect to sex (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'grossIncomeGEQ50k', 'sex')

In [None]:
# Sunburst chart for grossIncomeGEQ50k and sex (project helper).
plot_distributions_sunburst(df, 'grossIncomeGEQ50k', 'sex')

In [None]:
# "EO ratio" of grossIncomeGEQ50k with respect to nativeCountry (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'grossIncomeGEQ50k', 'nativeCountry')

In [None]:
# Sunburst chart for grossIncomeGEQ50k and nativeCountry (project helper).
plot_distributions_sunburst(df, 'grossIncomeGEQ50k', 'nativeCountry')

In [None]:
# Persist the cleaned adult frame without the index column.
df.to_csv('data/full_clean_adult.csv', index=False)

In [None]:
# Store the label / sensitive-attribute splits via the project helper, once per
# entry of `sensitives`; None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(df,
                                     labels = ['grossIncomeGEQ50k'],
                                     sensitives = ['race', 'sex', 'nativeCountry', None],
                                     dataset_name = "adult")

## German

In [None]:
# Names for the 21 columns of the South German Credit file, in file order.
columns = [
    'statusCheckAccount', 'durationMonth', 'creditHistory', 'purpose',
    'creditAmount', 'savings', 'employmentDuration', 'installmentRate',
    'statusSex', 'otherDebtors', 'presentResidence', 'property', 'age',
    'installmentPlans', 'housing', 'numberCredits', 'job', 'peopleLiable',
    'telephone', 'foreignWorker', 'creditRisk',
]

# Columns that are NOT converted to the categorical dtype later on.
non_categorical = {'durationMonth', 'creditAmount', 'statusSex', 'age',
                   'peopleLiable', 'telephone', 'foreignWorker', 'creditRisk'}

# Categorical features: every remaining column, kept in file order.
cat_ft = [name for name in columns if name not in non_categorical]

In [None]:
# Read the space-separated file; its own header row (header=0) is replaced by
# the names in `columns`. Rows with missing values are discarded.
german_raw = pd.read_csv(
    "data/raw/SouthGermanCredit.asc",
    sep=' ',
    header=0,
    index_col=None,
    names=columns,
)
df = german_raw.dropna(axis=0)
df

In [None]:
# Give every categorical feature the pandas "category" dtype, then list the
# resulting dtypes.
for cat_col in cat_ft:
    df[cat_col] = df[cat_col].astype("category")
df.dtypes

In [None]:
# binarize target variable: creditRisk 0 -> -1, any other value -> +1
neg_mask = df['creditRisk'] == 0
df.loc[neg_mask, 'creditRisk'] = -1
df.loc[~neg_mask, 'creditRisk'] = 1
df['creditRisk'] = df['creditRisk'].astype(int)

# binarize features variables (raw code 1 -> 0, any other code -> 1)
msk = df['telephone'] == 1
df.loc[msk, 'telephone'] = 0
df.loc[~msk, 'telephone'] = 1
df['telephone'] = df['telephone'].astype(int)

# raw code 2 -> 0, any other code -> 1
msk = df['peopleLiable'] == 2
df.loc[msk, 'peopleLiable'] = 0
df.loc[~msk, 'peopleLiable'] = 1
df['peopleLiable'] = df['peopleLiable'].astype(int)

# binarize sensitive variables
msk = df['foreignWorker'] == 1
df.loc[msk, 'foreignWorker'] = 0  # foreign
df.loc[~msk, 'foreignWorker'] = 1 # not-foreign
df['foreignWorker'] = df['foreignWorker'].astype(int)

# drop samples with unknown sex
# Code table of statusSex (personal_status_sex) in the South German Credit data:
#   1 = male: divorced/separated      2 = female: non-single OR male: single
#   3 = male: married/widowed         4 = female: single
# Level 2 is the only one that mixes both sexes, so it is the level to discard.
# Level 3 is unambiguously male and must be kept.
df_sex = df.drop(df.loc[df['statusSex'] == 2].index, axis='rows')
msk = df_sex['statusSex'] == 4
df_sex.loc[msk, 'sex'] = 0  # female (level 4)
df_sex.loc[~msk, 'sex'] = 1 # male (levels 1 and 3)
df_sex['sex'] = df_sex['sex'].astype(int)

# drop unused columns: df_frn keeps all samples, df_sex only those with a known sex
df_frn = df.dropna(axis='rows').drop(['statusSex'], axis='columns')
df_sex = df_sex.dropna(axis='rows').drop(['statusSex'], axis='columns')

In [None]:
# "EO ratio" of creditRisk with respect to foreignWorker on all samples (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df_frn, 'creditRisk', 'foreignWorker')

In [None]:
# Sunburst chart for creditRisk and foreignWorker on all samples (project helper).
plot_distributions_sunburst(df_frn, 'creditRisk', 'foreignWorker')

In [None]:
# Same statistic for foreignWorker, restricted to the samples kept in df_sex.
print_eo_ratio(df_sex, 'creditRisk', 'foreignWorker')

In [None]:
# Sunburst chart for creditRisk and foreignWorker, restricted to the samples kept in df_sex.
plot_distributions_sunburst(df_sex, 'creditRisk', 'foreignWorker')

In [None]:
# "EO ratio" of creditRisk with respect to the derived sex column (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df_sex, 'creditRisk', 'sex')

In [None]:
# Sunburst chart for creditRisk and the derived sex column (project helper).
plot_distributions_sunburst(df_sex, 'creditRisk', 'sex')

In [None]:
# Persist both cleaned variants without the index column:
# df_frn (all samples) and df_sex (samples with a derived sex column).
df_frn.to_csv('data/full_clean_german.csv', index=False)
df_sex.to_csv('data/full_clean_germanSex.csv', index=False)

In [None]:
# Store the splits of the full German data under the name "german" (project helper);
# None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(df_frn,
                                     labels = ['creditRisk'],
                                     sensitives = ['foreignWorker', None],
                                     dataset_name = "german")

In [None]:
# Store the splits of the sex-annotated subset under its own name "germanSex" (project helper);
# None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(df_sex,
                                     labels = ['creditRisk'],
                                     sensitives = ['foreignWorker','sex', None],
                                     dataset_name = "germanSex")

## Drug

In [None]:
# Names of the 32 columns of the drug consumption file, in file order:
# 13 person-level columns followed by one column per substance.
drug_column_names = [
    'id', 'age', 'gender', 'education', 'country', 'ethnicity',
    'neuroticism', 'extraversion', 'openness', 'agreeableness',
    'conscientiousness', 'impulsiveness', 'sensationSeeing',
    'alcohol', 'amphetamines', 'amylNitrite', 'benzodiazepine', 'caffeine',
    'cannabis', 'chocolate', 'cocaine', 'crack', 'ecstasy', 'heroin',
    'ketamine', 'legalHighs', 'lsd', 'methadone', 'mushrooms', 'nicotine',
    'semeron', 'volatileSubstance',
]

# Column name -> position in the raw file.
columns = {name: position for position, name in enumerate(drug_column_names)}

# First 13 columns describe the person, the remaining ones are the substances.
feat_cols = drug_column_names[:13]
drug_cols = drug_column_names[13:]

In [None]:
# Parse the header-less drug file, naming the columns after the keys of `columns`.
df = pd.read_csv(
    "data/raw/drug.data",
    sep=',',
    header=None,
    index_col=None,
    decimal='.',
    names=list(columns),
)
df

In [None]:
# binarize target variables: class 'CL0' -> -1 (non-user), any other class -> +1 (user)
# The encoded column is built with np.where and assigned as a whole. Writing
# integers into the string-valued "CLx" columns through .loc only works while
# pandas stores them as object dtype (it raises with the dedicated string dtype
# that pandas 3 uses by default).
for drug_col in drug_cols:
    neg_mask = (df[drug_col] == 'CL0')
    df[drug_col] = np.where(neg_mask, -1, 1)

# binarize sensitive variables (the raw file stores them as real-valued codes)
msk = df['gender'] == 0.48246
df['gender'] = np.where(msk, 0, 1)      # 0: female, 1: male

msk = df['ethnicity'] == -0.31685
df['ethnicity'] = np.where(msk, 1, 0)   # 1: white, 0: non-white

# drop unused columns (and incomplete rows), keeping feat_cols in sync
df = df.drop(['country', 'id'], axis=1).dropna(axis=0)
feat_cols.remove('country')
feat_cols.remove('id')
df

In [None]:
# "EO ratio" of heroin with respect to gender (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'heroin', 'gender')

In [None]:
# Sunburst chart for heroin and gender (project helper).
plot_distributions_sunburst(df, 'heroin', 'gender')

In [None]:
# "EO ratio" of heroin with respect to ethnicity (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'heroin', 'ethnicity')

In [None]:
# Sunburst chart for heroin and ethnicity (project helper).
plot_distributions_sunburst(df, 'heroin', 'ethnicity')

In [None]:
# "EO ratio" of amphetamines with respect to gender (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'amphetamines', 'gender')

In [None]:
# Sunburst chart for amphetamines and gender (project helper).
plot_distributions_sunburst(df, 'amphetamines', 'gender')

In [None]:
# "EO ratio" of amphetamines with respect to ethnicity (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'amphetamines', 'ethnicity')

In [None]:
# Sunburst chart for amphetamines and ethnicity (project helper).
plot_distributions_sunburst(df, 'amphetamines', 'ethnicity')

In [None]:
# Persist the cleaned drug frame (all substance columns included) without the index column.
df.to_csv('data/full_clean_drug.csv', index=False)

In [None]:
# Store the splits for the heroin task: person-level features plus the heroin label only
# (project helper; None presumably means "no sensitive attribute" - confirm in data_utils).
split_label_sensitive_and_store_data(df[feat_cols+['heroin']],
                                     labels = ['heroin'],
                                     sensitives = ['gender', 'ethnicity', None],
                                     dataset_name = "drug")

In [None]:
# Store the splits for the amphetamines task: person-level features plus the amphetamines label only.
# NOTE(review): this call uses dataset_name "drug", the same name as the heroin
# call in this notebook. Unless the helper also encodes the label in its output
# paths, one task's files may overwrite the other's - confirm in data_utils.
split_label_sensitive_and_store_data(df[feat_cols+['amphetamines']],
                                     labels = ['amphetamines'],
                                     sensitives = ['gender', 'ethnicity', None],
                                     dataset_name = "drug")

## Credit

In [None]:
# Load the credit data (first row is the header), discard the 'Single' column
# and every row that still has a missing value.
df = (
    pd.read_csv("data/raw/credit.csv", sep=',', header=0, index_col=None)
      .drop(columns='Single')
      .dropna(axis=0)
)
df

In [None]:
# Binarize the target: NoDefaultNextMonth 0 -> -1, any other value -> +1.
neg_mask = df['NoDefaultNextMonth'] == 0
df['NoDefaultNextMonth'] = np.where(neg_mask, -1, 1)

In [None]:
# "EO ratio" of NoDefaultNextMonth with respect to Age (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'NoDefaultNextMonth', 'Age')

In [None]:
# Sunburst chart for NoDefaultNextMonth and Age (project helper).
plot_distributions_sunburst(df, 'NoDefaultNextMonth', 'Age')

In [None]:
# Persist the cleaned credit frame without the index column.
df.to_csv('data/full_clean_credit.csv', index=False)

In [None]:
# Store the label / sensitive-attribute splits via the project helper;
# None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(df,
                                     labels = ['NoDefaultNextMonth'],
                                     sensitives = ['Age', None],
                                     dataset_name = "credit")

## Taiwan Credit

In [None]:
# Read the spreadsheet (second row holds the header, first column is the
# index), lower-case every column name and give the target its camelCase name.
df = (
    pd.read_excel('data/raw/taiwan.xls', header=1, index_col=0)
      .rename(columns=str.lower)
      .rename(columns={'default payment next month': 'defaultNextMonth'})
)
df

In [None]:
# binarize target variable: defaultNextMonth 0 -> -1, any other value -> +1
neg_mask = df['defaultNextMonth'] == 0
df['defaultNextMonth'] = np.where(neg_mask, -1, 1)

# binarize sensitive variables: raw code 2 -> 0 (female), any other code -> 1 (male)
msk = df['sex'] == 2
df['sex'] = np.where(msk, 0, 1)

df

In [None]:
# "EO ratio" of defaultNextMonth with respect to sex (project helper; presumably equal opportunity - confirm).
print_eo_ratio(df, 'defaultNextMonth', 'sex')

In [None]:
# Sunburst chart for defaultNextMonth and sex (project helper).
plot_distributions_sunburst(df, 'defaultNextMonth', 'sex')

In [None]:
# Persist the cleaned Taiwan credit frame without the index column.
df.to_csv('data/full_clean_taiwan.csv', index=False)

In [None]:
# Store the label / sensitive-attribute splits via the project helper;
# None presumably means "no sensitive attribute" - confirm in data_utils.
split_label_sensitive_and_store_data(df,
                                     labels = ['defaultNextMonth'],
                                     sensitives = ['sex', None],
                                     dataset_name = "taiwan")