In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from data import train_val_test_split_dataframe, split_save_data

# Shared keyword arguments that the (currently commented-out)
# split_save_data(...) calls in the dataset sections unpack with **data_kwargs.
data_kwargs = dict(
    y_col='label',
    train_size=0.6,
    val_size=0.2,
    test_size=0.2,
    normalize=True,
    random_state=42,
)

# COMPAS dataset

In [5]:
def load_compas(
    path='./dataset/compas/RecidivismData_Normalized.csv',
    response='two_year_recid',
    sensitive='race',
):
    """Read the COMPAS CSV and return the feature columns plus a ``label`` column.

    Only rows whose ``race`` code equals 2 or 3 are kept. ``race`` is then
    shifted down by 2 and ``sex`` is replaced by ``1 - sex``.

    Args:
        path: CSV file to read.
        response: Name of the column that is copied into ``label``.
        sensitive: Accepted but never read by the body; the filtering and
            recoding steps hard-code the ``race`` column.

    Returns:
        A DataFrame holding the eleven feature columns followed by ``label``,
        indexed like the retained rows of the input file.
    """
    feature_cols = [
        'MarriageStatus',
        'age',
        'sex',
        'race',
        'juv_fel_count',
        'juv_misd_count',
        'juv_other_count',
        'priors_count',
        'days_b_screening_arrest',
        'c_days_from_compas',
        'c_charge_degree',
    ]

    raw = pd.read_csv(path)

    # Original author's note: race code 2 is African-American.
    race_code = raw['race']
    kept = raw[race_code.eq(2) | race_code.eq(3)]

    # Original author's notes on the recoded values:
    #   race: African-American = 0.0, White = 1.0
    #   sex:  Female = 0.0, Male = 1.0
    recoded = kept.assign(
        race=kept['race'] - 2,
        sex=1 - kept['sex'],
    )

    # The label is taken after recoding, so it reflects the new values when
    # `response` names one of the recoded columns.
    recoded['label'] = recoded[response]
    return recoded[feature_cols + ['label']]

# Load the COMPAS frame using load_compas's default path and response column.
df = load_compas()
# describe() is the final expression of the cell (only comments follow), so the
# notebook renders the summary table.
df.describe()

# Export is disabled; uncomment to write the frame to ./data/compas.csv.
# df.to_csv('./data/compas.csv', index=False)

Unnamed: 0,MarriageStatus,age,sex,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,c_charge_degree,label
count,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0,5875.0
mean,1.546553,34.813787,0.802383,0.399489,0.003574,0.007555,0.007049,0.097097,0.233525,0.003703,0.653277,0.465872
std,1.436075,11.981488,0.398236,0.489835,0.022046,0.038746,0.030888,0.132179,0.25106,0.060169,0.475967,0.498876
min,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
25%,1.0,25.0,1.0,0.0,0.0,0.0,0.0,0.0,0.280761,0.000105,0.0,0.0
50%,1.0,31.0,1.0,0.0,0.0,0.0,0.0,0.052632,0.280761,0.000105,1.0,0.0
75%,1.0,43.0,1.0,1.0,0.0,0.0,0.0,0.131579,0.280761,0.000211,1.0,1.0
max,7.0,83.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# split_save_data(df, df.columns, 'data/compas/race', s_col='race', **data_kwargs)
# split_save_data(df, df.columns, 'data/compas/sex', s_col='sex', **data_kwargs)

# Adult dataset

In [40]:
def load_adult(
    path_prefix='./dataset/adult/IBM_adult',
    response='Income',
    sensitive='Gender',
    val=True,
    return_df=False,
):
    """Assemble the Adult table from three tab-separated files.

    ``<path_prefix>_X.txt`` supplies the feature columns (it has a header row),
    while ``<path_prefix>_A.txt`` and ``<path_prefix>_Y.txt`` are header-less
    files whose content becomes the ``gender`` and ``label`` columns. In all
    three files the first column is used as the row index.

    Args:
        path_prefix: Common path prefix of the ``_X``/``_A``/``_Y`` files.
        response: Not read by the body.
        sensitive: Not read by the body.
        val: Not read by the body.
        return_df: Not read by the body; a DataFrame is always returned.

    Returns:
        The feature DataFrame with ``gender`` and ``label`` columns appended.
    """
    def _read_headerless(suffix):
        # Header-less, tab-separated file indexed by its first column.
        return pd.read_csv(path_prefix + suffix, sep='\t', index_col=0, header=None)

    features = pd.read_csv(path_prefix + '_X.txt', sep='\t', index_col=0)
    features['gender'] = _read_headerless('_A.txt')  # original author's note: Male = 1
    features['label'] = _read_headerless('_Y.txt')

    return features

# Load the Adult frame with load_adult's defaults and persist it as one CSV
# (index=False: the row index is not written).
df = load_adult()
df.to_csv('./data/adult.csv', index=False)

# Keep the summary as the cell's final expression so the notebook renders it;
# when it preceded to_csv(...) its result was silently discarded.
df.describe()

In [7]:
# split_save_data(df, df.columns, 'data/adult/race', s_col='race', **data_kwargs)
# split_save_data(df, df.columns, 'data/adult/gender', s_col='gender', **data_kwargs)

# split_save_data(df, ['capital-gain', 'age', 'gender', 'label'], 'data/adult/sub/gender', s_col='gender', **data_kwargs)

# LSA dataset

In [2]:
# Positional column names for the header-less LSA spreadsheet. The original
# column note, with LSAT prepended so it matches lsa_cols one-to-one:
#   LSAT GPA Race resident college Year Gender admit Black Hispanic Asian White MissingRace URM enroll
# In lsa_cols, 'Race' / 'Gender' / 'admit' appear as 'race' / 'gender' / 'label'.
lsa_cols = ['LSAT', 'GPA', 'race', 'resident', 'college', 'Year', 'gender', 'label', 'Black', 'Hispanic', 'Asian', 'White', 'MissingRace', 'URM', 'enroll']

# df = pd.read_excel('dataset/LSA.xls', header=None)[[0, 1, 2, 6, 7]]
# header=None gives integer column labels 0..n-1; map them onto lsa_cols and
# drop the columns that are not used downstream.
df = (
    pd.read_excel('dataset/LSA.xls', header=None)
    .rename(columns=dict(enumerate(lsa_cols)))
    .drop(columns=['college', 'Year', 'MissingRace', 'URM', 'enroll'])
)

# Binarise race: White -> 0, every other listed group -> 1.
# Any value missing from the mapping (e.g. a blank cell) becomes NaN.
df['race'] = df['race'].map({
    'White': 0,
    'Asian': 1,
    'Black': 1,
    'Hispanic': 1,
})

# dropna() removes the rows whose race was not mapped, together with rows that
# have a missing value in any other column. The former `df.race != ' '` filter
# ran after map() had already made the column numeric/NaN, so it never
# excluded anything and has been removed.
df = df.dropna()

df.to_csv('./data/lsa.csv', index=False)

# Final expression of the cell so the summary is actually rendered.
df.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/LSA.xls'

In [33]:
# Reload the saved LSA table and expose the label array, the binary race array
# and the feature view as notebook-level names.
df = pd.read_csv('data/lsa.csv')
y = df['label'].to_numpy()
s = df['race'].to_numpy()
X = df[['LSAT', 'GPA', 'race', 'resident', 'gender', 'Black', 'Hispanic', 'Asian', 'White']]
# df.columns

# For each race column (treated here as a 0/1 indicator): its column sum and
# the mean label among rows where the indicator equals 1.
for group in ('White', 'Black', 'Hispanic', 'Asian'):
    members = df[df[group] == 1]
    print(f"{group} {df[group].sum()}, pos_ratio {members['label'].mean()}")

# Save the subset of rows flagged as White or Black.
white_black_mask = (df['White'].values == 1) | (df['Black'].values == 1)
df.loc[white_black_mask].to_csv('data/lsa_white_black.csv', index=False)

White 40989, pos_ratio 0.2962746102612896
Black 4621, pos_ratio 0.23198441895693572
Hispanic 3536, pos_ratio 0.21917420814479638
Asian 6302, pos_ratio 0.2365915582354808


In [53]:
def _print_group_stats(frame):
    """Print group size and positive-label rate for gender 0/1 and race 0/1.

    Race 0 is reported as "White" and race 1 as "Other", matching the binary
    encoding used when data/lsa.csv was written.
    """
    labels = frame['label'].to_numpy()
    race = frame['race'].to_numpy()
    print(f"Gender=0 {(frame['gender']==0).sum()} pos_ratio = {labels[frame['gender'].values==0].mean()}")
    print(f"Gender=1 {(frame['gender']==1).sum()} pos_ratio = {labels[frame['gender'].values==1].mean()}")
    print(f"White {(race==0).sum()}, pos_ratio = {(labels[race==0]==1).sum()/(race==0).sum()}")
    print(f"Other {(race==1).sum()}, pos_ratio = {(labels[race==1]==1).sum()/(race==1).sum()}")


# The former first line, `pd.read_csv('data/lsa.csv').head()`, was not the last
# expression of the cell, so it only re-read the file and threw the result
# away; it has been removed.
df = pd.read_csv('data/lsa.csv')
y = df['label'].to_numpy()
s = df['race'].to_numpy()
X = df[['LSAT', 'GPA', 'race', 'resident', 'gender', 'Black', 'Hispanic', 'Asian', 'White']]
# df.columns

# Statistics before any rows are removed.
_print_group_stats(df)

from utils import mask_to_idx, idx_to_mask

# Candidate rows to drop: positives (label == 1) with race == 1, and positives
# with gender == 1.
# NOTE(review): mask_to_idx presumably turns a boolean mask into integer row
# positions, and idx_to_mask the reverse - confirm against utils.
idx_drop_race_pos = mask_to_idx((df['label'].values == 1) & (df['race'].values == 1))
idx_drop_sex_pos = mask_to_idx((df['label'].values == 1) & (df['gender'].values == 1))
print(len(idx_drop_race_pos), len(idx_drop_sex_pos))

# Fraction of each candidate set that is removed.
drop_frac = 0.5

# Seed once, then sample race first and gender second; this call order
# determines which rows are selected, so it must not be rearranged.
np.random.seed(0)
idx_drop_race_pos = np.random.choice(idx_drop_race_pos, size=int(len(idx_drop_race_pos)*drop_frac), replace=False)
idx_drop_sex_pos = np.random.choice(idx_drop_sex_pos, size=int(len(idx_drop_sex_pos)*drop_frac), replace=False)
print(len(idx_drop_race_pos), len(idx_drop_sex_pos), len(idx_drop_race_pos) + len(idx_drop_sex_pos))

# Union of both samples; np.unique collapses rows picked by both, so the mask
# can cover fewer rows than the sum printed above.
idx_drop = np.unique(np.concatenate([idx_drop_race_pos, idx_drop_sex_pos]))
msk_drop = idx_to_mask(idx_drop, len(y))
print(msk_drop.sum())

# idx_prot_pos = mask_to_idx((df['label'].values == 1) & (df['race'].values == 1) & (df['gender'].values == 1))
# print (len(idx_prot_pos))

# np.random.seed(0)
# idx_prot_pos_drop = np.random.choice(idx_prot_pos, size=int(len(idx_prot_pos)*1), replace=False)
# msk_drop = idx_to_mask(idx_prot_pos_drop, len(y))

# Persist the retained rows, then reload them so df / y / s describe the
# subsampled dataset from here on (X still refers to the full table).
df[~msk_drop].to_csv('data/lsa_unfair_gender_race.csv', index=False)

df = pd.read_csv('data/lsa_unfair_gender_race.csv')
y = df['label'].to_numpy()
s = df['race'].to_numpy()

# Statistics after the rows were removed.
_print_group_stats(df)

Gender=0 24604 pos_ratio = 0.28097057389042435
Gender=1 30844 pos_ratio = 0.27781740370898717
White 40989, pos_ratio = 0.2962746102612896
Other 14459, pos_ratio = 0.23085967217649905
3338 8569
1669 4284 5953
5548
Gender=0 23717 pos_ratio = 0.2540793523632837
Gender=1 26183 pos_ratio = 0.14925715158690753
White 37545, pos_ratio = 0.2317219336795845
Other 12355, pos_ratio = 0.09987859166329421


In [35]:
from data import FairDataset

# Pick which prepared CSV under ./data/ to load.
# dataname = 'adult'
# dataname = 'compas'
dataname = 'lsa'

# Constructor arguments gathered in one place, in the same spirit as
# data_kwargs at the top of the notebook.
fair_dataset_kwargs = dict(
    dataname=dataname,
    csv_path=f'./data/{dataname}.csv',
    s_col='gender',
    y_col='label',
    normalize=True,
    random_state=42,
    x_with_s=False,
)

# NOTE: the variable is named `adult`, but it holds whichever dataset
# `dataname` selects (currently 'lsa').
adult = FairDataset(**fair_dataset_kwargs)

adult.describe()
adult.brief()

# (X_train, y_train, s_train), (X_val, y_val, s_val), (X_test, y_test, s_test) = adult.data

Dataset    : lsa (55448, 10) load from ./data/lsa.csv
Sens/Res   : gender/label
Split      : train/val/test = 0.6/0.2/0.2, random_state = 42, x_with_s = False
train data [#samples 33268 #features 8]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 | 13365 |  5141 |     0.2778 |
+-----+-------+-------+------------+
| s=1 | 10615 |  4147 |     0.2809 |
+-----+-------+-------+------------+
val data [#samples 11090 #features 8]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 |  4455 |  1714 |     0.2778 |
+-----+-------+-------+------------+
| s=1 |  3538 |  1383 |     0.281  |
+-----+-------+-------+------------+
test data [#samples 11090 #features 8]:
+-----+-------+-------+------------+
|     |   y=0 |   y=1 |   pos_rate |
| s=0 |  4455 |  1714 |     0.2778 |
+-----+-------+-------+------------+
| s=1 |  3538 |  1383 |     0.281  |
+-----+-------+-------+------------+

Dataset    : lsa (55448, 10) load from ./data/lsa.csv


# Bank Dataset

In [None]:
def load_compas(
    path='./dataset/compas/RecidivismData_Normalized.csv',
    response='two_year_recid',
    sensitive='race',
):
    """Load the COMPAS CSV, recode ``race``/``sex`` and attach a ``label`` column.

    NOTE(review): this cell sits under the "# Bank Dataset" heading, yet it
    reads the COMPAS file and rebinds the name ``load_compas`` that the COMPAS
    section already defines with the same logic. It looks like a copied
    placeholder - confirm before relying on it.

    Args:
        path: CSV file to read.
        response: Column whose values are copied into ``label``.
        sensitive: Accepted but never used by the body.

    Returns:
        DataFrame restricted to rows with ``race`` code 2 or 3, containing the
        eleven feature columns followed by ``label``.
    """
    df = pd.read_csv(path)

    # Original author's note: race code 2 is African-American.
    race_is_2_or_3 = (df['race'] == 2) | (df['race'] == 3)

    # Original author's notes on the recoded values:
    #   race: African-American = 0.0, White = 1.0
    #   sex:  Female = 0.0, Male = 1.0
    df = df[race_is_2_or_3].assign(
        race=lambda frame: frame['race'] - 2,
        sex=lambda frame: 1 - frame['sex'],
    )

    # Copy the response after recoding so it sees the updated columns.
    df = df.assign(label=df[response])

    keep = [
        'MarriageStatus',
        'age',
        'sex',
        'race',
        'juv_fel_count',
        'juv_misd_count',
        'juv_other_count',
        'priors_count',
        'days_b_screening_arrest',
        'c_days_from_compas',
        'c_charge_degree',
    ]
    return df[keep + ['label']]

# NOTE(review): this cell is under the "# Bank Dataset" heading and has no
# execution count, but it loads the COMPAS frame - presumably a placeholder
# copied from the COMPAS section; confirm.
df = load_compas()
# Final expression of the cell (only comments follow), so the summary table is
# what the notebook would render.
df.describe()

# Export is disabled; uncomment to write the frame to ./data/compas.csv.
# df.to_csv('./data/compas.csv', index=False)

In [None]:
# split_save_data(df, df.columns, 'data/compas/race', s_col='race', **data_kwargs)
# split_save_data(df, df.columns, 'data/compas/sex', s_col='sex', **data_kwargs)

# German Dataset