In [16]:
import numpy as np
import pandas as pd
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split


In [17]:
def load_data(
    train_url='https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    test_url='https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
):
    """Read the Adult train and test files into two DataFrames.

    Both files are header-less, comma-separated, and use ", " between fields,
    so column names are supplied here and leading spaces are stripped.

    Parameters
    ----------
    train_url, test_url : str or path-like
        Anything ``pd.read_csv`` accepts (URL or local path). The defaults
        point at the UCI repository, so calling ``load_data()`` with no
        arguments needs network access.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``, both with the same 15 named columns.

    Notes
    -----
    No cleaning is done here. As the notebook output shows, the test file
    starts with a '|1x3 Cross validator' line that is parsed as row 0, and its
    income labels end with a period; callers are expected to handle both.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race',
        'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
        'native_country', 'income',
    ]

    # Identical parsing options for both files.
    read_options = dict(names=column_names, sep=',', skipinitialspace=True)

    train_data = pd.read_csv(train_url, **read_options)
    test_data = pd.read_csv(test_url, **read_options)

    return train_data, test_data

In [18]:
# Fetch both raw frames; with its default arguments load_data reads from remote URLs
train_data, test_data = load_data()

In [19]:
# Peek at the first rows of the raw training frame
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
# Peek at the raw test frame; in the displayed output row 0 holds the file's
# '|1x3 Cross validator' line (all other fields empty) rather than a real record
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


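We are predicting income from demographic attributes. An initial pass below one-hot encodes race and sex; the feature set actually used afterwards is sex, years of education (`education_num`), and age.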

In [21]:
# Label distribution in the raw training frame; dropna=False keeps missing labels in the tally
train_data['income'].value_counts(dropna=False)

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [22]:
# Label distribution in the raw test frame; dropna=False keeps missing labels in the tally
# (the displayed labels end with '.', unlike the training labels)
test_data['income'].value_counts(dropna=False)

<=50K.    12435
>50K.      3846
NaN           1
Name: income, dtype: int64

In [23]:
# Exploratory encoding: indicator columns for race and sex (first level of each dropped),
# with income turned into a 0/1 flag where 1 means '>50K'.
train = pd.get_dummies(data=train_data, columns=['race', 'sex'], drop_first=True)
train['income'] = train['income'].map(lambda label: 1 if label == '>50K' else 0)
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,income,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Male
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,2174,0,40,United-States,0,0,0,0,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,0,0,13,United-States,0,0,0,0,1,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,0,0,40,United-States,0,0,0,0,1,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,0,0,40,United-States,0,0,1,0,0,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,0,0,40,Cuba,0,0,1,0,0,0


In [24]:
# Sensitive attribute: sex, encoded as 0 = Male and 1 = Female
# Target: income - 1 if the income is greater than 50K and 0 otherwise
# The covariates are:
#   education_num - number of years of education
#   age - binned to the decade (e.g. 39 -> 30)

columns = ['sex', 'education_num', 'age']
sex_codes = {'Male': 0, 'Female': 1}

# .copy() makes an independent frame, so the column assignments below never
# target a slice of train_data (the chained-assignment pattern pandas warns about).
train = train_data[columns].copy()
train['income'] = train_data['income'].apply(lambda x: 1 if x == '>50K' else 0)
train['sex'] = train['sex'].map(sex_codes)
# Represent age by decade
train['age'] = (train['age'] // 10) * 10

# Row 0 of the raw test frame is the '|1x3 Cross validator' line, not a record;
# drop it (and any remaining rows with missing values) before encoding.
test = test_data[columns].drop(index=0).dropna().copy()

# That non-numeric first row forces the age column to be read as text, so cast it back.
test['age'] = test['age'].astype(int)
# Test labels carry a trailing period ('>50K.'); assignment aligns on the row index.
test['income'] = test_data['income'].apply(lambda x: 1 if x == '>50K.' else 0)
test['sex'] = test['sex'].map(sex_codes)
# Represent age by decade
test['age'] = (test['age'] // 10) * 10



In [25]:
# Preview the encoded training features
train.head()

Unnamed: 0,sex,education_num,age,income
0,0,13,30,0
1,0,13,50,0
2,0,9,30,0
3,0,7,50,0
4,1,13,20,0


In [26]:
# Preview the encoded test features
test.head()

Unnamed: 0,sex,education_num,age,income
1,0,7.0,20,0
2,0,9.0,30,0
3,0,12.0,20,1
4,0,10.0,40,1
5,1,10.0,10,0


In [27]:
# Binary label balance of the encoded train and test frames, shown side by side
train['income'].value_counts(), test['income'].value_counts()

(0    24720
 1     7841
 Name: income, dtype: int64,
 0    12435
 1     3846
 Name: income, dtype: int64)

In [32]:
# Simulate the decision variable - in this case whether an individual should receive a loan

def loan_decision(row, rng=None):
    """Draw a simulated, deliberately gender-biased loan decision for one person.

    The approval probability is ``0.3 + 0.1 * income + 0.02 * education_num``,
    plus ``0.2`` when ``sex == 0`` (Male), capped at 1. A single uniform draw
    is then compared against that probability.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'income', 'education_num' and 'sex'.
    rng : object with a ``random()`` method, optional
        Source of the uniform draw, e.g. ``numpy.random.default_rng(seed)``.
        When None (the default) numpy's global generator is used through
        ``np.random.rand()``, exactly as before this parameter existed.

    Returns
    -------
    int
        1 if the loan is approved, 0 otherwise.
    """
    base_prob = 0.3  # baseline of getting a loan
    income_factor = 0.1 * row['income']
    education_factor = 0.02 * row['education_num']

    # Gender bias - making it unfair: only sex == 0 (Male) gets the bonus
    gender_bias = 0.2 if row['sex'] == 0 else 0

    loan_prob = base_prob + income_factor + education_factor + gender_bias
    loan_prob = min(loan_prob, 1)

    # An injected generator makes the simulation reproducible without
    # touching numpy's global random state.
    draw = np.random.rand() if rng is None else rng.random()
    return int(draw < loan_prob)
    
# Seed numpy's global generator so the simulated decisions are identical on
# every Restart & Run All; without it each run produces different labels.
np.random.seed(42)
train['loan_approved'] = train.apply(loan_decision, axis=1)
test['loan_approved'] = test.apply(loan_decision, axis=1)

print(train.head())

   sex  education_num  age  income  loan_approved
0    0             13   30       0              1
1    0             13   50       0              1
2    0              9   30       0              1
3    0              7   50       0              1
4    1             13   20       0              1


In [33]:
# Overall tally of simulated loan decisions in the training frame (1 = approved)
train['loan_approved'].value_counts()


1    21548
0    11013
Name: loan_approved, dtype: int64

In [35]:
# Among approved training rows, count each sex code (0 = Male, 1 = Female)
train.loc[train['loan_approved'] == 1, 'sex'].value_counts()

0    16102
1     5446
Name: sex, dtype: int64

In [36]:
# One-hot encode the age decade. Each frame is encoded on its own, so a decade
# missing from one of them would leave the two frames with different columns;
# both are therefore reindexed to the combined column list, filling any
# indicator that was absent with 0. When the columns already match this is a no-op.
train = pd.get_dummies(train, columns=['age'], prefix='age')
test = pd.get_dummies(test, columns=['age'], prefix='age')

# sort=False keeps train's column order and appends any test-only columns after it.
dummy_columns = train.columns.union(test.columns, sort=False)
train = train.reindex(columns=dummy_columns, fill_value=0)
test = test.reindex(columns=dummy_columns, fill_value=0)

In [37]:
# Preview the training frame after one-hot encoding age
train.head()

Unnamed: 0,sex,education_num,income,loan_approved,age_10,age_20,age_30,age_40,age_50,age_60,age_70,age_80,age_90
0,0,13,0,1,0,0,1,0,0,0,0,0,0
1,0,13,0,1,0,0,0,0,1,0,0,0,0
2,0,9,0,1,0,0,1,0,0,0,0,0,0
3,0,7,0,1,0,0,0,0,1,0,0,0,0
4,1,13,0,1,0,1,0,0,0,0,0,0,0


In [16]:
# Pool the train and test frames, then carve the pool at random into four partitions:
#   Dlearn (14,653 rows), Dtrain (14,652 rows), Dtest (9,768 rows), Dvalidate (9,769 rows)
all_data = pd.concat((train, test), ignore_index=True)

# First cut: Dlearn versus everything else
Dlearn, remaining = train_test_split(all_data, random_state=42, train_size=14653)

# Second cut: Dtrain versus the rows that will become Dtest and Dvalidate
Dtrain, temp = train_test_split(remaining, random_state=42, train_size=14652)

# Final cut: Dtest takes 9,768 rows and Dvalidate receives what is left
Dtest, Dvalidate = train_test_split(temp, random_state=42, train_size=9768)

# Report the shape of every partition
for split_label, split_frame in (
    ("Dlearn", Dlearn),
    ("Dtrain", Dtrain),
    ("Dtest", Dtest),
    ("Dvalidate", Dvalidate),
):
    print(f"{split_label}:", split_frame.shape)


Dlearn: (14653, 4)
Dtrain: (14652, 4)
Dtest: (9768, 4)
Dvalidate: (9769, 4)


In [17]:
# Preview the Dtrain partition (row labels come from the pooled frame's index)
Dtrain.head()

Unnamed: 0,sex,education_num,age,income
32494,1,9.0,80,0
30876,1,9.0,50,0
44872,1,13.0,20,0
14731,0,10.0,30,0
11811,0,9.0,60,0


In [18]:
# Income label balance within each of the four partitions, in split order
tuple(part["income"].value_counts() for part in (Dlearn, Dtrain, Dtest, Dvalidate))

(income
 0    11179
 1     3474
 Name: count, dtype: int64,
 income
 0    11193
 1     3459
 Name: count, dtype: int64,
 income
 0    7450
 1    2318
 Name: count, dtype: int64,
 income
 0    7333
 1    2436
 Name: count, dtype: int64)

In [19]:
import os
# Create the output directory (and any missing parents); exist_ok=True means
# an already-existing directory is not an error
os.makedirs("../data/adult", exist_ok=True)

In [20]:
# Write each partition to its own CSV file, leaving the row index out
for csv_path, partition in (
    ("../data/adult/train_basis.csv", Dlearn),
    ("../data/adult/train_betahats.csv", Dtrain),
    ("../data/adult/test.csv", Dtest),
    ("../data/adult/validate.csv", Dvalidate),
):
    partition.to_csv(csv_path, index=False)