# Census Data: Income Prediction
Data Source: https://archive.ics.uci.edu/ml/datasets/Census+Income

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import random

# Seed both global RNGs (stdlib `random` and NumPy) with the same value so
# that a fresh "Restart & Run All" reproduces the sampled data below.
SEED = 0
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

In [2]:
# Column names for the raw files, which are read without a header row.
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# skipinitialspace=True drops the blank that follows each comma separator.
train = pd.read_csv('data/adult.data', header=None, names=cols, skipinitialspace=True)
# skiprows=1: the first line of the test file is skipped before parsing.
test = pd.read_csv('data/adult.test', header=None, names=cols, skiprows=1, skipinitialspace=True)

# Test labels are expected to end with a '.' (e.g. '<=50K.') while the training
# labels do not. Strip only that character rather than blindly cutting the last
# one, so labels that are already clean are left intact.
test['income'] = test['income'].astype(str).str.rstrip('.')

# fnlwgt should not be used for prediction
train = train.drop('fnlwgt', axis=1)
test = test.drop('fnlwgt', axis=1)
train.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# The encoder is fitted on the training split only. With handle_unknown='ignore'
# a category that was not seen during fit is encoded as an all-zero group
# instead of raising at transform time.
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(train[categorical_features])


def _one_hot_design_matrix(frame):
    """Return the model inputs for ``frame``.

    The categorical columns are replaced by their one-hot encoding (appended
    after the remaining columns, labelled 0..k-1) and the ``income`` label
    column is removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column in ``categorical_features`` and ``income``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``frame``, carrying the same index.
    """
    # Reuse the source index so the column-wise concat lines rows up even if
    # `frame` does not have a default 0..n-1 index.
    one_hot = pd.DataFrame(encoder.transform(frame[categorical_features]), index=frame.index)
    remaining = frame.drop(categorical_features, axis=1)
    return pd.concat([remaining, one_hot], axis=1).drop('income', axis=1)


train_X = _one_hot_design_matrix(train)
test_X = _one_hot_design_matrix(test)
train_X.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,0,1,2,3,4,...,92,93,94,95,96,97,98,99,100,101
0,39,13,2174,0,40,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50,13,0,0,13,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,38,9,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,53,7,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,28,13,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Fit a logistic regression with default hyper-parameters on the encoded
# training features; fit() returns the estimator itself, so `m` is the
# fitted model.
m = LogisticRegression().fit(train_X, train['income'])
m



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [5]:
# Predicted income label for every row of the test split.
prediction = m.predict(test_X)

# Compare the predictions with the true test labels.
accuracy = metrics.accuracy_score(test['income'], prediction)
confusion = metrics.confusion_matrix(test['income'], prediction)
print('Accuracy: ', accuracy)
print('Confusion:\n', confusion)

Accuracy:  0.8516061666973773
Confusion:
 [[11596   839]
 [ 1577  2269]]


In [6]:
# Copy of the test split with one extra boolean column telling whether the
# model's prediction matched the true income label for that row.
causal_data = test.assign(is_correct=test['income'] == prediction)

In [7]:
# Preview the first five rows, including the new `is_correct` column.
causal_data.head(5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,is_correct
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,True
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,True
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,False
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,True
4,18,?,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,True


In [8]:
# Discretise every numeric column into intervals. A list gives explicit bin
# edges; an integer asks pd.cut for that many equal-width bins.
bin_specs = {
    'education-num': [-np.inf, 2, 5, 8, 11, 14, np.inf],
    'capital-gain': [-np.inf, 0, 500, 1000, 5000, 10000, 25000, np.inf],
    'capital-loss': [-np.inf, -50, 0, 100, 1000, np.inf],
    'hours-per-week': 5,
    'age': 15,
}
for column_name, bins in bin_specs.items():
    causal_data[column_name] = pd.cut(causal_data[column_name], bins)

In [9]:
# Remove '=' from the income labels ('<=50K' becomes '<50K').
# NOTE(review): presumably because '=' is the record terminator in the export
# written further down - confirm.
income_labels = causal_data['income'].astype(str)
causal_data['income'] = income_labels.str.replace('=','')
causal_data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,is_correct
0,"(21.867, 26.733]",Private,11th,"(5.0, 8.0]",Never-married,Machine-op-inspct,Own-child,Black,Male,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,<50K,True
1,"(36.467, 41.333]",Private,HS-grad,"(8.0, 11.0]",Married-civ-spouse,Farming-fishing,Husband,White,Male,"(-inf, 0.0]","(-50.0, 0.0]","(40.2, 59.8]",United-States,<50K,True
2,"(26.733, 31.6]",Local-gov,Assoc-acdm,"(11.0, 14.0]",Married-civ-spouse,Protective-serv,Husband,White,Male,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,>50K,False
3,"(41.333, 46.2]",Private,Some-college,"(8.0, 11.0]",Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,"(5000.0, 10000.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,>50K,True
4,"(16.927, 21.867]",?,Some-college,"(8.0, 11.0]",Never-married,?,Own-child,White,Female,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,<50K,True


In [10]:
# Header vectors for the export file: one ':'-terminated token per column of
# causal_data other than `is_correct` (14 columns).
n_tokens = 14
feature_vector = 'a:' * n_tokens
structure_vector = '0:' * n_tokens
max_dims = '1:' * n_tokens

# split(':') yields one extra empty string after the trailing ':', which is
# what makes the token count line up with the column count that still
# includes `is_correct`.
assert len(feature_vector.split(':')) == len(causal_data.columns)

In [11]:
# Share of test rows the model got wrong (1 - accuracy).
n_rows = causal_data.shape[0]
n_correct = causal_data['is_correct'].sum()
error_rate = 1 - (n_correct / n_rows)

# Written into the export header as the cost field.
cost = 100.0
error_rate

0.14839383330262268

In [12]:
# Header record of the export file: ';'-separated fields with a trailing ';'.
# The first field is the feature vector and max_dims joined by a tab.
header_fields = [
    feature_vector + '\t' + max_dims,
    str(error_rate),
    str(cost),
    'false',
    feature_vector,
    structure_vector,
    str(causal_data.shape[0]),
    '0',
]
top_row = ';'.join(header_fields) + ';'
top_row

'a:a:a:a:a:a:a:a:a:a:a:a:a:a:\t1:1:1:1:1:1:1:1:1:1:1:1:1:1:;0.14839383330262268;100.0;false;a:a:a:a:a:a:a:a:a:a:a:a:a:a:;0:0:0:0:0:0:0:0:0:0:0:0:0:0:;16281;0;'

In [13]:
# Correctness flag of each row, kept apart from the feature values.
truth_vals = causal_data['is_correct']
# Every other column, rendered as strings for serialisation.
entries = causal_data.drop('is_correct', axis=1).astype(str)

In [14]:
# Wrap every value as 'a--///--<value>--//--' and concatenate across the
# columns, giving one string per row. The two dash/slash sequences are
# temporary stand-ins for the real '_' and ':' delimiters.
row_strings = np.sum('a--///--' + entries + '--//--', axis=1)

# Applied in order: strip blanks, escape characters that have a meaning in
# the export format, and only then turn the stand-ins into real delimiters
# (so delimiters inside the data are never confused with structural ones).
substitutions = [
    (' ', ''),
    (':', '/COLON/'),
    (';', '/SEMICOLON/'),
    ('=', '/EQ/'),
    ('%', '/PERCENT/'),
    ('_', '/UNDERSCORE/'),
    ('--//--', ':'),
    ('--///--', '_'),
]
for old_text, new_text in substitutions:
    row_strings = row_strings.str.replace(old_text, new_text)

causal_data['input-str'] = row_strings

In [15]:
# Write the export file: the header record followed by one
# '<row number>%<is_correct>%<serialised row>=' record per row, with no
# newlines in between.
with open('./data/census-predicted-input.txt', 'w') as f:
    f.write(top_row)
    # Stream the records straight from the two columns instead of building a
    # throwaway list of write() results and doing two .iloc lookups per row.
    f.writelines(
        '{}%{}%{}='.format(row_number, is_correct, input_str)
        for row_number, (is_correct, input_str)
        in enumerate(zip(truth_vals, causal_data['input-str']))
    )

In [16]:
# Fraction of rows labelled '<50K' within the Black group and within the
# White group, displayed as a (Black, White) tuple.
race_groups = [causal_data[causal_data['race'] == race] for race in ('Black', 'White')]
tuple(group[group['income'] == '<50K'].shape[0] / group.shape[0] for group in race_groups)

(0.8853299167200512, 0.7497490319804962)

### To tell if the model is biased, we generate random "test" data with race == Black and race == White. Next, we can use causal analysis to check if race == Black causes income <50K to be more likely.
Note that we cannot simply check whether the model makes more errors on tuples with race == Black: if the data itself is biased, the model can be very accurate on those tuples and still be biased.

In [17]:
# For every feature column (label excluded), collect the distinct values seen
# in the training data; the random rows generated later are drawn from these.
feature_columns = train.drop('income', axis=1).columns
column_ranges = [train[col].unique() for col in feature_columns]

# Restrict race to the two groups being compared. The position is looked up by
# name rather than hard-coded so a change in column order cannot silently
# override a different feature.
column_ranges[feature_columns.get_loc('race')] = ['Black', 'White']

In [18]:
# Empty frame of 10000 rows with the same columns as `test` minus the label.
random_test = pd.DataFrame(index=range(10000), columns=test.drop('income', axis=1).columns)
# Fill each row with one value per column, each picked by random.choice from
# the entry of column_ranges at the same position as the column. The draws
# come from the stdlib `random` module's global state, so the result depends
# on the seed and on the order in which apply() visits the rows.
random_test = random_test.apply(lambda row: pd.Series([random.choice(l) for l in column_ranges], index=row.index), axis=1)
random_test.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,54,Local-gov,9th,14,Separated,Protective-serv,Wife,Black,Female,7298,2179,91,Taiwan
1,72,Never-worked,Masters,3,Married-spouse-absent,Sales,Unmarried,Black,Female,3781,1735,10,India
2,85,State-gov,11th,6,Married-AF-spouse,Armed-Forces,Other-relative,Black,Female,7443,2174,65,Laos
3,31,Federal-gov,Assoc-voc,11,Widowed,Handlers-cleaners,Unmarried,White,Male,4064,419,84,Peru
4,40,Local-gov,Doctorate,14,Separated,Sales,Unmarried,Black,Female,3781,1980,87,Dominican-Republic


In [19]:
# Encode the synthetic rows with the encoder that was fitted on the training
# data, so the resulting columns match what the model was trained on.
# `axis` is passed by keyword (the positional form is rejected by newer
# pandas), and the encoded frame reuses random_test's index so the
# column-wise concat lines rows up explicitly.
random_one_hot = pd.DataFrame(encoder.transform(random_test[categorical_features]),
                              index=random_test.index)
random_X = pd.concat([random_test.drop(categorical_features, axis=1), random_one_hot], axis=1)
random_X.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,0,1,2,3,4,...,92,93,94,95,96,97,98,99,100,101
0,54,14,7298,2179,91,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,72,3,3781,1735,10,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,85,6,7443,2174,65,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31,11,4064,419,84,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40,14,3781,1980,87,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Attach the model's prediction to every synthetic row.
random_test['prediction'] = m.predict(random_X)

# Share of each predicted label within each race group.
prediction_by_race = random_test.groupby('race')['prediction']
prediction_by_race.value_counts(normalize=True)

race   prediction
Black  >50K          0.544030
       <=50K         0.455970
White  >50K          0.576442
       <=50K         0.423558
Name: prediction, dtype: float64

In [21]:
# Persist the synthetic rows together with their predictions.
random_predictions_path = 'data/census-random-prediction.csv'
random_test.to_csv(random_predictions_path)