# Census Data: Income Prediction
Data Source: https://archive.ics.uci.edu/ml/datasets/Census+Income

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import metrics

In [2]:
# Column names, in file order. Both files are read with header=None, so the
# names are attached by hand after loading.
cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

train = pd.read_csv('data/adult.data', header=None)
# The first line of adult.test is skipped before parsing.
test = pd.read_csv('data/adult.test', header=None, skiprows=1)
train.columns = cols
test.columns = cols

# Drop the last character of every test label.
# NOTE(review): presumably a trailing '.' that only adult.test carries — confirm against the raw file.
test['income'] = test['income'].astype(str).str[:-1]

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# One-hot encode the train and test features together so both halves share the
# same dummy columns, then split them back apart by row position.
feature_frames = [frame.drop('income', axis=1) for frame in (train, test)]
X = pd.get_dummies(pd.concat(feature_frames))
train_X, test_X = X.iloc[:len(train)], X.iloc[len(train):]

In [4]:
# Fit a logistic regression with default hyper-parameters on the encoded
# training features. `fit` returns the estimator itself, so ending the cell on
# `m` displays the same fitted-model repr.
m = LogisticRegression().fit(train_X, train['income'])
m



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [5]:
# Predict income labels for the held-out rows and report accuracy plus the
# confusion matrix against the true test labels.
prediction = m.predict(test_X)
actual_income = test['income']

print('Accuracy: ', metrics.accuracy_score(actual_income, prediction))
print('Confusion:\n', metrics.confusion_matrix(actual_income, prediction))

Accuracy:  0.7995823352373933
Confusion:
 [[12020   415]
 [ 2848   998]]


In [6]:
# Work on a copy of the test rows, tagged with whether the model's prediction
# matched the true label for each row.
causal_data = test.assign(is_correct=test['income'] == prediction)

In [7]:
# Preview the first five rows, including the new is_correct flag.
causal_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,is_correct
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,True
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,True
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,False
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,True
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,True


In [8]:
# Discretise the numeric columns so every feature becomes categorical:
#   fnlwgt          -> 10 quantile bins
#   education-num   -> fixed edges
#   capital-gain    -> fixed edges
#   capital-loss    -> fixed edges
#   hours-per-week  -> 5 equal-width bins
#   age             -> 15 equal-width bins
#
# The raw values are read from `test` (causal_data was built as a row-for-row
# copy of it) instead of from causal_data itself. Binning a column from its own
# already-binned values fails, so reading from `test` keeps this cell safe to
# re-run; on a fresh run the result is unchanged.
causal_data['fnlwgt'] = pd.qcut(test['fnlwgt'], 10)
causal_data['education-num'] = pd.cut(test['education-num'], [-np.inf, 2, 5, 8, 11, 14, np.inf])
causal_data['capital-gain'] = pd.cut(test['capital-gain'], [-np.inf, 0, 500, 1000, 5000, 10000, 25000, np.inf])
causal_data['capital-loss'] = pd.cut(test['capital-loss'], [-np.inf, -50, 0, 100, 1000, np.inf])
causal_data['hours-per-week'] = pd.cut(test['hours-per-week'], 5)
causal_data['age'] = pd.cut(test['age'], 15)

In [9]:
# Remove '=' from the income labels (e.g. '<=50K' becomes '<50K'), then preview
# the binned frame.
income_labels = causal_data['income'].astype(str)
causal_data['income'] = income_labels.str.replace('=', '')
causal_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,is_correct
0,"(21.867, 26.733]",Private,"(221166.0, 260761.0]",11th,"(5.0, 8.0]",Never-married,Machine-op-inspct,Own-child,Black,Male,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,<50K,True
1,"(36.467, 41.333]",Private,"(66095.0, 105381.0]",HS-grad,"(8.0, 11.0]",Married-civ-spouse,Farming-fishing,Husband,White,Male,"(-inf, 0.0]","(-50.0, 0.0]","(40.2, 59.8]",United-States,<50K,True
2,"(26.733, 31.6]",Local-gov,"(327203.0, 1490400.0]",Assoc-acdm,"(11.0, 14.0]",Married-civ-spouse,Protective-serv,Husband,White,Male,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,>50K,False
3,"(41.333, 46.2]",Private,"(156916.0, 177831.0]",Some-college,"(8.0, 11.0]",Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,"(5000.0, 10000.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,>50K,True
4,"(16.927, 21.867]",?,"(66095.0, 105381.0]",Some-college,"(8.0, 11.0]",Never-married,?,Own-child,White,Female,"(-inf, 0.0]","(-50.0, 0.0]","(20.6, 40.2]",United-States,<50K,True


In [10]:
# Header vectors for the exported file: one ':'-terminated token per feature
# column ('a' / '0' / '1' respectively).
# NOTE(review): the meaning of the 'a', '0' and '1' tokens is defined by the
# downstream consumer of the file and is not visible here.
#
# The token count is derived from the data rather than typed out by hand.
# 'is_correct' is the target flag and 'input-str' is the encoded record that a
# later cell adds to causal_data; neither is a feature.
feature_columns = [c for c in causal_data.columns if c not in ('is_correct', 'input-str')]
n_features = len(feature_columns)

feature_vector = 'a:' * n_features
structure_vector = '0:' * n_features
max_dims = '1:' * n_features

# Sanity check: the features being exported are exactly the raw dataset columns.
assert n_features == len(cols), 'causal_data feature columns no longer match the raw column list'

In [11]:
# Share of test rows the model got wrong (1 - accuracy).
n_rows = causal_data.shape[0]
n_correct = causal_data['is_correct'].sum()
error_rate = 1 - (n_correct / n_rows)

# Written verbatim into the header row of the exported file.
# NOTE(review): what this cost value means to the consumer is not visible here.
cost = 100.0

error_rate

0.2004176647626067

In [12]:
# Header line of the exported file: the feature vector, a tab, then a
# ';'-terminated list of fields.
header_fields = [
    max_dims,
    str(error_rate),
    str(cost),
    'false',
    feature_vector,
    structure_vector,
    str(causal_data.shape[0]),  # number of records that follow
    '0',
]
top_row = feature_vector + '\t' + ';'.join(header_fields) + ';'
top_row

'a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:\t1:1:1:1:1:1:1:1:1:1:1:1:1:1:1:;0.2004176647626067;100.0;false;a:a:a:a:a:a:a:a:a:a:a:a:a:a:a:;0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:;16281;0;'

In [13]:
# Per-row correctness flag, kept separate from the feature values.
truth_vals = causal_data['is_correct']

# Feature values as strings, ready to be concatenated into one record per row.
# 'input-str' is the encoded record a later cell stores back on causal_data; it
# is dropped when present so re-running this cell never feeds an already-encoded
# record back in as a feature. On a fresh run the column does not exist yet and
# errors='ignore' makes the drop a no-op for it.
entries = causal_data.drop(columns=['is_correct', 'input-str'], errors='ignore').astype(str)

In [14]:
# Build one record string per row. Each value is first wrapped as
# 'a--///--<value>--//--' and the wrapped values of a row are concatenated.
# Reserved characters are then escaped, and only afterwards are the two
# placeholder markers swapped for the real delimiters ('_' and ':'). The order
# matters: escaping must run before the real delimiters are introduced.
encoded = np.sum('a--///--' + entries + '--//--', axis=1)

for old, new in [
    (':', '/COLON/'),
    (';', '/SEMICOLON/'),
    ('=', '/EQ/'),
    ('%', '/PERCENT/'),
    ('_', '/UNDERSCORE/'),
    ('--//--', ':'),
    ('--///--', '_'),
]:
    encoded = encoded.str.replace(old, new)

causal_data['input-str'] = encoded

In [15]:
# Write the export file: the header row, then one '<row number>%<is_correct>%<record>='
# entry per test row, all on a single line.
with open('./data/census-predicted-input.txt', 'w') as f:
    f.write(top_row)
    # Stream the records straight to the file. The previous form wrapped the
    # writes in list(...), building a throwaway list of write() return values,
    # and did two positional .iloc lookups per row.
    f.writelines(
        '{}%{}%{}='.format(row_number, is_correct, record)
        for row_number, (is_correct, record) in enumerate(zip(truth_vals, causal_data['input-str']))
    )