In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import LabelEncoder
sns.set()


def read_clean_data():
    """Build the cleaned train/test CSVs from the raw census files.

    Reads ``./data/adult.data`` and ``./data/adult.test`` (no header row),
    names the 15 columns, drops every row containing the ``' ?'``
    missing-value marker, and removes the trailing ``.`` from the income
    labels of the test split so both splits use the same two labels.

    The cleaned frames are written, row index included, to
    ``./data/cleaned_train.csv`` and ``./data/cleaned_test.csv``.

    Returns:
        None. The function is used for its file side effects only.
    """
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
               'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
               'income']
    # the marker carries a leading space, exactly as it is stored in the raw files
    missing_marker = {' ?': np.nan}
    # nested form: only the 'income' column is searched for the dotted labels
    test_label_fix = {'income': {' <=50K.': ' <=50K', ' >50K.': ' >50K'}}

    # NOTE(review): assumes both raw files contain nothing but 15-field data rows;
    # if the local adult.test starts with a banner/comment line it has to be
    # skipped (skiprows=1) -- confirm against the file on disk.
    train = pd.read_csv('./data/adult.data', index_col=None, header=None)
    test = pd.read_csv('./data/adult.test', index_col=None, header=None)
    train.columns = columns
    test.columns = columns

    # drop rows with nan values (chained, so the raw frames are never mutated in place)
    train = train.replace(missing_marker).dropna()
    test = test.replace(test_label_fix).replace(missing_marker).dropna()

    # save cleaned versions; the index is kept so surviving rows stay traceable
    train.to_csv('./data/cleaned_train.csv')
    test.to_csv('./data/cleaned_test.csv')


# Regenerate the cleaned CSVs when they are missing (e.g. on a fresh checkout),
# so this cell does not depend on read_clean_data() having been run by hand.
cleaned_paths = ('./data/cleaned_train.csv', './data/cleaned_test.csv')
if not all(os.path.exists(path) for path in cleaned_paths):
    read_clean_data()

# index_col=0 restores the row index that to_csv stored as the first column
train = pd.read_csv('./data/cleaned_train.csv', index_col=0)
test = pd.read_csv('./data/cleaned_test.csv', index_col=0)
print('Train', train.shape)
print('Test', test.shape)

Train (30162, 15)
Test (15060, 15)


In [2]:
# list the distinct values of every string-typed (categorical) column
categorical_names = [name for name in train.columns if train[name].dtype == 'object']
for name in categorical_names:
    distinct = train[name].unique()
    print('%s: %d values' % (name, len(distinct)))
    print(distinct, '\n')

workclass: 7 values
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay'] 

education: 16 values
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th'] 

marital-status: 7 values
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation: 14 values
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-repair' ' Protective-serv'
 ' Armed-Forces' ' Priv-house-serv'] 

relationship: 6 values
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

race: 5 values
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

sex: 2 values
[' Male' ' Female'] 

na

In [None]:
# `sparse=` / `sparse_output=` is deliberately not passed: the keyword was renamed
# between scikit-learn releases, so the encoder keeps its default (sparse) output
# and the result is densified explicitly with .toarray() below.
enc = OneHotEncoder(handle_unknown='ignore')
lenc = LabelEncoder()

# integer-encode the income labels; the test labels reuse the mapping fitted on train
y_train = pd.DataFrame(lenc.fit_transform(train['income']))
y_test = pd.DataFrame(lenc.transform(test['income']))
y_train.index = train.index
y_test.index = test.index
X_train = train.drop(['income'], axis=1)
X_test = test.drop(['income'], axis=1)

numeric_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
obj_cols = [cname for cname in X_train.columns if X_train[cname].dtype == 'object']

# the encoder is fitted on train only; handle_unknown='ignore' encodes categories
# that appear only in test as all-zero indicator rows
oh_train = enc.fit_transform(X_train[obj_cols]).toarray()
oh_test = enc.transform(X_test[obj_cols]).toarray()

# Name every indicator "<column>=<category>". String labels matter: mixing the
# default integer labels with the string names of the numeric columns is rejected
# by recent scikit-learn estimators when the frame is passed to fit().
oh_names = ['%s=%s' % (cname, cat)
            for cname, cats in zip(obj_cols, enc.categories_)
            for cat in cats]
oh_cols = pd.DataFrame(oh_train, index=X_train.index, columns=oh_names)
oh_cols_test = pd.DataFrame(oh_test, index=X_test.index, columns=oh_names)

# numeric columns first, one-hot indicators appended to the right
X_train = pd.concat([X_train.drop(obj_cols, axis=1), oh_cols], axis=1)
X_test = pd.concat([X_test.drop(obj_cols, axis=1), oh_cols_test], axis=1)

In [None]:
# Baseline: cross-validated logistic regression on the one-hot encoded features.
# np.ravel flattens the single-column label frames into the 1-D arrays that
# scikit-learn estimators expect (a column vector raises a DataConversionWarning).
clf = LogisticRegressionCV(cv=5, random_state=0, Cs=10).fit(X_train, np.ravel(y_train))
print('Training score:', clf.score(X_train, np.ravel(y_train)))
print('Test score:', clf.score(X_test, np.ravel(y_test)))

In [None]:
# Take the class labels from the data instead of hard-coding them: value_counts()
# orders by frequency, so fixed labels (or a test split with a different order)
# could silently end up attached to the wrong bars.
train_counts = train['income'].value_counts()
# align the test counts to the train label order; a label absent from test counts as 0
test_counts = test['income'].value_counts().reindex(train_counts.index, fill_value=0)
df = pd.DataFrame({
    # the stored labels carry a leading space; strip it for display only
    'Income': [str(label).strip() for label in train_counts.index],
    'Training': list(train_counts),
    'Test': list(test_counts)
})
fig, ax = plt.subplots(1)
df.plot(x='Income', y=['Training', 'Test'], kind='bar', ax=ax)
ax.set_ylabel('Number of rows')
plt.xticks(rotation=0)
# savefig does not create missing directories
os.makedirs('./output', exist_ok=True)
fig.savefig('./output/class-dist.png')

In [3]:
def row_to_dict(X, y=None):
    """Turn every row of the frame ``X`` into a ``{column: value}`` dict.

    Args:
        X: frame whose rows are converted.
        y: accepted but never used.

    Returns:
        The result of applying ``dict`` row-wise to ``X`` (one dict per row).
    """
    def as_mapping(row):
        return dict(row)

    return X.apply(as_mapping, axis=1)


# define prediction model
ft = FunctionTransformer(row_to_dict, validate=False)
dv = DictVectorizer()
# fixed seed so the scores and the importances are reproducible between runs
rf = RandomForestClassifier(random_state=0)

# glue steps together: rows -> dicts -> vectorised features -> random forest
model = make_pipeline(ft, dv, rf)
# build the feature frames once instead of re-dropping 'income' for every call
train_features = train.drop(['income'], axis=1)
test_features = test.drop(['income'], axis=1)
y = train['income']
model.fit(train_features, y)
print('Training score:', model.score(train_features, y))
print('Test score:', model.score(test_features, test['income']))

# get feature importances; `dv` and `rf` are the very objects used as pipeline
# steps, so after model.fit() they expose the fitted attributes read here
feature_importances = list(zip(dv.feature_names_, rf.feature_importances_))

Training score: 0.9999336913997745
Test score: 0.8461487383798141


In [None]:
# aggregate categorical variables' importance
# A feature named "<column>=<value>" is credited to <column>; a name without
# '=' is kept whole (partition returns the full string in that case).
res = dict()
for name, importance in feature_importances:
    column = name.partition('=')[0]
    res[column] = res.get(column, 0) + importance

# plot feature importance, one bar per original column
feature_groups = list(res)
fig, ax = plt.subplots(1)
ax.bar(feature_groups, [res[group] for group in feature_groups])
ax.set_ylabel('Feature importance')
plt.xticks(rotation=90)
# make room for the vertical tick labels before saving
fig.tight_layout()
# savefig does not create missing directories
os.makedirs('./output', exist_ok=True)
fig.savefig('./output/weights.png')

