# Preparing the Data

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation as cv
from sklearn import preprocessing
from sklearn import metrics
from sklearn import dummy
from sklearn import linear_model
from sklearn import ensemble
from sklearn import neighbors
from sklearn import svm
from sklearn import tree
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib

Using matplotlib backend: Qt4Agg


In [26]:
# Load the adult data set and reorder its columns in one step: the label
# first, then the continuous variables, then the categorical variables.
data = pd.read_csv('adult.data.csv').loc[:, [
    'income',
    'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week',
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]]
data.head(3)

Unnamed: 0,income,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,<=50K,39,77516,13,2174,0,40,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,<=50K,50,83311,13,0,0,13,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,<=50K,38,215646,9,0,0,40,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States


In [27]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
income            32561 non-null object
age               32561 non-null int64
fnlwgt            32561 non-null int64
education-num     32561 non-null int64
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
workclass         32561 non-null object
education         32561 non-null object
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
native-country    32561 non-null object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [28]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.366512,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105549.977697,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [30]:
# List the distinct label values before they are encoded as integers.
data['income'].unique()

array(['<=50K', '>50K'], dtype=object)

In [31]:
# Encode the label as 1 for '>50K' and 0 for everything else.
# The dtype guard keeps this cell safe to re-run: once the column is already
# numeric, comparing it against the string '>50K' again would silently turn
# every label into 0.
if data['income'].dtype.kind not in 'biuf':
    data['income'] = (data['income'] == '>50K').astype('int64')

In [8]:
# Pairwise relationship plots of the data, coloured by the income label.
sns.pairplot(data, hue='income')

<seaborn.axisgrid.PairGrid at 0x96226a0>

In [9]:
# Draw one count plot per object-dtype (categorical) column, each on its own
# new figure, with the bars split by income label.
for col in data.select_dtypes(include=['object']).columns:
    sns.countplot(y=col, hue='income', data=data, ax=plt.figure().gca())

In [32]:
# One-hot encode every object-dtype column, then drop the label so that
# `features` holds only predictors; `target` is the income label column.
features = (
    pd.get_dummies(data, columns=data.select_dtypes(include=['object']).columns)
    .drop('income', axis=1)
)
target = data.loc[:, 'income']

In [33]:
# Inspect the column names produced by the one-hot encoding.
features.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'workclass_?', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked', 
       ...
       'native-country_Portugal', 'native-country_Puerto-Rico',
       'native-country_Scotland', 'native-country_South',
       'native-country_Taiwan', 'native-country_Thailand',
       'native-country_Trinadad&Tobago', 'native-country_United-States',
       'native-country_Vietnam', 'native-country_Yugoslavia'],
      dtype='object', length=108)

In [35]:
# Convert to NumPy arrays. The feature matrix is cast to float explicitly:
# if the indicator columns make `features.values` an integer (or object)
# array, the standardized values written back into X[:, :6] later would be
# silently truncated to whole numbers.
X, y = features.values.astype(float), target.values

## Transforming the data
The first 6 columns hold numerical features, so we'll standardize them (zero mean, unit variance) so our algorithms can perform better. 

We don't want to scale the indicator variables generated from our categorical features because many of them are sparse.

In [59]:
# Standardize only the continuous features (the first six columns).
# The indicator variables in the remaining columns are left unscaled.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out rows of each fold contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(X[:, :6])
X[:, :6] = scaler.transform(X[:, :6])

# Spot checking algorithms + setting up the Test Harness

We are going to use the training data given to us from the UCI Machine Learning repository for testing our algorithms with 10-fold cross validation. After checking the data, I found that this is an imbalanced classification problem, where the ratio of negative to positive labels is about 3:1. To handle this, instead of using classification accuracy as our metric, we'll be using the F1 score.

Also to systematically evaluate algorithms, we'll be spot checking the following algorithms:


In [83]:
# Fixed seed so the stochastic learners produce the same scores on every run.
SEED = 0

# Hyper-parameter grids shared by the candidate models below.
C_VALUES = (0.01, 0.1, 1.0)
N_ESTIMATORS = (100, 200, 500)

# Candidate models for spot checking. The order of this list defines the row
# order of the cross-validation results, so it matches the original listing:
# dummy, logistic regression (l1 then l2), passive-aggressive, perceptron,
# AdaBoost, bagging, extra trees, gradient boosting, random forest, k-NN,
# decision trees, SVMs.
classifiers = (
    # Baseline: random guesses that follow the class distribution. The strategy
    # is spelled out so the baseline does not depend on the library default.
    [dummy.DummyClassifier(strategy='stratified', random_state=SEED)]
    # liblinear is named explicitly because it supports both l1 and l2 penalties.
    + [linear_model.LogisticRegression(penalty=penalty, C=C, solver='liblinear',
                                       random_state=SEED)
       for penalty in ('l1', 'l2') for C in C_VALUES]
    + [linear_model.PassiveAggressiveClassifier(C=C, random_state=SEED)
       for C in C_VALUES]
    # penalty=None is the Perceptron default, i.e. the unregularized variant.
    + [linear_model.Perceptron(penalty=penalty, random_state=SEED)
       for penalty in (None, 'l1', 'l2')]
    + [model(n_estimators=n, random_state=SEED)
       for model in (ensemble.AdaBoostClassifier,
                     ensemble.BaggingClassifier,
                     ensemble.ExtraTreesClassifier,
                     ensemble.GradientBoostingClassifier,
                     ensemble.RandomForestClassifier)
       for n in N_ESTIMATORS]
    + [neighbors.KNeighborsClassifier(n_neighbors=k) for k in (1, 5, 7)]
    + [tree.DecisionTreeClassifier(criterion=criterion, random_state=SEED)
       for criterion in ('gini', 'entropy')]
    + [svm.SVC(kernel=kernel) for kernel in ('linear', 'poly', 'rbf')]
)

In [84]:
# Score every candidate with 10-fold cross-validation on the F1 metric.
# Each row of `results` holds the per-fold scores of one classifier, in the
# same order as `classifiers`.
fold_scores = []
for clf in classifiers:
    fold_scores.append(
        cv.cross_val_score(clf, X, y, scoring='f1', cv=10, n_jobs=-1)
    )
results = np.array(fold_scores)

In [85]:
# Mean and standard deviation of the F1 score across folds, one entry per
# classifier (same order as `classifiers`).
print(results.mean(axis=1))
print(results.std(axis=1))

[ 0.24428439  0.63355121  0.65710311  0.6615626   0.6453196   0.66009681
  0.6613801   0.6458606   0.5791316   0.57065824  0.62108482  0.60954448
  0.58205765  0.69007711  0.69669124  0.70495999  0.67682828  0.67654866
  0.67902136  0.6329108   0.6335421   0.63631023  0.68461011  0.6985837
  0.71184957  0.67304583  0.67206979  0.6734455   0.58631727  0.63241482
  0.63828106  0.62591244  0.62784616  0.65343911  0.21424922  0.65120681]
[ 0.01167388  0.01217495  0.01055939  0.01012239  0.01043794  0.00937203
  0.0096656   0.01972318  0.07168251  0.09601491  0.02389168  0.04859473
  0.03232464  0.01366066  0.01376298  0.01349839  0.01228201  0.0108256
  0.01332323  0.01154303  0.01218588  0.01440477  0.01135943  0.01050196
  0.00998299  0.0135855   0.01301966  0.01370305  0.01130654  0.01231269
  0.01289095  0.01357313  0.01154644  0.01112284  0.01564406  0.01450701]
