 ## Adult dataset
 Does someone earn over 50k?
 - http://archive.ics.uci.edu/ml/datasets/Adult

In [127]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [128]:
data_filename = "data/adult_data.csv"
cols=["Age", "Work-Class", "fnlwgt","Occupation","Capital-loss","Education", "Education-Num", "Marital-Status",
       "Relationship", "Race", "Sex", "Capital-gain","Hours-per-week", "Native-Country", "Earnings-Raw"]
df = pd.read_csv(data_filename,header=None,names=cols)

In [129]:
df.head()

Unnamed: 0,Age,Work-Class,fnlwgt,Occupation,Capital-loss,Education,Education-Num,Marital-Status,Relationship,Race,Sex,Capital-gain,Hours-per-week,Native-Country,Earnings-Raw
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [130]:
df.tail()

Unnamed: 0,Age,Work-Class,fnlwgt,Occupation,Capital-loss,Education,Education-Num,Marital-Status,Relationship,Race,Sex,Capital-gain,Hours-per-week,Native-Country,Earnings-Raw
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [131]:
#drop empty data rows
df.dropna(how='all',inplace=True)

In [132]:
df.columns

Index(['Age', 'Work-Class', 'fnlwgt', 'Occupation', 'Capital-loss',
       'Education', 'Education-Num', 'Marital-Status', 'Relationship', 'Race',
       'Sex', 'Capital-gain', 'Hours-per-week', 'Native-Country',
       'Earnings-Raw'],
      dtype='object')

In [133]:
#look at some continuous ordinal features
df["Hours-per-week"].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: Hours-per-week, dtype: float64

In [134]:
#look at categorical
df["Work-Class"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)

In [135]:
#plt.figure(figsize=(12, 9))
#sns.swarmplot(x="Education-Num", y="Hours-per-week", hue="Earnings-Raw", data=df[::50], size=12)

In [136]:
#create column for long hours
df["LongHours"] = df["Hours-per-week"] > 40

In [137]:
df.head()

Unnamed: 0,Age,Work-Class,fnlwgt,Occupation,Capital-loss,Education,Education-Num,Marital-Status,Relationship,Race,Sex,Capital-gain,Hours-per-week,Native-Country,Earnings-Raw,LongHours
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,False
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,False
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,False
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,False


In [138]:
#Variance Testing
import numpy as np
X = np.arange(30).reshape((10,3))

In [139]:
#synthetic dataset with 10 samples and 3 features
X
X.shape

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19, 20],
       [21, 22, 23],
       [24, 25, 26],
       [27, 28, 29]])

(10, 3)

In [140]:
#set second column to 1
X[:,1] = 1
X

array([[ 0,  1,  2],
       [ 3,  1,  5],
       [ 6,  1,  8],
       [ 9,  1, 11],
       [12,  1, 14],
       [15,  1, 17],
       [18,  1, 20],
       [21,  1, 23],
       [24,  1, 26],
       [27,  1, 29]])

In [142]:
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold()
Xt = vt.fit_transform(X)
#second column removed as under deafult variance threshold
Xt

array([[ 0,  2],
       [ 3,  5],
       [ 6,  8],
       [ 9, 11],
       [12, 14],
       [15, 17],
       [18, 20],
       [21, 23],
       [24, 26],
       [27, 29]])

In [143]:
vt.variances_

array([ 74.25,   0.  ,  74.25])

In [184]:
#feature extraction, need ed values as int for feature extraction
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder();
encoding.fit(df["Education-Num"].values);
ed_values = encoding.transform(df["Education-Num"].values);
ed_values = ed_values.reshape(ed_values.shape[0],1);

In [185]:
X = df[["Age","Capital-gain", "Capital-loss", "Hours-per-week"]].values
y = (df["Earnings-Raw"] == ' >50K').values
X.shape
ed_values.shape

(32561, 4)

(32561, 1)

In [186]:
#conat the horizontally
X = np.hstack([X,ed_values])

In [190]:
#try univarate feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
transformer = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = transformer.fit_transform(X, y)
cols = ["Age","Capital-gain", "Capital-loss", "Hours-per-week","Education-Num"]
#print(transformer.scores_)
print(list(zip(cols,transformer.scores_)))

[('Age', 8600.6118215555798), ('Capital-gain', 1372145.890201465), ('Capital-loss', 2401.4217771976464), ('Hours-per-week', 6476.4089959321245), ('Education-Num', 504.55885380328937)]


In [197]:
#try tree selection
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier();
ET = model.fit(X,y);
cols = ["Age","Capital-gain", "Capital-loss", "Hours-per-week","Education-Num"];
#print(transformer.scores_)
print(list(zip(cols,ET.feature_importances_)))


[('Age', 0.36857901506092289), ('Capital-gain', 0.064470890211430751), ('Capital-loss', 0.20062096252496769), ('Hours-per-week', 0.22672647864850043), ('Education-Num', 0.13960265355417817)]


In [199]:
Xt_chi2

array([[39,  0, 40],
       [50,  0, 13],
       [38,  0, 40],
       ..., 
       [58,  0, 40],
       [22,  0, 20],
       [52,  0, 40]])

In [201]:
ET

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [206]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
clf = DecisionTreeClassifier(random_state=7)
scores_chi2 = cross_val_score(clf,Xt_chi2,y,scoring='accuracy')
"Chi score results using top three features: {:.3f}".format(scores_chi2.mean())

'Chi score results using top three features: 0.774'