In [1]:
import numpy as np
import pandas as pd

In [2]:
# Download the UCI Adult train and test splits. Both are read with header=None,
# so the columns initially carry pandas' default integer names.
train_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
)
# skiprows=1 discards the first line of adult.test before parsing.
test_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    header=None,
    skiprows=1,
)

In [3]:
# Column names applied to both splits, in file order; the last one is the label.
col_labels = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'wage_class',
]
train_set.columns = col_labels
test_set.columns = col_labels

In [4]:
# Preview the first test rows. The wage_class labels shown here end with a '.'
# (e.g. '<=50K.'), unlike the labels displayed for the training rows.
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [5]:
# Summary statistics for the numeric columns of the training split.
train_set.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Flag training rows with 1 so the two splits can be separated again after
# they are stacked into a single frame.
train_set['train_ind'] = 1

In [7]:
# Flag test rows with 0 (the counterpart of train_ind == 1 on the training split).
test_set['train_ind'] = 0

In [8]:
# Stack the training and test rows so every preprocessing step below is applied
# identically to both splits.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported way to stack frames row-wise.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating each split's own row labels, so later index-aligned operations
# cannot pair a training row with a test row that happens to share its label.
combined_data = pd.concat([train_set, test_set], ignore_index=True)

In [45]:
# Sanity check: rows from both splits, 15 labelled columns plus train_ind.
combined_data.shape

(48842, 16)

In [10]:
# List the object-dtype (string) columns that will need encoding.
# select_dtypes yields the same column Index as describe(include=['O']).columns
# without computing count/unique/top/freq for every column just to discard them.
combined_data.select_dtypes(include=['object']).columns

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'wage_class'],
      dtype='object')

In [46]:
# Treat the ' ?' placeholder as missing. The literal includes a leading space
# because the string values in this frame carry one (see the unique() outputs).
# replace() returns a new frame, so combined_data itself is left unchanged.
df1 = combined_data.replace(' ?', np.nan)

In [47]:
# Missing-value count per column after converting the ' ?' marker to NaN.
df1.isna().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     857
wage_class           0
train_ind            0
dtype: int64

In [48]:
# Give missing entries their own ' unknown' category (leading space kept to
# match the formatting of the other string values).
df1 = df1.fillna(' unknown')

In [49]:
# Confirm that no missing values remain after the fill.
df1.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
train_ind         0
dtype: int64

In [50]:
# Inspect the raw label spellings; variants with and without a trailing '.'
# both occur, so both must be handled when building the target.
df1['wage_class'].unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [51]:
# Binary label column, defaulting to 0; the following cells set it to 1 for the
# ' >50K' and ' >50K.' spellings of wage_class.
df1['target_variable'] = 0

In [52]:
# Positive class for the label spelling without a trailing '.'.
df1.loc[df1['wage_class'].eq(' >50K'), 'target_variable'] = 1

In [53]:
# Positive class for the label spelling that ends with a '.'.
df1.loc[df1['wage_class'].eq(' >50K.'), 'target_variable'] = 1

In [54]:
# Class balance of the binary target across both splits.
df1['target_variable'].value_counts()

0    37155
1    11687
Name: target_variable, dtype: int64

In [55]:
# Shape check after adding target_variable (one column more than combined_data).
df1.shape

(48842, 17)

In [21]:
# Preview the frame with the train_ind and target_variable columns appended.
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class,train_ind,target_variable
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,1,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,1,0


In [56]:
# Distinct relationship categories; each becomes one indicator column below.
df1['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [57]:
# One indicator column per relationship category, named 'relationship_<value>'.
dummies = pd.get_dummies(df1['relationship'], prefix='relationship')

In [59]:
# Append the relationship indicator columns to the right of the existing ones.
df1 = pd.concat(
    [df1, dummies],
    axis=1,
)

In [60]:
# Shape check: the previous column count plus the relationship indicators.
df1.shape

(48842, 23)

In [58]:
# One row per record and one column per distinct relationship value.
dummies.shape

(48842, 6)

In [66]:
# The raw relationship column is redundant now that it has been one-hot encoded.
df1 = df1.drop(columns='relationship')

In [67]:
# Preview: relationship has been replaced by its relationship_* indicator columns.
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,race,sex,capital_gain,...,native_country,wage_class,train_ind,target_variable,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,White,Male,2174,...,United-States,<=50K,1,0,0,1,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,White,Male,0,...,United-States,<=50K,1,0,1,0,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,White,Male,0,...,United-States,<=50K,1,0,0,1,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Black,Male,0,...,United-States,<=50K,1,0,1,0,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Black,Female,0,...,Cuba,<=50K,1,0,0,0,0,0,0,1


In [68]:
# Shape check after dropping the raw relationship column (one column fewer).
df1.shape

(48842, 22)

In [69]:
# List the object-dtype columns that still need encoding.
# select_dtypes yields the same column Index as describe(include=['O']).columns
# without computing summary statistics that are immediately thrown away.
df1.select_dtypes(include=['object']).columns

Index(['workclass', 'education', 'marital_status', 'occupation', 'race', 'sex',
       'native_country', 'wage_class'],
      dtype='object')

In [70]:
# Indicator columns for workclass, named 'workclass_<value>'.
dummies_workclass = pd.get_dummies(df1['workclass'], prefix='workclass')

In [71]:
# Indicator columns for education, named 'education_<value>'.
dummies_education = pd.get_dummies(df1['education'], prefix='education')

In [72]:
# Indicator columns for marital_status, named 'marital_status_<value>'.
dummies_marital_status = pd.get_dummies(
    df1['marital_status'],
    prefix='marital_status',
)

In [73]:
# Indicator columns for occupation, named 'occupation_<value>'.
dummies_occupation = pd.get_dummies(df1['occupation'], prefix='occupation')

In [74]:
# Indicator columns for race, named 'race_<value>'.
dummies_race = pd.get_dummies(df1['race'], prefix='race')

In [75]:
# Indicator columns for sex, named 'sex_<value>'.
dummies_sex = pd.get_dummies(df1['sex'], prefix='sex')

In [76]:
# Append every remaining indicator block to the right of the existing columns,
# in the same order the blocks were built.
df1 = pd.concat(
    [
        df1,
        dummies_workclass,
        dummies_education,
        dummies_marital_status,
        dummies_occupation,
        dummies_race,
        dummies_sex,
    ],
    axis=1,
)

In [77]:
# Remove the raw categorical columns that now have indicator equivalents, plus
# wage_class, which target_variable already encodes. native_country is kept
# because it is still needed to derive the `country` flag.
df1 = df1.drop(
    columns=[
        'workclass',
        'education',
        'marital_status',
        'occupation',
        'race',
        'sex',
        'wage_class',
    ]
)

In [80]:
# Shape check after one-hot encoding.
# NOTE(review): the recorded output (48842, 70) appears to already include the
# `country` column created in the next cell (execution counts 79 and 80 are out
# of order); a top-to-bottom run would likely show 69 columns here - confirm.
df1.shape

(48842, 70)

In [79]:
# Binary flag defaulting to 0; the next cell sets it to 1 for rows whose
# native_country is ' United-States'.
df1['country']= 0

In [81]:
# Collapse native_country to a single US / non-US indicator.
df1.loc[df1['native_country'].eq(' United-States'), 'country'] = 1

In [82]:
# Count of US (1) versus all other values (0), including the ' unknown' fill.
df1['country'].value_counts()

1    43832
0     5010
Name: country, dtype: int64

In [83]:
# The raw native_country strings are no longer needed once `country` exists.
df1 = df1.drop(columns='native_country')

In [85]:
# Preview the fully encoded frame.
df1.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,train_ind,target_variable,relationship_ Husband,relationship_ Not-in-family,...,occupation_ Transport-moving,occupation_ unknown,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male,country
0,39,77516,13,2174,0,40,1,0,0,1,...,0,0,0,0,0,0,1,0,1,1
1,50,83311,13,0,0,13,1,0,1,0,...,0,0,0,0,0,0,1,0,1,1
2,38,215646,9,0,0,40,1,0,0,1,...,0,0,0,0,0,0,1,0,1,1
3,53,234721,7,0,0,40,1,0,1,0,...,0,0,0,0,1,0,0,0,1,1
4,28,338409,13,0,0,40,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [86]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, pydotplus
from sklearn import tree, metrics, model_selection, preprocessing
from IPython.display import Image, display

In [87]:
# Recover the rows that came from the training file via the train_ind flag.
final_train_set = df1.loc[df1["train_ind"].eq(1)]

In [94]:
# Shape of the training rows.
# NOTE(review): the recorded (32561, 68) appears to have been produced after
# target_variable was popped further down (execution count 94 > 92); on a
# top-to-bottom run this would likely show 69 columns, like final_test_set.
final_train_set.shape

(32561, 68)

In [89]:
# Recover the rows that came from the test file via the train_ind flag.
final_test_set = df1.loc[df1["train_ind"].eq(0)]

In [90]:
# Shape of the test-file rows (target_variable is still among the columns).
final_test_set.shape

(16281, 69)

In [92]:
# select features
# select features
# pop() removes target_variable from final_train_set in place and returns it as
# the label Series, so re-running this cell raises KeyError once the column is gone.
y = final_train_set.pop('target_variable')

In [93]:
# One label per training row.
y.shape

(32561,)

In [95]:
# X is the same object as final_train_set (no copy is made); target_variable has
# already been popped out of it above.
# NOTE(review): the train_ind flag is never dropped, so it is still among the
# feature columns - confirm whether it is meant to be passed to the model.
X = final_train_set

In [96]:
# Hold out 30% of the training-file rows for validation; random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [97]:
# Fit a depth-3 decision tree that splits on the entropy criterion.
dtree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)
# Kept as the last expression so the fitted estimator is displayed by the cell.
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [98]:
# use the model to make predictions with the test data
# (X_test is the 30% hold-out carved from the training rows by train_test_split,
# not final_test_set)
y_pred = dtree.predict(X_test)

In [99]:
# Score the hold-out predictions: raw error count and overall accuracy.
accuracy = metrics.accuracy_score(y_test, y_pred)
count_misclassified = (y_test != y_pred).sum()

print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 1520
Accuracy: 0.84
