## Decision trees
Author - Girish Shirke, Date - 14/03/2020

Build a tree to predict the income of a given population, which is labelled as <= 50K and >50K. The attributes (predictors) are age, working class type, marital status, gender, race etc.

***This assignment also includes plotting the decision tree using graphziv***

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing

In [28]:
data = pd.read_csv('adult_dataset.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [30]:
data_que = data[data == '?']
data_que.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               0 non-null float64
workclass         1836 non-null object
fnlwgt            0 non-null float64
education         0 non-null object
education.num     0 non-null float64
marital.status    0 non-null object
occupation        1843 non-null object
relationship      0 non-null object
race              0 non-null object
sex               0 non-null object
capital.gain      0 non-null float64
capital.loss      0 non-null float64
hours.per.week    0 non-null float64
native.country    583 non-null object
income            0 non-null object
dtypes: float64(6), object(9)
memory usage: 3.7+ MB


  result = method(y)


In [31]:
data_cleaned = data[data['workclass'] != '?']
data_cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K
10,45,Private,172274,Doctorate,16,Divorced,Prof-specialty,Unmarried,Black,Female,0,3004,35,United-States,>50K
11,38,Self-emp-not-inc,164526,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,0,2824,45,United-States,>50K


In [32]:
data_cleaned.apply(lambda x : x == '?').sum()

age                 0
workclass           0
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation          7
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    556
income              0
dtype: int64

In [33]:
data_cleaned = data_cleaned[data_cleaned['native.country'] != '?' ]
data_cleaned = data_cleaned[data_cleaned['occupation'] != '?' ]
data_cleaned.apply(lambda x : x == '?').sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [34]:
final_data = data_cleaned

In [35]:
final_data_categorical = final_data.select_dtypes(include='object')

le = preprocessing.LabelEncoder()
final_data_categorical = final_data_categorical.apply(le.fit_transform)
final_data_categorical.head(10)

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,2,11,6,3,1,4,0,38,0
3,2,5,0,6,4,4,0,38,0
4,2,15,5,9,3,4,0,38,0
5,2,11,0,7,4,4,0,38,0
6,2,0,5,0,4,4,1,38,0
7,5,10,4,9,2,4,0,38,1
8,0,11,0,9,1,4,0,38,0
10,2,10,0,9,4,2,0,38,1
11,4,14,4,9,1,4,1,38,1
12,2,9,6,7,1,4,0,38,1


In [36]:
final_data = final_data.drop(final_data_categorical.columns, axis = 1)
final_data = pd.concat([final_data,final_data_categorical] , axis =1)

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,82,132870,9,0,4356,18,2,11,6,3,1,4,0,38,0
3,54,140359,4,0,3900,40,2,5,0,6,4,4,0,38,0
4,41,264663,10,0,3900,40,2,15,5,9,3,4,0,38,0
5,34,216864,9,0,3770,45,2,11,0,7,4,4,0,38,0
6,38,150601,6,0,3770,40,2,0,5,0,4,4,1,38,0
7,74,88638,16,0,3683,20,5,10,4,9,2,4,0,38,1
8,68,422013,9,0,3683,40,0,11,0,9,1,4,0,38,0
10,45,172274,16,0,3004,35,2,10,0,9,4,2,0,38,1
11,38,164526,15,0,2824,45,4,14,4,9,1,4,1,38,1
12,52,129177,13,0,2824,20,2,9,6,7,1,4,0,38,1


In [37]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 15 columns):
age               30162 non-null int64
fnlwgt            30162 non-null int64
education.num     30162 non-null int64
capital.gain      30162 non-null int64
capital.loss      30162 non-null int64
hours.per.week    30162 non-null int64
workclass         30162 non-null int64
education         30162 non-null int64
marital.status    30162 non-null int64
occupation        30162 non-null int64
relationship      30162 non-null int64
race              30162 non-null int64
sex               30162 non-null int64
native.country    30162 non-null int64
income            30162 non-null int64
dtypes: int64(15)
memory usage: 3.7 MB


In [39]:
final_data['income'] = final_data['income'].astype('category')

In [42]:
from sklearn.model_selection import train_test_split

y = final_data['income']
X = final_data.drop('income',axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 99)

In [43]:
from sklearn.tree import DecisionTreeClassifier

dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [47]:
y_pred_default = dt_default.predict(X_test)

In [48]:
from sklearn.metrics import classification_report

In [50]:
print(classification_report(y_true=y_test, y_pred= y_pred_default))

              precision    recall  f1-score   support

           0       0.86      0.95      0.91      6867
           1       0.78      0.52      0.63      2182

   micro avg       0.85      0.85      0.85      9049
   macro avg       0.82      0.74      0.77      9049
weighted avg       0.84      0.85      0.84      9049



In [52]:
print(confusion_matrix(y_test,y_pred_default))
print("Accuracy : ", accuracy_score(y_test,y_pred_default))

[[6553  314]
 [1039 1143]]
Accuracy :  0.8504807161012267
