# Decision Tree of Adult Data

Developers: Danielle Shackley, Brendan Dao
Principles of Machine Learning Final Project
Description: This script uses the 'adult' data CSV file to train a decision tree limited to a depth of 4 and plots the feature importances. The target labels are converted to numeric values with LabelEncoder, and the feature columns with OrdinalEncoder.

In [None]:
!pip install mglearn
!pip install graphviz
!pip install pydot
sudo apt-get install graphviz

In [1]:
import mglearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
import os
# Append a Windows Graphviz install directory to PATH for this process --
# presumably so the Graphviz executables can be found when the tree is
# rendered later (TODO confirm); adjust or remove on other systems.
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'

# Decision trees

In [2]:
import sys

### Loading data set


In [3]:
# Load the adult data set into a DataFrame; "adult.csv" is resolved relative
# to the current working directory.
# NOTE(review): the class labels printed further down (' <=50K', ' >50K')
# carry a leading space, so string values in this file appear to be
# space-padded -- confirm before comparing against literal strings.
df=pd.read_csv("adult.csv")
# view data
#df

In [4]:
# Split the frame into features and target: every column except the last is a
# feature, and the last column is the label.
# Slicing with `:-1` (rather than a hard-coded `0:14`) keeps X aligned with
# `headers` below, which is also built from `df.columns[:-1]`.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Feature names, in the same column order as X.
headers = np.array(list(df.columns[:-1]))
# view data (X)
#X

Converting the target labels to numeric values with LabelEncoder

In [5]:
# Import LabelEncoder
from sklearn import preprocessing
# creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers. `fit_transform` fits the encoder and
# encodes `y` in one step, so a separate `le.fit(y)` beforehand is not needed.
y = le.fit_transform(y)


In [6]:
# Classes learned by the encoder; a label's position in this array is the
# integer it was encoded to.
le.classes_

array([' <=50K', ' >50K'], dtype=object)

In [7]:

# View the distinct y labels. LabelEncoder numbers the classes by their
# position in `le.classes_`, so:
# <=50K -> 0
# >50K  -> 1
le.classes_
# view converted number - label
#le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
#print(le_name_mapping)


array([' <=50K', ' >50K'], dtype=object)

In [8]:
from sklearn.preprocessing import OrdinalEncoder
# Fit an ordinal encoder on every feature column, then replace X with the
# encoded array. `enc` stays bound so the learned categories can be inspected.
enc = OrdinalEncoder().fit(X)
X = enc.transform(X)
X.shape

(32561, 14)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# X was already ordinal-encoded in the cell above. Fitting a second encoder on
# the encoded array would leave the values in X as they are, but it would
# overwrite `enc.categories_` with the numeric codes and lose the original
# category values, so the encoding is not repeated here.
X.shape

In [None]:
# Display categories
#enc.categories_

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Stratified train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Baseline: a tree with no depth limit.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

In [None]:
# Refit on the same split with the tree limited to a depth of 4.
tree = DecisionTreeClassifier(max_depth=4, random_state=0).fit(X_train, y_train)

train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_accuracy))
print("Accuracy on test set: {:.3f}".format(test_accuracy))

### Analyzing Decision Trees

In [None]:
# Plotting decision tree
from sklearn.tree import export_graphviz
# `class_names` must follow the ascending order of the encoded labels:
# 0 is "<=50K" and 1 is ">50K" (the order of `le.classes_`). Listing them the
# other way round labels every node with the wrong class.
export_graphviz(tree, out_file="tree.dot", class_names=["<=50K", ">50K"],
                feature_names=list(df.columns[:-1]), impurity=False, filled=True)

In [None]:
# Feature names: every column label except the last one.
df.columns[:-1].tolist()

In [None]:
import graphviz
# Read the DOT description from "tree.dot" and render it inline.
# The encoding is given explicitly because export_graphviz writes the file as
# UTF-8, while a bare open() would decode with the platform default.
with open("tree.dot", encoding="utf-8") as dot_file:
    dot_graph = dot_file.read()
display(graphviz.Source(dot_graph))

In [None]:

import pydot
# Load the graph from "tree.dot"; the unpacking requires that the file holds
# exactly one graph. Then write it out as a PNG image.
[graph] = pydot.graph_from_dot_file('tree.dot')
graph.write_png('somefile.png')

### Feature Importance in trees

In [None]:
# Heading on one line, then the fitted tree's importance array on the next.
print("Feature importances:", tree.feature_importances_, sep="\n")

In [None]:
# Shape of the encoded feature matrix. X is a plain array at this point, so
# its shape is read directly rather than through the `.data` buffer attribute.
X.shape

In [None]:
# Scratch check: np.arange(n) yields the integers 0..n-1.
np.arange(4)

In [None]:
def plot_feature_importances_cancer(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Tick labels, one per feature. When omitted, every column name of the
        module-level ``df`` except the last is used.

    The chart is drawn through ``plt`` on the current figure; nothing is
    returned.
    """
    importances = model.feature_importances_
    # Take the feature count from the model rather than from the global X, so
    # the bars always match the importances being plotted.
    n_features = len(importances)
    if feature_names is None:
        feature_names = list(df.columns[:-1])
    positions = np.arange(n_features)
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    # Leave one unit of padding below the first bar and above the last.
    plt.ylim(-1, n_features)

plot_feature_importances_cancer(tree)