In [61]:
import loadData
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [62]:
adult_training_set, adult_testing_set = loadData.loadDataWithTestSet('adult.data', 'adult.test')
#hd_training_set, hd_testing_set = loadData.loadDataWithoutTestSet('processed.cleveland.data', True)

In [63]:
# preprocess data
# training set
adult_training_set_labels = adult_training_set.iloc[:,-1]
adult_training_set = adult_training_set.iloc[:,0:-1]
adult_training_set_labels[adult_training_set_labels.str.contains('>50K',na=False)] = 1
adult_training_set_labels[adult_training_set_labels.str.contains('<=50K',na=False)] = 0
adult_training_set_labels = adult_training_set_labels.astype('int')
# testing set
adult_testing_set_labels = adult_testing_set.iloc[:,-1]
adult_testing_set = adult_testing_set.iloc[:,0:-1]
adult_testing_set_labels[adult_testing_set_labels.str.contains('>50K',na=False)] = 1
adult_testing_set_labels[adult_testing_set_labels.str.contains('<=50K',na=False)] = 0
adult_testing_set_labels = adult_testing_set_labels.astype('int')

processed_adult_training_set = pd.get_dummies(adult_training_set, columns=['workclass','education','marital-status','occupation','relationship',
                                           'race','sex','native-country'])
processed_adult_testing_set = pd.get_dummies(adult_testing_set, columns=['workclass','education','marital-status','occupation','relationship',
                                           'race','sex','native-country'])
print(processed_adult_training_set.columns[~processed_adult_training_set.columns.isin(processed_adult_testing_set.columns)])
missing_column = np.zeros(16281)
missing_column = pd.DataFrame(missing_column, columns=['native-country_Holand-Netherlands'])
processed_adult_testing_set = pd.concat([processed_adult_testing_set,missing_column], axis=1)

Index(['native-country_Holand-Netherlands'], dtype='object')
   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   25  226802              7             0             0              40   
1   38   89814              9             0             0              50   
2   28  336951             12             0             0              40   
3   44  160323             10          7688             0              40   
4   18  103497             10             0             0              30   

   workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0            0                      0                    0   
1            0                      0                    0   
2            0                      0                    1   
3            0                      0                    0   
4            1                      0                    0   

   workclass_Never-worked                ...                  \
0                       0              

In [67]:
# decision tree
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(processed_adult_training_set, adult_training_set_labels)
from sklearn.tree._tree import TREE_LEAF

def prune_index(inner_tree, index, threshold):
    if inner_tree.value[index].min() < threshold:
        # turn node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
    # if there are shildren, visit them as well
    if inner_tree.children_left[index] != TREE_LEAF:
        prune_index(inner_tree, inner_tree.children_left[index], threshold)
        prune_index(inner_tree, inner_tree.children_right[index], threshold)

training_set_acc = []
testing_set_acc = []
for x in range(0, 100, 5):
    prune_index(dtc.tree_, 0, x)
    predicted_training_set_labels = dtc.predict(processed_adult_training_set)
    predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels) 
    training_set_acc.append(accuracy_score(adult_training_set_labels, predicted_training_set_labels))
    
    predicted_testing_set_labels = dtc.predict(processed_adult_testing_set)
    predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
    testing_set_acc.append(accuracy_score(adult_testing_set_labels, predicted_testing_set_labels))
print(training_set_acc)
print(testing_set_acc)


[0.9999692884125181, 0.9318202757900556, 0.9104450109026135, 0.8995116857590368, 0.8925094438131507, 0.8882712447406407, 0.8847701237676976, 0.8825896010564787, 0.880900463744971, 0.879457019133319, 0.8778907281717392, 0.8760480329228217, 0.874512453548724, 0.8736218175117472, 0.8729154509996622, 0.8719633917877215, 0.8719019686127576, 0.8695985995516108, 0.8692914836767912, 0.8690457909769356]
[0.7977396965788343, 0.8213254714083902, 0.8299244518149991, 0.8317670904735581, 0.8325041459369817, 0.8436828204655734, 0.846446778453412, 0.8491493151526319, 0.8512990602542841, 0.852158958294945, 0.8510533750998096, 0.8515447454087587, 0.8526503286038941, 0.8530802776242246, 0.8540630182421227, 0.8540630182421227, 0.8541244395307414, 0.8525889073152755, 0.8536330692217923, 0.8536944905104109]


In [68]:
# graph decision tree as png
dot_data = tree.export_graphviz(dtc, out_file='tree1.dot', feature_names=processed_adult_training_set.columns, 
                                filled=True, rounded=True)  
graph = graphviz.Source(dot_data)
from subprocess import check_call
check_call(['dot','-Tpng','tree1.dot','-o','tree1.png'])

0