In [11]:
import loadData
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree._tree import TREE_LEAF
from sklearn.model_selection import GridSearchCV

In [3]:
adult_training_set, adult_testing_set = loadData.loadDataWithTestSet('adult.data', 'adult.test')
#hd_training_set, hd_testing_set = loadData.loadDataWithoutTestSet('processed.cleveland.data', True)

In [4]:
# preprocess data
# training set
adult_training_set_labels = adult_training_set.iloc[:,-1]
adult_training_set = adult_training_set.iloc[:,0:-1]
adult_training_set_labels[adult_training_set_labels.str.contains('>50K',na=False)] = 1
adult_training_set_labels[adult_training_set_labels.str.contains('<=50K',na=False)] = 0
adult_training_set_labels = adult_training_set_labels.astype('int')
# testing set
adult_testing_set_labels = adult_testing_set.iloc[:,-1]
adult_testing_set = adult_testing_set.iloc[:,0:-1]
adult_testing_set_labels[adult_testing_set_labels.str.contains('>50K',na=False)] = 1
adult_testing_set_labels[adult_testing_set_labels.str.contains('<=50K',na=False)] = 0
adult_testing_set_labels = adult_testing_set_labels.astype('int')

processed_adult_training_set = pd.get_dummies(adult_training_set, columns=['workclass','education','marital-status','occupation','relationship',
                                           'race','sex','native-country'])
processed_adult_testing_set = pd.get_dummies(adult_testing_set, columns=['workclass','education','marital-status','occupation','relationship',
                                           'race','sex','native-country'])
print(processed_adult_training_set.columns[~processed_adult_training_set.columns.isin(processed_adult_testing_set.columns)])
missing_column = np.zeros(16281)
missing_column = pd.DataFrame(missing_column, columns=['native-country_Holand-Netherlands'])
processed_adult_testing_set = pd.concat([processed_adult_testing_set,missing_column], axis=1)

Index(['native-country_Holand-Netherlands'], dtype='object')


In [67]:
# decision tree
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(processed_adult_training_set, adult_training_set_labels)

def prune_index(inner_tree, index, threshold):
    if inner_tree.value[index].min() < threshold:
        # turn node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
    # if there are shildren, visit them as well
    if inner_tree.children_left[index] != TREE_LEAF:
        prune_index(inner_tree, inner_tree.children_left[index], threshold)
        prune_index(inner_tree, inner_tree.children_right[index], threshold)

training_set_acc = []
testing_set_acc = []
for x in range(0, 100, 5):
    prune_index(dtc.tree_, 0, x)
    predicted_training_set_labels = dtc.predict(processed_adult_training_set)
    predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels) 
    training_set_acc.append(accuracy_score(adult_training_set_labels, predicted_training_set_labels))
    
    predicted_testing_set_labels = dtc.predict(processed_adult_testing_set)
    predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
    testing_set_acc.append(accuracy_score(adult_testing_set_labels, predicted_testing_set_labels))
print(training_set_acc)
print(testing_set_acc)


[0.9999692884125181, 0.9318202757900556, 0.9104450109026135, 0.8995116857590368, 0.8925094438131507, 0.8882712447406407, 0.8847701237676976, 0.8825896010564787, 0.880900463744971, 0.879457019133319, 0.8778907281717392, 0.8760480329228217, 0.874512453548724, 0.8736218175117472, 0.8729154509996622, 0.8719633917877215, 0.8719019686127576, 0.8695985995516108, 0.8692914836767912, 0.8690457909769356]
[0.7977396965788343, 0.8213254714083902, 0.8299244518149991, 0.8317670904735581, 0.8325041459369817, 0.8436828204655734, 0.846446778453412, 0.8491493151526319, 0.8512990602542841, 0.852158958294945, 0.8510533750998096, 0.8515447454087587, 0.8526503286038941, 0.8530802776242246, 0.8540630182421227, 0.8540630182421227, 0.8541244395307414, 0.8525889073152755, 0.8536330692217923, 0.8536944905104109]


In [68]:
# graph decision tree as png
dot_data = tree.export_graphviz(dtc, out_file='tree1.dot', feature_names=processed_adult_training_set.columns, 
                                filled=True, rounded=True)  
graph = graphviz.Source(dot_data)
from subprocess import check_call
check_call(['dot','-Tpng','tree1.dot','-o','tree1.png'])

0

In [9]:
# neural networks
nn = MLPClassifier(activation='logistic', solver='sgd')
nn.fit(processed_adult_training_set, adult_training_set_labels)

predicted_training_set_labels = nn.predict(processed_adult_training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels) 
training_set_acc=accuracy_score(adult_training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = nn.predict(processed_adult_testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc=accuracy_score(adult_testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

[0.7884892970117625]
[0.789017873594988]


In [29]:
alphas = [0.0001, 0.0002, 0.0005, 0.0007, 0.001, 0.002, 0.005, 0.01]
learning_rates = [0.001, 0.002, 0.005, 0.007, 0.01]
momentums = [0.9, 0.85, 0.8, 0.75, 0.7]
param_grid = dict(alpha=alphas, learning_rate_init = learning_rates, momentum = momentums)
grid = GridSearchCV(estimator=nn, param_grid=param_grid)
grid_result = grid.fit(processed_adult_training_set, adult_training_set_labels)


print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda x: x[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))
    

predicted_training_set_labels = grid.predict(processed_adult_training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels) 
training_set_acc=accuracy_score(adult_training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = grid.predict(processed_adult_testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc=accuracy_score(adult_testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

Best: 0.827769 using {'alpha': 0.005, 'learning_rate_init': 0.01, 'momentum': 0.75}
0.827769 (0.013989) with: {'alpha': 0.005, 'learning_rate_init': 0.01, 'momentum': 0.75}
0.813734 (0.015538) with: {'alpha': 0.0001, 'learning_rate_init': 0.01, 'momentum': 0.85}
0.808268 (0.020466) with: {'alpha': 0.0007, 'learning_rate_init': 0.01, 'momentum': 0.8}
0.806425 (0.015964) with: {'alpha': 0.002, 'learning_rate_init': 0.01, 'momentum': 0.9}
0.806241 (0.008303) with: {'alpha': 0.001, 'learning_rate_init': 0.01, 'momentum': 0.85}
0.805626 (0.028530) with: {'alpha': 0.0005, 'learning_rate_init': 0.01, 'momentum': 0.7}
0.804889 (0.011511) with: {'alpha': 0.0001, 'learning_rate_init': 0.01, 'momentum': 0.7}
0.804766 (0.008201) with: {'alpha': 0.001, 'learning_rate_init': 0.01, 'momentum': 0.9}
0.804398 (0.030839) with: {'alpha': 0.001, 'learning_rate_init': 0.01, 'momentum': 0.75}
0.803876 (0.019503) with: {'alpha': 0.005, 'learning_rate_init': 0.01, 'momentum': 0.8}
0.803661 (0.029333) with: {'