In [45]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn_evaluation import plot
import loadData
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree._tree import TREE_LEAF
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit
from mlxtend.plotting import plot_decision_regions

In [57]:
# load data
# The loader returns a (train, test) pair of frames read from 'processed.cleveland.data'.
training_set, testing_set = loadData.loadDataWithoutTestSet('processed.cleveland.data', split_to_test=True)

# get features and labels: the last column is the label, everything before it is a feature.
# reset_index(drop=True) returns a fresh, 0-based object, so the cleaning steps below
# never write into a slice of the loader's frame (avoids SettingWithCopyWarning).
training_set_labels = training_set.iloc[:, -1].reset_index(drop=True)
training_set = training_set.iloc[:, 0:-1].reset_index(drop=True)

testing_set_labels = testing_set.iloc[:, -1].reset_index(drop=True)
testing_set = testing_set.iloc[:, 0:-1].reset_index(drop=True)

print(training_set.shape)
print(testing_set.shape)

# Every '?' entry becomes 0 so that 'ca' and 'thal' can be parsed as numbers.
# NOTE(review): presumably '?' marks a missing value; imputing 0 may not be a valid
# category for these columns — confirm against the dataset description.
training_set = training_set.replace('?', 0)
testing_set = testing_set.replace('?', 0)

training_set[['ca', 'thal']] = training_set[['ca', 'thal']].apply(pd.to_numeric)
testing_set[['ca', 'thal']] = testing_set[['ca', 'thal']].apply(pd.to_numeric)

(202, 13)
(101, 13)


In [61]:
# decision tree
# Baseline tree with default hyperparameters. Tree construction is randomized, so
# random_state is pinned to make the accuracies below reproducible across
# Restart & Run All.
dtc = tree.DecisionTreeClassifier(random_state=0)
dtc = dtc.fit(training_set, training_set_labels)

# Accuracy on the data the tree was fitted on.
predicted_training_set_labels = dtc.predict(training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels)
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy on the held-out test split.
predicted_testing_set_labels = dtc.predict(testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

1.0
0.44554455445544555


In [62]:
# Candidate hyperparameter values for the decision tree; GridSearchCV
# cross-validates every combination of them on the training split.
criterion = ['gini', 'entropy']
splitter = ['best', 'random']
min_samples_split = [2, 3, 4]
min_samples_leaf = [1, 2, 3, 4, 5]
param_grid = {
    'criterion': criterion,
    'splitter': splitter,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
grid_dtc = GridSearchCV(estimator=dtc, param_grid=param_grid)
grid_result_dtc = grid_dtc.fit(training_set, training_set_labels)

print("Best: %f using %s" % (grid_result_dtc.best_score_, grid_result_dtc.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
cv_table = grid_result_dtc.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda row: row[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Score the search object itself on both splits (predict goes through the
# best estimator that GridSearchCV refits by default).
predicted_training_set_labels = pd.DataFrame(grid_dtc.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = pd.DataFrame(grid_dtc.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

Best: 0.613861 using {'criterion': 'gini', 'min_samples_leaf': 2, 'min_samples_split': 3, 'splitter': 'random'}
0.613861 (0.033870) with: {'criterion': 'gini', 'min_samples_leaf': 2, 'min_samples_split': 3, 'splitter': 'random'}
0.608911 (0.036885) with: {'criterion': 'gini', 'min_samples_leaf': 5, 'min_samples_split': 4, 'splitter': 'random'}
0.603960 (0.018629) with: {'criterion': 'gini', 'min_samples_leaf': 5, 'min_samples_split': 3, 'splitter': 'random'}
0.599010 (0.023474) with: {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
0.599010 (0.054044) with: {'criterion': 'entropy', 'min_samples_leaf': 4, 'min_samples_split': 3, 'splitter': 'random'}
0.594059 (0.027004) with: {'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 3, 'splitter': 'random'}
0.589109 (0.040283) with: {'criterion': 'entropy', 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'random'}
0.589109 (0.028899) with: {'criterion': 'entropy', 'min_samples

In [63]:
# One grid-search plot per (criterion, splitter) pair, each varying
# min_samples_leaf against min_samples_split and saved to its own PNG.
# NOTE(review): `grid_scores_` is a legacy GridSearchCV attribute (removed in
# scikit-learn 0.20) — confirm the pinned scikit-learn / sklearn_evaluation versions.
dtc_plot_specs = [
    ('entropy', 'random', 'dataset2_dtc1.png'),
    ('gini', 'random', 'dataset2_dtc2.png'),
    ('entropy', 'best', 'dataset2_dtc3.png'),
    ('gini', 'best', 'dataset2_dtc4.png'),
]
for plot_idx, (crit_name, split_name, png_name) in enumerate(dtc_plot_specs):
    ax = plot.grid_search(grid_dtc.grid_scores_, change=('min_samples_leaf', 'min_samples_split'),
                          subset={'criterion': crit_name, 'splitter': split_name})
    fig = ax.get_figure()
    if plot_idx == 0:
        # Only the first plot sets the figure size explicitly.
        fig.set_size_inches(18.5, 10.5)
    fig.savefig(png_name)
    # Clear the figure once it has been written to disk.
    fig.clf()



<Figure size 1332x756 with 0 Axes>

In [64]:
# Fit one tree with fixed hyperparameters and render it to PNG with Graphviz.
from subprocess import check_call

# splitter='random' makes the fitted tree vary between runs; random_state pins it
# so the exported picture is reproducible.
# NOTE(review): these hyperparameters are hard-coded rather than taken from
# grid_dtc.best_params_ — confirm that is intentional.
dtc = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, min_samples_split=2,
                                  splitter='random', random_state=0)
dtc.fit(training_set, training_set_labels)

# export_graphviz returns the DOT text only when out_file is None; with a file
# name it writes the file and returns None, which left graphviz.Source wrapping
# None. Fetch the text once and write the .dot file ourselves.
dot_data = tree.export_graphviz(dtc, out_file=None, feature_names=list(training_set.columns),
                                filled=True, rounded=True)
with open('dataset2_tree.dot', 'w') as dot_file:
    dot_file.write(dot_data)
graph = graphviz.Source(dot_data)

# Requires the Graphviz `dot` executable on PATH; raises CalledProcessError on failure.
check_call(['dot', '-Tpng', 'dataset2_tree.dot', '-o', 'dataset2_tree.png'])

0

In [65]:
# neural networks
# Baseline MLP with logistic activations trained by SGD. Weight initialisation and
# sample shuffling are random, so random_state is pinned for reproducible accuracies.
# NOTE(review): features are fed in unscaled — consider standardising them first.
nn = MLPClassifier(activation='logistic', solver='sgd', random_state=0)
nn.fit(training_set, training_set_labels)

# Accuracy on the data the network was fitted on.
predicted_training_set_labels = nn.predict(training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels)
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy on the held-out test split.
predicted_testing_set_labels = nn.predict(testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

0.5495049504950495
0.5247524752475248


In [66]:
# Candidate values for the MLP search: L2 penalty (alpha), initial learning
# rate and SGD momentum. Every combination is cross-validated.
alphas = [0.0001, 0.0002, 0.0005, 0.0007, 0.001, 0.002, 0.005, 0.01]
learning_rates = [0.001, 0.002, 0.005, 0.007, 0.01]
momentums = [0.9, 0.85, 0.8, 0.75, 0.7]
param_grid = {
    'alpha': alphas,
    'learning_rate_init': learning_rates,
    'momentum': momentums,
}
grid_nn = GridSearchCV(estimator=nn, param_grid=param_grid)
grid_result_nn = grid_nn.fit(training_set, training_set_labels)

print("Best: %f using %s" % (grid_result_nn.best_score_, grid_result_nn.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
cv_table = grid_result_nn.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda row: row[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy of the search object on both splits, kept under NN-specific names.
predicted_training_set_labels = pd.DataFrame(grid_nn.predict(training_set))
training_set_acc_nn = accuracy_score(training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = pd.DataFrame(grid_nn.predict(testing_set))
testing_set_acc_nn = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc_nn)
print(testing_set_acc_nn)























Best: 0.564356 using {'alpha': 0.0005, 'learning_rate_init': 0.005, 'momentum': 0.85}
0.564356 (0.026255) with: {'alpha': 0.0005, 'learning_rate_init': 0.005, 'momentum': 0.85}
0.564356 (0.026255) with: {'alpha': 0.001, 'learning_rate_init': 0.007, 'momentum': 0.75}
0.564356 (0.035298) with: {'alpha': 0.002, 'learning_rate_init': 0.005, 'momentum': 0.9}
0.564356 (0.026255) with: {'alpha': 0.01, 'learning_rate_init': 0.007, 'momentum': 0.8}
0.559406 (0.039644) with: {'alpha': 0.0002, 'learning_rate_init': 0.005, 'momentum': 0.9}
0.559406 (0.028149) with: {'alpha': 0.0007, 'learning_rate_init': 0.007, 'momentum': 0.85}
0.559406 (0.019699) with: {'alpha': 0.0007, 'learning_rate_init': 0.007, 'momentum': 0.75}
0.559406 (0.028149) with: {'alpha': 0.002, 'learning_rate_init': 0.007, 'momentum': 0.75}
0.559406 (0.019699) with: {'alpha': 0.002, 'learning_rate_init': 0.007, 'momentum': 0.7}
0.554455 (0.010535) with: {'alpha': 0.0001, 'learning_rate_init': 0.002, 'momentum': 0.9}
0.554455 (0.013

In [67]:
# Two views of the MLP grid search, each holding one hyperparameter fixed:
# (varied pair, fixed value, output file).
# NOTE(review): `grid_scores_` is a legacy GridSearchCV attribute (removed in
# scikit-learn 0.20) — confirm the pinned scikit-learn / sklearn_evaluation versions.
nn_plot_specs = [
    (('alpha', 'learning_rate_init'), {'momentum': 0.85}, 'dataset2_NN1.png'),
    (('alpha', 'momentum'), {'learning_rate_init': 0.005}, 'dataset2_NN2.png'),
]
for plot_idx, (varied_pair, fixed_value, png_name) in enumerate(nn_plot_specs):
    ax = plot.grid_search(grid_nn.grid_scores_, change=varied_pair,
                          subset=fixed_value)
    fig = ax.get_figure()
    if plot_idx == 0:
        # Only the first plot sets the figure size explicitly.
        fig.set_size_inches(18.5, 10.5)
    fig.savefig(png_name)
    # Clear the figure once it has been written to disk.
    fig.clf()



<Figure size 1332x756 with 0 Axes>

In [68]:
# boosting
# Baseline gradient-boosting classifier with default hyperparameters.
# random_state is pinned so the fitted ensemble, and therefore the accuracies
# printed below, are reproducible across Restart & Run All.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(training_set, training_set_labels)

# Accuracy on the data the model was fitted on.
predicted_training_set_labels = gbc.predict(training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels)
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy on the held-out test split.
predicted_testing_set_labels = gbc.predict(testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

1.0
0.5742574257425742


In [69]:
# Candidate ensemble sizes and shrinkage rates for gradient boosting; every
# combination is cross-validated on the training split.
nEstimators = [100, 300, 500, 700, 900]
learning_rates = [0.1, 0.3, 0.5, 0.7, 1.0]
param_grid = {
    'learning_rate': learning_rates,
    'n_estimators': nEstimators,
}
grid_b = GridSearchCV(estimator=gbc, param_grid=param_grid)
grid_result_b = grid_b.fit(training_set, training_set_labels)

print("Best: %f using %s" % (grid_result_b.best_score_, grid_result_b.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
cv_table = grid_result_b.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda row: row[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy of the search object on both splits.
predicted_training_set_labels = pd.DataFrame(grid_b.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = pd.DataFrame(grid_b.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

Best: 0.594059 using {'learning_rate': 0.3, 'n_estimators': 500}
0.594059 (0.031843) with: {'learning_rate': 0.3, 'n_estimators': 500}
0.594059 (0.021379) with: {'learning_rate': 0.5, 'n_estimators': 900}
0.589109 (0.028104) with: {'learning_rate': 0.5, 'n_estimators': 300}
0.589109 (0.030849) with: {'learning_rate': 1.0, 'n_estimators': 100}
0.589109 (0.020772) with: {'learning_rate': 1.0, 'n_estimators': 500}
0.589109 (0.020772) with: {'learning_rate': 1.0, 'n_estimators': 900}
0.584158 (0.024111) with: {'learning_rate': 0.1, 'n_estimators': 300}
0.584158 (0.024111) with: {'learning_rate': 0.1, 'n_estimators': 500}
0.584158 (0.024111) with: {'learning_rate': 0.1, 'n_estimators': 700}
0.584158 (0.031565) with: {'learning_rate': 0.3, 'n_estimators': 100}
0.584158 (0.034887) with: {'learning_rate': 0.5, 'n_estimators': 100}
0.584158 (0.034887) with: {'learning_rate': 0.5, 'n_estimators': 500}
0.579208 (0.030978) with: {'learning_rate': 0.5, 'n_estimators': 700}
0.574257 (0.021778) with:

In [70]:
# Visualise the boosting grid search over both searched hyperparameters and save it.
# NOTE(review): `grid_scores_` is a legacy GridSearchCV attribute (removed in
# scikit-learn 0.20) — confirm the pinned scikit-learn / sklearn_evaluation versions.
boosting_axes = ('learning_rate', 'n_estimators')
ax = plot.grid_search(grid_b.grid_scores_, change=boosting_axes)
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
fig.savefig('dataset2_boosting1.png')
# Clear the figure once it has been written to disk.
fig.clf()



<Figure size 1332x756 with 0 Axes>

In [71]:
# KNN
# Baseline k-nearest-neighbours classifier with default hyperparameters.
knn = KNeighborsClassifier()
knn.fit(training_set, training_set_labels)

# Accuracy on the training split.
predicted_training_set_labels = pd.DataFrame(knn.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy on the held-out test split.
predicted_testing_set_labels = pd.DataFrame(knn.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

0.5792079207920792
0.5247524752475248


In [72]:
# Candidate neighbourhood sizes, vote weightings and Minkowski exponents (p)
# for KNN; every combination is cross-validated on the training split.
n_neighbors = [5, 6, 7, 8, 9, 10]
weights = ["uniform", "distance"]
p = [1, 2, 3, 4]
param_grid = {
    'n_neighbors': n_neighbors,
    'weights': weights,
    'p': p,
}
grid_knn = GridSearchCV(estimator=knn, param_grid=param_grid)
grid_result_knn = grid_knn.fit(training_set, training_set_labels)

print("Best: %f using %s" % (grid_result_knn.best_score_, grid_result_knn.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
cv_table = grid_result_knn.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda row: row[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy of the search object on both splits.
predicted_training_set_labels = pd.DataFrame(grid_knn.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = pd.DataFrame(grid_knn.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

Best: 0.524752 using {'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}
0.524752 (0.027067) with: {'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}
0.519802 (0.008919) with: {'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
0.519802 (0.009401) with: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
0.519802 (0.009401) with: {'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
0.519802 (0.009401) with: {'n_neighbors': 10, 'p': 4, 'weights': 'uniform'}
0.514851 (0.031325) with: {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
0.514851 (0.016534) with: {'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}
0.514851 (0.016534) with: {'n_neighbors': 9, 'p': 3, 'weights': 'uniform'}
0.514851 (0.016534) with: {'n_neighbors': 9, 'p': 4, 'weights': 'uniform'}
0.514851 (0.016123) with: {'n_neighbors': 10, 'p': 2, 'weights': 'distance'}
0.509901 (0.024375) with: {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
0.509901 (0.015171) with: {'n_neighbors': 8, 'p': 1, 'weights': 'distance'}
0.509901 (0.019898) wi

In [73]:
# One grid-search plot per vote weighting, each varying n_neighbors against p.
# NOTE(review): `grid_scores_` is a legacy GridSearchCV attribute (removed in
# scikit-learn 0.20) — confirm the pinned scikit-learn / sklearn_evaluation versions.
knn_plot_specs = [
    ('uniform', 'dataset2_knn1.png'),
    ('distance', 'dataset2_knn2.png'),
]
for weight_mode, png_name in knn_plot_specs:
    ax = plot.grid_search(grid_knn.grid_scores_, change=('n_neighbors', 'p'),
                          subset={'weights': weight_mode})
    fig = ax.get_figure()
    fig.set_size_inches(18.5, 10.5)
    fig.savefig(png_name)
    # Clear the figure once it has been written to disk.
    fig.clf()



<Figure size 1332x756 with 0 Axes>

In [74]:
# SVM
# Baseline support-vector classifier with default hyperparameters.
svm_c = svm.SVC()
svm_c.fit(training_set, training_set_labels)

# Accuracy on the training split.
predicted_training_set_labels = pd.DataFrame(svm_c.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy on the held-out test split.
predicted_testing_set_labels = pd.DataFrame(svm_c.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

1.0
0.5247524752475248


In [78]:
# Log-spaced candidates: C over 1e-3 .. 1e7 and gamma over 1e-5 .. 1e3.
# Every (C, gamma) pair is cross-validated on the training split.
C_range = np.power(10., np.arange(-3, 8))
gamma_range = np.power(10., np.arange(-5, 4))
param_grid = {
    'C': C_range,
    'gamma': gamma_range,
}
grid_svc = GridSearchCV(estimator=svm_c, param_grid=param_grid)
grid_result_svc = grid_svc.fit(training_set, training_set_labels)

print("Best: %f using %s" % (grid_result_svc.best_score_, grid_result_svc.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
cv_table = grid_result_svc.cv_results_
means = cv_table['mean_test_score']
stds = cv_table['std_test_score']
params = cv_table['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda row: row[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Accuracy of the search object on both splits.
predicted_training_set_labels = pd.DataFrame(grid_svc.predict(training_set))
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

predicted_testing_set_labels = pd.DataFrame(grid_svc.predict(testing_set))
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

Best: 0.594059 using {'C': 10000.0, 'gamma': 1e-05}
0.594059 (0.027943) with: {'C': 10000.0, 'gamma': 1e-05}
0.579208 (0.051288) with: {'C': 100000.0, 'gamma': 1e-05}
0.574257 (0.026815) with: {'C': 1000.0, 'gamma': 1e-05}
0.559406 (0.051968) with: {'C': 10000000.0, 'gamma': 1e-05}
0.554455 (0.021027) with: {'C': 100.0, 'gamma': 1e-05}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 1e-05}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 0.0001}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 0.001}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 0.01}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 0.1}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 1.0}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 10.0}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 100.0}
0.549505 (0.013970) with: {'C': 0.001, 'gamma': 1000.0}
0.549505 (0.013970) with: {'C': 0.01, 'gamma': 1e-05}
0.549505 (0.013970) with: {'C': 0.01, 'gamma': 0.0001}
0.549505 (0.013970) with: {'C': 0.01, 'gamma': 0.001}
0.549505

In [80]:
# Visualise the SVM grid search over both searched hyperparameters and save it.
# NOTE(review): `grid_scores_` is a legacy GridSearchCV attribute (removed in
# scikit-learn 0.20) — confirm the pinned scikit-learn / sklearn_evaluation versions.
svm_axes = ('C', 'gamma')
ax = plot.grid_search(grid_svc.grid_scores_, change=svm_axes)
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
fig.savefig('dataset2_svm.png')
# Clear the figure once it has been written to disk.
fig.clf()



<Figure size 1332x756 with 0 Axes>

In [82]:
# Linear SVM: grid-search the regularisation strength C for LinearSVC.
# random_state pins the solver's internal shuffling so results are reproducible.
linear_svc = svm.LinearSVC(random_state=0)
C_range = 10. ** np.arange(-3, 8)

# Only C is searched: LinearSVC has no `kernel` parameter, and `kernel` is not
# defined in any visible cell (the previous version relied on leftover kernel state).
param_grid = dict(C=C_range)

# The search must wrap linear_svc; it previously wrapped the RBF `svm_c` by mistake,
# so the LinearSVC created above was never used.
# NOTE(review): this rebinds grid_svc / grid_result_svc from the RBF search cell.
grid_svc = GridSearchCV(estimator=linear_svc, param_grid=param_grid)
grid_result_svc = grid_svc.fit(training_set, training_set_labels)


print("Best: %f using %s" % (grid_result_svc.best_score_, grid_result_svc.best_params_))

# List every candidate with its mean/std CV score, highest mean first.
means = grid_result_svc.cv_results_['mean_test_score']
stds = grid_result_svc.cv_results_['std_test_score']
params = grid_result_svc.cv_results_['params']
results = zip(means, stds, params)
for mean, stdev, param in sorted(results, key=lambda x: x[0], reverse=True):
    print("%f (%f) with: %r" % (mean, stdev, param))


# Accuracy of the search object on the training split.
predicted_training_set_labels = grid_svc.predict(training_set)
predicted_training_set_labels = pd.DataFrame(predicted_training_set_labels)
training_set_acc = accuracy_score(training_set_labels, predicted_training_set_labels)

# Accuracy of the search object on the held-out test split.
predicted_testing_set_labels = grid_svc.predict(testing_set)
predicted_testing_set_labels = pd.DataFrame(predicted_testing_set_labels)
testing_set_acc = accuracy_score(testing_set_labels, predicted_testing_set_labels)

print(training_set_acc)
print(testing_set_acc)

KeyboardInterrupt: 