After tuning, the best n_neighbors of the K-Neighbors Classifier is 3, the best max_depth of the Decision Tree Classifier is 7.
We can discover that the testing accuracy for the ensemble method: voting, bagging, and boosting are all 0.823.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
import csv
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

file = open('D:/course/deep learning/bonus/train.csv', encoding='utf-8')
reader = csv.reader(file)
next(reader)
X = np.ndarray((0, 2))
y = np.ndarray((0,))
y_mapping = {'Bob': 0, 'Kate': 1, 'Mark': 2, 'Sue': 3}
i = 0
for row in reader:
    i += 1
    X = np.vstack((X, np.array(row[0:2])))
    y = np.append(y, y_mapping[row[2]])
X = X.astype(np.float)
y = y.astype(np.float)
file.close()

In [2]:
# hold out testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# hold out validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

In [3]:
# GridSearchCV: tune the n_neighbors of the K-Neighbors Classifier
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=1)

outer_scores = []
# outer folds
for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print('[Outer fold %d/5]' % (i + 1))
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    pipe = Pipeline([['sc', StandardScaler()], ['clf', KNeighborsClassifier()]])
    # hyperparameter tuning by grid search CV
    param_grid = {'clf__n_neighbors':list(range(1, 11))}
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=inner_cv)
    gs.fit(X_train, y_train)
    best_clf = gs.best_estimator_
    best_clf.fit(X_train, y_train)    
    outer_scores.append(best_clf.score(X_test, y_test))
    print('Test accuracy: %.2f (n_neighbors=%d selected by inner 10-fold CV)' % 
                  (outer_scores[i], gs.best_params_['clf__n_neighbors']))

print('\nTest accuracy: %.2f (5x10 nested CV)' % np.mean(outer_scores))

[Outer fold 1/5]
Test accuracy: 0.82 (n_neighbors=5 selected by inner 10-fold CV)
[Outer fold 2/5]
Test accuracy: 0.86 (n_neighbors=3 selected by inner 10-fold CV)
[Outer fold 3/5]
Test accuracy: 0.77 (n_neighbors=3 selected by inner 10-fold CV)
[Outer fold 4/5]
Test accuracy: 0.82 (n_neighbors=3 selected by inner 10-fold CV)
[Outer fold 5/5]
Test accuracy: 0.81 (n_neighbors=3 selected by inner 10-fold CV)

Test accuracy: 0.81 (5x10 nested CV)


In [4]:
# GridSearchCV: tune the max_depth of the Decision Tree Classifier
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
inner_cv = KFold(n_splits=10, shuffle=True, random_state=1)

outer_scores = []
# outer folds
for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    print('[Outer fold %d/5]' % (i + 1))
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    pipe = Pipeline([['sc', StandardScaler()], ['clf', DecisionTreeClassifier()]])
    # hyperparameter tuning by grid search CV
    param_grid = {'clf__max_depth':list(range(1, 11))}
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=inner_cv)
    gs.fit(X_train, y_train)
    best_clf = gs.best_estimator_
    best_clf.fit(X_train, y_train)    
    outer_scores.append(best_clf.score(X_test, y_test))
    print('Test accuracy: %.2f (max_depth=%d selected by inner 10-fold CV)' % 
                  (outer_scores[i], gs.best_params_['clf__max_depth']))

print('\nTest accuracy: %.2f (5x10 nested CV)' % np.mean(outer_scores))

[Outer fold 1/5]
Test accuracy: 0.80 (max_depth=7 selected by inner 10-fold CV)
[Outer fold 2/5]
Test accuracy: 0.78 (max_depth=7 selected by inner 10-fold CV)
[Outer fold 3/5]
Test accuracy: 0.78 (max_depth=6 selected by inner 10-fold CV)
[Outer fold 4/5]
Test accuracy: 0.82 (max_depth=7 selected by inner 10-fold CV)
[Outer fold 5/5]
Test accuracy: 0.80 (max_depth=7 selected by inner 10-fold CV)

Test accuracy: 0.80 (5x10 nested CV)




In [5]:
# ensemble method: voting
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', LogisticRegression(C = 10, random_state = 0, solver = "liblinear")]])
pipe2 = Pipeline([['clf', DecisionTreeClassifier(max_depth = 7, random_state = 0)]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', KNeighborsClassifier(n_neighbors = 3)]])

clf = VotingClassifier(estimators=[('lr', pipe1), ('dt', pipe2), ('knn', pipe3)], voting='soft')
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

clf_train = accuracy_score(y_train, y_train_pred) 
clf_test = accuracy_score(y_test, y_test_pred) 
print('[Voting] accuracy-train = %.3f, accuracy-test = %.3f' % (clf_train, clf_test))

[Voting] accuracy-train = 0.922, accuracy-test = 0.823




In [6]:
# ensemble method: bagging
tree = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=0)
bag = BaggingClassifier(base_estimator=tree, n_estimators=1000, max_samples=0.7, bootstrap=True,
                        max_features=1.0, bootstrap_features=False, n_jobs=1, random_state=1)

# Bagging
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred) 
bag_test = accuracy_score(y_test, y_test_pred) 
print('[Bagging] accuracy-train = %.3f, accuracy-test = %.3f' % (bag_train, bag_test))

[Bagging] accuracy-train = 0.904, accuracy-test = 0.823


In [7]:
# ensemble method: boosting
tree = DecisionTreeClassifier(criterion='entropy', max_depth=7)

# adaboost
ada = AdaBoostClassifier(base_estimator=tree, n_estimators=1000)
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('[AdaBoost] accuracy-train = %.3f, accuracy-test = %.3f' % 
      (ada_train, ada_test))

[AdaBoost] accuracy-train = 1.000, accuracy-test = 0.823
