In [66]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

from lab_tools import CIFAR10, get_hog_image

dataset = CIFAR10('../../extern_data/CIFAR10/')
X_train, y_train = dataset.train['hog'][:15000], dataset.train['labels'][:15000]
X_test, y_test = dataset.test['hog'], dataset.test['labels']


Pre-loading training data
Pre-loading test data


# Ridge Classifier

## Training & Tuning

In [3]:
tuned_parameters = [{"alpha": np.arange(0, 1, 0.2)}]

clf = GridSearchCV(RidgeClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))


Best parameters set found on development set:
{'alpha': 0.2} 0.7156666666666667
0.690 (+/-0.027) for {'alpha': 0.0}
0.716 (+/-0.007) for {'alpha': 0.2}
0.704 (+/-0.003) for {'alpha': 0.4}
0.699 (+/-0.005) for {'alpha': 0.6000000000000001}
0.695 (+/-0.003) for {'alpha': 0.8}


## Test

In [4]:
clf = RidgeClassifier(alpha = 0.2)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.3f}")

cm = confusion_matrix(y_test, pred)
print(cm)

Accuracy: 0.720
[[744 186  70]
 [133 659 208]
 [ 72 172 756]]


# Nearest Neighbor

## Training & Tuning

In [5]:
# NORMALIZATION
X_train_NN = preprocessing.normalize(X_train, axis=0)
X_test_NN = preprocessing.normalize(X_test, axis=0)

tuned_parameters = {"n_neighbors": np.arange(1, 30, 2), "weights": ["distance", "uniform"], "p": [1]}

clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=2)

#clf = KNeighborsClassifier(n_neighbors=10, weights='distance', p = 1)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)
print()

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}

0.673 (+/-0.015) for {'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
0.673 (+/-0.015) for {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
0.692 (+/-0.011) for {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
0.696 (+/-0.003) for {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
0.705 (+/-0.003) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.719 (+/-0.008) for {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
0.718 (+/-0.000) for {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
0.722 (+/-0.009) for {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
0.716 (+/-0.011) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.720 (+/-0.007) for {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
0.715 (+/-0.016) for {'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
0.719 (+/-0.017) for {'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}
0.713 (+/-0.009) for {'n_neighbors': 13, 'p': 1, 'weight

## Test

In [6]:
clf = KNeighborsClassifier(n_neighbors=7, weights='uniform', p = 1)

X_train_NN = preprocessing.normalize(X_train, axis=0)
X_test_NN = preprocessing.normalize(X_test, axis=0)

clf.fit(X_train_NN, y_train)

pred = clf.predict(X_test_NN)

accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.3f}")

cm = confusion_matrix(y_test, pred)
print(cm)

Accuracy: 0.761
[[751 188  61]
 [ 98 763 139]
 [ 36 195 769]]


# Decision Trees

## Training & Tuning

In [20]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

tuned_parameters = [{'criterion': ['gini', 'entropy', 'log_loss'], 
                     "random_state": [0, 1, 5], 
                     'max_depth': [1, 5, 10, 15, 20],
                     'splitter': ["best", "random"],
                     'ccp_alpha': [0, 0.1, 0.5, 1, 10]
                    }]

clf = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 10, 'random_state': 0, 'splitter': 'best'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 0, 'splitter': 'best'}
0.425 (+/-0.040) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 0, 'splitter': 'random'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 1, 'splitter': 'best'}
0.413 (+/-0.001) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 1, 'splitter': 'random'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 5, 'splitter': 'best'}
0.448 (+/-0.006) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 5, 'splitter': 'random'}
0.536 (+/-0.036) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 5, 'random_state': 0, 'splitter': 'best'}
0.541 (+/-0.009) for {'ccp_alpha': 0, 'criterion': 'gini

## Test

In [10]:

clf = DecisionTreeClassifier(random_state=5)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)


Normal Nearest Neighbor: 0.531
[[562 260 178]
 [207 525 268]
 [185 310 505]]


# Random Forest

## Training & Tuning

In [28]:
from sklearn.ensemble import RandomForestClassifier

tuned_parameters = [{
                     'n_estimators': [10, 100, 200],
                     'criterion': ['gini', 'entropy'], 
                     "random_state": [0, 1, 5], 
                     'max_depth': [1, 5, 10, 15, 20]
                    }]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200, 'random_state': 1} 0.7350000000000001
0.526 (+/-0.013) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 0}
0.523 (+/-0.015) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 1}
0.497 (+/-0.034) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 5}
0.555 (+/-0.023) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 0}
0.567 (+/-0.025) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 1}
0.560 (+/-0.033) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 5}
0.570 (+/-0.032) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 200, 'random_state': 0}
0.564 (+/-0.037) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 200, 'random_state': 1}
0.579 (+/-0.043) for {'criterion': 'gini', 'max_depth': 1, 'n_es

## Test

In [32]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(criterion= 'entropy', max_depth= 20, n_estimators= 200, random_state= 1)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)

Normal Nearest Neighbor: 0.775
[[797 149  54]
 [121 743 136]
 [ 55 161 784]]


# Neural Network

## Training & Tuning

In [82]:
from sklearn.neural_network import MLPClassifier

tuned_parameters = [{
                    "random_state": [10], # 0, 5, ,
                    "alpha": [0.0001], # , 0.0005, 0.00005,
                    "hidden_layer_sizes": [1, 10, 50, 100, 200, 300, 500, 1000], #[100, 200, 300], 
                    "activation":["relu"], # ["identity", "tanh", "relu"],
                    "solver":["adam"], # ["lbfgs", "adam"],
                    "learning_rate_init": [0.001]
                    }]

clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")

print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 1000, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'} 0.7916000000000001
0.333 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 1, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'}
0.753 (+/-0.011) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 10, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'}
0.757 (+/-0.003) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 50, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'}
0.766 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 100, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'}
0.779 (+/-0.000) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 200, 'learning_rate_init': 0.001, 'random_state': 10, 'solver': 'adam'}
0.782 (+/-0.007) for {'activa

## Test

In [30]:
clf = MLPClassifier(random_state=1, hidden_layer_sizes= 50)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)

Normal Nearest Neighbor: 0.792
[[826 134  40]
 [127 724 149]
 [ 38 135 827]]


# SVM

## Training & Tuning

In [26]:
from sklearn.svm import SVC

tuned_parameters = [{
                     "gamma": ['scale'],
                     "C": [ 3],
                     "kernel": ["poly", "rbf"],
                     "degree": [1, 2, 3, 4, 5, 10]
                    }]

clf = GridSearchCV(SVC(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'C': 3, 'degree': 1, 'gamma': 'scale', 'kernel': 'rbf'} 0.766
0.718 (+/-0.012) for {'C': 3, 'degree': 1, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C': 3, 'degree': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.750 (+/-0.015) for {'C': 3, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C': 3, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
0.750 (+/-0.009) for {'C': 3, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C': 3, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
0.746 (+/-0.001) for {'C': 3, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C': 3, 'degree': 4, 'gamma': 'scale', 'kernel': 'rbf'}
0.733 (+/-0.005) for {'C': 3, 'degree': 5, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C': 3, 'degree': 5, 'gamma': 'scale', 'kernel': 'rbf'}
0.610 (+/-0.085) for {'C': 3, 'degree': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.766 (+/-0.003) for {'C':

## Test

In [83]:
#clf = MLPClassifier(random_state=5, activation="relu", hidden_layer_sizes=27, solver="adam")
clf = MLPClassifier(random_state=10, hidden_layer_sizes=1000, activation="relu", alpha=0.0001)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)

Normal Nearest Neighbor: 0.826
[[859 106  35]
 [121 776 103]
 [ 35 122 843]]
