In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

from lab_tools import CIFAR10, get_hog_image

dataset = CIFAR10('../../extern_data/CIFAR10/')
X_train, y_train = dataset.train['hog'][:3000], dataset.train['labels'][:3000]
X_test, y_test = dataset.test['hog'], dataset.test['labels']

Pre-loading training data
Pre-loading test data


# Ridge Classifier

## Training & Tuning

In [2]:
tuned_parameters = [{"alpha": np.arange(0, 1, 0.2)}]

clf = GridSearchCV(RidgeClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))


Best parameters set found on development set:
{'alpha': 0.2}
0.690 (+/-0.027) for {'alpha': 0.0}
0.716 (+/-0.007) for {'alpha': 0.2}
0.704 (+/-0.003) for {'alpha': 0.4}
0.699 (+/-0.005) for {'alpha': 0.6000000000000001}
0.695 (+/-0.003) for {'alpha': 0.8}


## Test

In [None]:
clf = RidgeClassifier(alpha = 0.06)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.3f}")

cm = confusion_matrix(y_test, pred)
print(cm)

# Nearest Neighbor

## Training & Tuning

In [None]:
# NORMALIZATION
X_train_NN = preprocessing.normalize(X_train, axis=0)
X_test_NN = preprocessing.normalize(X_test, axis=0)

tuned_parameters = {"n_neighbors": np.arange(1, 30, 2), "weights": ["distance"], "p": [1]}

clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=2)

#clf = KNeighborsClassifier(n_neighbors=10, weights='distance', p = 1)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)
print()

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

## Test

In [None]:
clf = KNeighborsClassifier(n_neighbors=10, weights='distance', p = 1)

X_train_NN = preprocessing.normalize(X_train, axis=0)
X_test_NN = preprocessing.normalize(X_test, axis=0)

clf.fit(X_train_NN, y_train)

pred = clf.predict(X_test_NN)

accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.3f}")

cm = confusion_matrix(y_test, pred)
print(cm)

# Decision Trees

## Training & Tuning

In [20]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

tuned_parameters = [{'criterion': ['gini', 'entropy', 'log_loss'], 
                     "random_state": [0, 1, 5], 
                     'max_depth': [1, 5, 10, 15, 20],
                     'splitter': ["best", "random"],
                     'ccp_alpha': [0, 0.1, 0.5, 1, 10]
                    }]

clf = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 10, 'random_state': 0, 'splitter': 'best'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 0, 'splitter': 'best'}
0.425 (+/-0.040) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 0, 'splitter': 'random'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 1, 'splitter': 'best'}
0.413 (+/-0.001) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 1, 'splitter': 'random'}
0.426 (+/-0.013) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 5, 'splitter': 'best'}
0.448 (+/-0.006) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 1, 'random_state': 5, 'splitter': 'random'}
0.536 (+/-0.036) for {'ccp_alpha': 0, 'criterion': 'gini', 'max_depth': 5, 'random_state': 0, 'splitter': 'best'}
0.541 (+/-0.009) for {'ccp_alpha': 0, 'criterion': 'gini

## Test

In [10]:

clf = DecisionTreeClassifier(random_state=5)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)


Normal Nearest Neighbor: 0.531
[[562 260 178]
 [207 525 268]
 [185 310 505]]


# Random Forest

## Training & Tuning

In [28]:
from sklearn.ensemble import RandomForestClassifier

tuned_parameters = [{
                     'n_estimators': [10, 100, 200],
                     'criterion': ['gini', 'entropy'], 
                     "random_state": [0, 1, 5], 
                     'max_depth': [1, 5, 10, 15, 20]
                    }]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 200, 'random_state': 1} 0.7350000000000001
0.526 (+/-0.013) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 0}
0.523 (+/-0.015) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 1}
0.497 (+/-0.034) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 10, 'random_state': 5}
0.555 (+/-0.023) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 0}
0.567 (+/-0.025) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 1}
0.560 (+/-0.033) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100, 'random_state': 5}
0.570 (+/-0.032) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 200, 'random_state': 0}
0.564 (+/-0.037) for {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 200, 'random_state': 1}
0.579 (+/-0.043) for {'criterion': 'gini', 'max_depth': 1, 'n_es

## Test

In [31]:
clf = RandomForestClassifier(criterion= 'entropy', max_depth= 20, n_estimators= 200, random_state= 1)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)

Normal Nearest Neighbor: 0.749
[[778 163  59]
 [129 719 152]
 [ 57 192 751]]


# Neural Network

## Training & Tuning

In [7]:
from sklearn.neural_network import MLPClassifier

tuned_parameters = [{
                     "random_state": [0, 1, 5],
                   # "hidden_layer_sizes": []
                    }]

clf = GridSearchCV(MLPClassifier(), tuned_parameters, cv=2)

clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_, clf.best_score_)

means = clf.cv_results_["mean_test_score"]
stds = clf.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, clf.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:
{'random_state': 1} 0.7243333333333333
0.722 (+/-0.019) for {'random_state': 0}
0.724 (+/-0.018) for {'random_state': 1}
0.724 (+/-0.019) for {'random_state': 5}


## Test

In [8]:
clf = MLPClassifier(random_state=1)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)

score = accuracy_score(y_test, pred)
print(f"Normal Nearest Neighbor: {score:.3f}")
cm = confusion_matrix(y_test, pred)
print(cm)

Normal Nearest Neighbor: 0.737
[[746 185  69]
 [136 691 173]
 [ 55 170 775]]
