In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd

1. Support vector machines using rbf-kernels perform very well on the MNIST dataset. By tuning your
parameters you should be able to get over 95% test accuracy. So, the first part of this exercise is to find C
and gamma to obtain that kind of scores. You may use a smaller part of MNIST for training and still obtain
good scores. Recall that the hyperparameters have to be found without laying your hands on the test set,
i.e. use either cross-validation, a validation set or some other technique to distinguish between different
models. Report in your code as comments, or in a separate document, the grid (or whatever technique for
hyperparameter search your are using) which was searched and the resulting best hyperparameters.


In [2]:
mnist = fetch_openml('mnist_784', version=1)

In [3]:
X, y = mnist.data.values, mnist.target.values
random = np.random.randint(1, 1001)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=60000, random_state=random)
X_train_sample_small, _, y_train_sample_small, _ = train_test_split(X_train, y_train, train_size=2000, random_state=random)
scaler = StandardScaler()
X_train_sample_small_scaled = scaler.fit_transform(X_train_sample_small)
# take a sample of 10000 points from X_train and y_train

In [5]:
clf = SVC()
param_grid = [{
    "kernel": ["rbf"],
    "C": [4.89 , 4.92, 5, 5.5, 6],
    "gamma": [0.001, 0.0008, 0.0009]
}]


grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train_sample_small_scaled, y_train_sample_small)

GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [4.89, 4.92, 5, 5.5, 6],
                          'gamma': [0.001, 0.0008, 0.0009],
                          'kernel': ['rbf']}],
             return_train_score=True, scoring='accuracy')

After running the grid search multiple times, I have narrowed down the best parameters for my model. All parameters give similar results, so I believe I will have to manually fine tune them to get the best results.

In [6]:
# printing all results
grid_search_results = grid_search.cv_results_
df = pd.DataFrame(grid_search_results)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.571102,0.097506,1.442512,0.16527,4.89,0.001,rbf,"{'C': 4.89, 'gamma': 0.001, 'kernel': 'rbf'}",0.89955,0.89955,0.927928,0.909009,0.013377,12,0.9985,0.996999,0.99925,0.99825,0.000936
1,0.5441,0.017175,1.435506,0.059097,4.89,0.0008,rbf,"{'C': 4.89, 'gamma': 0.0008, 'kernel': 'rbf'}",0.907046,0.89955,0.929429,0.912009,0.012693,3,0.996249,0.995499,0.998501,0.99675,0.001276
2,0.981664,0.030134,2.477553,0.101715,4.89,0.0009,rbf,"{'C': 4.89, 'gamma': 0.0009, 'kernel': 'rbf'}",0.902549,0.895052,0.929429,0.90901,0.014759,9,0.9985,0.995499,0.998501,0.9975,0.001415
3,0.847322,0.125826,1.852221,0.411925,4.92,0.001,rbf,"{'C': 4.92, 'gamma': 0.001, 'kernel': 'rbf'}",0.89955,0.89955,0.927928,0.909009,0.013377,12,0.9985,0.996999,0.99925,0.99825,0.000936
4,0.9935,0.582772,1.710542,0.4787,4.92,0.0008,rbf,"{'C': 4.92, 'gamma': 0.0008, 'kernel': 'rbf'}",0.907046,0.89955,0.929429,0.912009,0.012693,3,0.996249,0.995499,0.998501,0.99675,0.001276
5,0.620441,0.214917,1.589848,0.60468,4.92,0.0009,rbf,"{'C': 4.92, 'gamma': 0.0009, 'kernel': 'rbf'}",0.902549,0.895052,0.929429,0.90901,0.014759,9,0.9985,0.995499,0.998501,0.9975,0.001415
6,0.53002,0.043287,1.500848,0.102906,5.0,0.001,rbf,"{'C': 5, 'gamma': 0.001, 'kernel': 'rbf'}",0.89955,0.89955,0.926426,0.908509,0.01267,15,0.9985,0.997749,0.99925,0.9985,0.000613
7,0.697005,0.160517,1.532127,0.027471,5.0,0.0008,rbf,"{'C': 5, 'gamma': 0.0008, 'kernel': 'rbf'}",0.907046,0.89955,0.929429,0.912009,0.012693,3,0.996249,0.995499,0.998501,0.99675,0.001276
8,1.313558,0.611022,1.395386,0.117386,5.0,0.0009,rbf,"{'C': 5, 'gamma': 0.0009, 'kernel': 'rbf'}",0.902549,0.895052,0.929429,0.90901,0.014759,9,0.9985,0.995499,0.998501,0.9975,0.001415
9,0.565885,0.071132,1.657959,0.043765,5.5,0.001,rbf,"{'C': 5.5, 'gamma': 0.001, 'kernel': 'rbf'}",0.901049,0.901049,0.926426,0.909508,0.011963,7,0.9985,0.997749,1.0,0.99875,0.000936


In [5]:
random = np.random.randint(1, 1001)
X_train_sample_large, _, y_train_sample_large, _ = train_test_split(X_train, y_train, train_size=10000, random_state=random)
X_train_sample_large_scaled = scaler.fit_transform(X_train_sample_large)
clf_best = SVC(C=4.92, gamma=0.001, kernel='rbf')
clf_best.fit(X_train_sample_large_scaled, y_train_sample_large)
test_score = clf_best.score(scaler.transform(X_test), y_test)
print("Test Accuracy: {:.2f}%".format(test_score * 100))

Test Accuracy: 94.92%


- After some manual fine-tuning of the gamma and C regularization parameter, I was able to achieve a test accuracy of around 95% on the test set. It is sometimes higher, sometimes lower, but it is always around 95%.
- However, the score is accurate when I consider a sample of 10000 points instead of 2000, like I previously defined for grid search. If I use 2000 points, the accuracy is around 90%. I will use 10000 points for the rest of the exercise. The reason why I used 2000 points for grid search is because it is faster to run.
- The best parameters for my model seems to be C = 4.92 and gamma = 0.001

2. The second part of this exercise is to compare the built-in binarization scheme used for the SVC class,
namely one-vs-one, against the one-vs-all scheme, which was discussed in Lecture 5. You should implement
your own version of one-vs-all SVM and compare your results against the built-in version. To make the
comparison simple you should keep the same hyperparameters which you found in the first part of this
exercise. Which was the best classifier? If studying the confusion matrix was there any apparent difference
between the two methods in terms of misclassifications?

In [6]:
best_C = 4.92
best_gamma = 0.001

In [7]:
import Functions as f
my_one_vs_all = f.one_vs_all(X_train_sample_large_scaled, y_train_sample_large, scaler.transform(X_test), C=best_C, gamma=best_gamma)

In [30]:
from sklearn.metrics import accuracy_score, precision_score
my_one_vs_all_accuracy = []
my_one_vs_all_precision = []
for number in range(10):
    test_comparison = np.where(y_test == number, 1, 0)
    accuracy = accuracy_score(test_comparison, my_one_vs_all[number])
    my_one_vs_all_accuracy.append(accuracy)


In [44]:
my_avg_acc = np.mean(my_one_vs_all_accuracy)
precisions = []
for number in range (10):
    y_predicted = my_one_vs_all[number]
    y_true = (y_test == str(number)).astype(int)
    precision = precision_score(y_true, y_predicted)
    precisions.append(precision)
my_avg_precision = np.mean(precisions)
print(f"My implementation of one_vs_all, accuracy: {my_avg_acc}, precision: {round(my_avg_precision, 5)}")

My implementation of one_vs_all, accuracy: 0.90527, precision: 0.96865


In [58]:
clf_predictions = clf_best.predict(scaler.transform(X_test))

In [59]:
clf_acc = accuracy_score(y_test, clf_predictions)
clf_precision = precision_score(y_test, clf_predictions, average='micro')
print(f"Built in one_vs_one, accuracy:{clf_acc}, precision {clf_precision}")

Built in one_vs_one, accuracy:0.9492, precision 0.9492


- My implementation of one_vs_all results in a lower accuracy, but in a higher precision comparing to the built in one_vs_one.

- Now constructing confusion matrix for both classifiers. First for my implementation of one_vs_all.

In [57]:
from sklearn.metrics import confusion_matrix
sum_of_true_values = 0
for number in range(10):
    y_true = (y_test == str(number)).astype(int)
    y_pred = my_one_vs_all[number]
    c_matrix = confusion_matrix(y_true, y_pred)
    sum_of_true_values += c_matrix[1, 1]
    print(f"Number {number} confusion matrix:")
    print(c_matrix)
print(f"Number of correctly predicted values: {sum_of_true_values}")

Number 0 confusion matrix:
[[8942   16]
 [  41 1001]]
Number 1 confusion matrix:
[[8903   20]
 [  24 1053]]
Number 2 confusion matrix:
[[8989   37]
 [  88  886]]
Number 3 confusion matrix:
[[8961   26]
 [ 116  897]]
Number 4 confusion matrix:
[[8946   40]
 [  61  953]]
Number 5 confusion matrix:
[[9062   20]
 [ 101  817]]
Number 6 confusion matrix:
[[9010   27]
 [  57  906]]
Number 7 confusion matrix:
[[8899   25]
 [  87  989]]
Number 8 confusion matrix:
[[9055   31]
 [ 127  787]]
Number 9 confusion matrix:
[[8939   52]
 [ 119  890]]
Number of correctly predicted values: 9179


- Now for the built in one_vs_one

In [56]:
from sklearn.metrics import confusion_matrix
clf_cm = confusion_matrix(y_test, clf_predictions)
diagonal_sum = np.diagonal(clf_cm).sum()
print(clf_cm)
print(f"Number of correctly predicted values: {diagonal_sum}")

[[1020    0    2    0    0    6    8    3    3    0]
 [   0 1061    4    1    2    0    0    9    0    0]
 [   3    5  925   10    3    1    4   16    4    3]
 [   0    1   26  937    3   14    0   17   12    3]
 [   1    5   13    0  966    1    3   12    0   13]
 [   5    1    4   22    5  849   11   12    5    4]
 [   6    0    0    0    3    9  927   15    3    0]
 [   3    7   12    0    8    1    0 1032    0   13]
 [   6   12   10    9    2   22    3    8  838    4]
 [   9    3    6    4   24    1    1   19    5  937]]
Number of correctly predicted values: 9492


- The confusion matrices for one_vs_all and built in one_vs_one are both performing correctly in the terms of prediction. However, the built in one_vs_one is performing better, as it has a higher accuracy. There are more correctly predicted values in the built in one_vs_one. I am not entirely sure why, but I assume it is because of the way the built in one_vs_one is implemented. There could be some differences in the way the data is split into classes. Or the decision boundary created by the built in one_vs_one is better than the one created by my implementation of one_vs_all.
- One more thing that I notice is that my implementation of one_vs_all is performing better in the terms of precision. This is because the precision is calculated as the number of true positives divided by the number of true positives plus the number of false positives. The number of false positives is lower in my implementation of one_vs_all, because the number of false positives is calculated for each class separately, and then the average is taken. In the built in one_vs_one, the number of false positives is calculated for all classes together, and then the average is taken. This means that the number of false positives is higher in the built in one_vs_one, and therefore the precision is lower.