In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


In [2]:
# Load the MNIST CSV as a plain array: column 0 is taken as the label and all
# remaining columns as the feature vector of each digit.
mnist_data = pd.read_csv('mnist.csv').values
labels, digits = mnist_data[:, 0], mnist_data[:, 1:]

# Hold out 88% of the rows for testing. The fixed random_state keeps the split
# identical across runs of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    digits,
    labels,
    test_size=0.88,
    random_state=42,
)

<h3>Regularized multinomial logit model (using the LASSO penalty).</h3>

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

The logistic regression classifier by default applies L2 (Ridge) regularisation using the lbfgs solver. The liblinear and saga solvers support both L1 (Lasso) and L2 regularisation. Liblinear uses a one-vs-rest scheme, which is appropriate for digit classification, and yields an accuracy of 100% on the training set. The saga solver performed slightly worse, with an accuracy of 99% on the training set. However, when we apply 5-fold cross-validation, the saga solver outperforms the liblinear solver with a mean accuracy of 88% vs. 84%.

In a first round of hyperparameter tuning we used a cross-validated grid search, which applies 5-fold cross-validation to every combination of the chosen parameters. In this round we tested five values for C, [0.1,0.5,1,1.5,2], with the saga solver (25 fits in total). The best fit was found at C=0.5.

In a second round we added the solver as a hyperparameter with two options, liblinear and saga, and tested the C values [0.3,0.4,0.5,0.6,0.7] (50 fits in total). The best fit was found at C=0.3 with the saga solver, yielding a cross-validation accuracy of 88.6%.

In a third round we tested the C values [0.1,0.15,0.2,0.25,0.3,0.35]. Here C=0.2 yielded the best result, a cross-validation accuracy that rounds to 88.65%, marginally better than for C=0.3. We therefore stopped tuning and chose the parameters C=0.2 and solver=saga.

In [None]:
# Baseline L1-penalised logistic regression (saga solver, default C), fitted on
# the training split and then used to predict the held-out digits.
logitClf = LogisticRegression(penalty="l1", solver="saga", random_state=0)
logitClf.fit(x_train, y_train)
logitPred = logitClf.predict(x_test)


In [None]:
# Same L1-penalised baseline as logitClf, but with the liblinear solver, so the
# two solvers can be compared.
logitClfLib = LogisticRegression(
    solver="liblinear",
    penalty="l1",
    random_state=0,
).fit(X=x_train, y=y_train)

In [None]:
# Accuracy of the saga baseline on the data it was trained on.
logitClf.score(X=x_train, y=y_train)

In [None]:
# Accuracy of the liblinear baseline on the data it was trained on.
logitClfLib.score(X=x_train, y=y_train)

In [None]:
# Confusion matrix of the saga baseline on the test split, with one row/column
# per digit class 0-9.
digit_classes = list(range(10))
cm = confusion_matrix(y_test, logitPred, labels=digit_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=digit_classes)
disp.plot()
plt.title("Logistic Regression, saga solver with L1 regularisation, digit features")
plt.show()

<h3>K-Fold Cross Validation for Hyperparameter Tuning</h3>


In [3]:
# Shared 5-fold stratified splitter; shuffling with a fixed seed keeps the folds
# identical for every model evaluated with it.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17,
)

In [None]:

# Mean cross-validated score of the saga baseline over the skf folds of the
# training split.
val_scores = cross_val_score(logitClf, x_train, y_train, cv=skf)
val_scores.mean()

In [None]:
# Mean cross-validated score of the liblinear baseline over the same folds.
val_scores2 = cross_val_score(logitClfLib, x_train, y_train, cv=skf)
val_scores2.mean()

In [None]:

# Earlier tuning rounds, kept for reference:
#   test 1: C in [0.1,0.5,1,1.5,2] -> result C: 0.5 on the saga solver
#   test 2: params = {'C': [0.3,0.4,0.5,0.6,0.7], 'solver': ["saga", 'liblinear']}
# Current round: finer grid over C only, using logitClf's settings otherwise.
params = {'C': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35]}

# Grid search over `params`, scored with the shared stratified folds and run on
# all available cores.
best_logitComposition = GridSearchCV(
    logitClf,
    params,
    cv=skf,
    verbose=True,
    n_jobs=-1,
)

best_logitComposition.fit(x_train, y_train)

In [None]:
# Parameter setting that achieved the best mean cross-validated score in the
# logistic-regression grid search.
best_logitComposition.best_params_

In [None]:
# Mean cross-validated score obtained with the best parameter setting above.
best_logitComposition.best_score_

<h3>SVM Hyperparameter tuning</h3>
Initial test (pre tuning), accuracy on test-data 0.11, accuracy on training data 1. -> A lot of overfitting.

5 fold cross validation
Test 1: params = {'C':[0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,1,1.5,2]} best C=2, cv accuracy: 0.886

Test 2: {'C':[1.5,1.7,1.9,2,2.2,2.4,2.5,2.8,3.0]}        best {'C': 3} cv accuracy 0.955

Test 3: {'C':[3.0,3.5,4.0,4.5,5,5.5]}        best {'C': 3.0} cv accuracy 0.956

In [4]:
# Untuned support-vector classifier (default C), fitted on the training split;
# it also serves as the base estimator for the grid search below.
svmClf = SVC(gamma='scale').fit(X=x_train, y=y_train)

In [6]:
# Accuracy of the untuned SVC on the held-out test split.
svmClf.score(X=x_test, y=y_test)

0.9495941558441559

In [9]:
# Earlier grids, kept for reference:
#   params = {'C':[0.1,0.5,1,1.5,2]}
#   Test1: params = {'C':[0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,1,1.5,2]}
# Current grid: larger C values only.
params = {'C': [3.0, 3.5, 4.0, 4.5, 5, 5.5]}

# Grid search over `params`, scored with the shared stratified folds and run on
# all available cores.
best_svmComposition = GridSearchCV(
    svmClf,
    params,
    cv=skf,
    verbose=True,
    n_jobs=-1,
)

best_svmComposition.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [6]:
# Parameter setting that achieved the best mean cross-validated score in the
# SVC grid search.
best_svmComposition.best_params_

{'C': 3.0}

In [8]:
# Mean cross-validated score obtained with the best SVC parameter setting.
best_svmComposition.best_score_

0.9557539682539682

<h3>Model Comparison</h3>

In the previous step we determined the best values for the hyperparameter C using a grid search with 5-fold cross-validation. For both the LogisticRegression classifier and the support vector classifier, C is the inverse regularisation strength: smaller values specify stronger regularisation. For the LogisticRegression classifier we additionally determined the best solver.

We continue with the following parameters:
LogisticRegression: C=0.2, solver=saga 
SVC: C=3

In [None]:
# Final logistic-regression model with the tuned hyperparameters
# (C=0.2, L1 penalty, saga solver), fitted on the training split.
finalLogitClf = LogisticRegression(random_state=0, C=0.2, penalty="l1", solver="saga").fit(x_train,y_train)
# Predict with the tuned model itself. The previous code called the untuned
# baseline `logitClf` here, so the "C=0.2" confusion matrix actually showed the
# baseline's predictions and did not match finalLogitClf.score(...).
finalLogitPred = finalLogitClf.predict(x_test)

In [None]:
# Accuracy of the tuned logistic-regression model on the held-out test split.
finalLogitClf.score(X=x_test, y=y_test)

In [None]:
# Confusion matrix built from finalLogitPred on the test split, with one
# row/column per digit class 0-9.
digit_classes = list(range(10))
cm = confusion_matrix(y_test, finalLogitPred, labels=digit_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=digit_classes)
disp.plot()
plt.title("Logistic Regression, saga solver with L1 regularisation, C=0.2")
plt.show()

In [None]:
# Final support-vector classifier with the tuned regularisation parameter.
# The recorded SVC grid search in this notebook reports best_params_ ==
# {'C': 3.0}; the previous code left C at its default, so the "final" model was
# identical to the untuned svmClf.
finalSvmClf = SVC(gamma='scale', C=3.0).fit(x_train,y_train)
# Test-split predictions of the tuned model, used for the confusion matrix.
finalSvmPred = finalSvmClf.predict(x_test)

In [None]:
# Accuracy of the final SVC on the held-out test split.
finalSvmClf.score(X=x_test, y=y_test)

In [None]:
# Confusion matrix of the final SVC on the test split, with one row/column per
# digit class 0-9.
cm = confusion_matrix(y_test, finalSvmPred, labels=[0,1,2,3,4,5,6,7,8,9])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4,5,6,7,8,9])
disp.plot()
# Take C from the fitted estimator instead of hard-coding it: the previous
# title said "C=0.05", a value finalSvmClf was never trained with.
plt.title(f"SVC, C={finalSvmClf.C}")
plt.show()