In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
# Global seaborn look for every figure below: 'notebook' context, 'darkgrid' style.
sns.set_context('notebook')
sns.set_style('darkgrid')

---
# Classifying handwritten digits

![](https://scikit-learn.org/stable/_images/sphx_glr_plot_digits_classification_001.png)

<br>

<big>

## Load the data

---

- There are 1797 images of handwritten digits of 8x8 pixels each    
- The dataset has 1797 rows and 64 columns
    - Each row is an image, each column is a pixel

<br>

In [None]:
from sklearn import datasets
# Load the bundled digits dataset, then print its available keys followed by
# the shapes of the image stack and of the flattened feature matrix.
digits = datasets.load_digits()

for summary in (digits.keys(), digits.images.shape, digits.data.shape):
    print(summary)

<br>

<big>

---
## Inspect the data

- An 8x8 pixel image can be displayed using matplotlib's `imshow`

<br>

In [None]:
# Render one sample (index 120) from the image stack; the trailing ';' hides the text repr.
plt.imshow(digits.images[120]);

In [None]:
# Draw the first 100 digits on a 10x10 grid, each annotated with its target label.
fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

# zip stops at the shortest input, i.e. after the 100 axes are used up.
for ax, image, label in zip(axes.flat, digits.images, digits.target):
    ax.imshow(image, cmap='binary')
    # Label goes near the lower-left corner, positioned in axes coordinates.
    ax.text(0.05, 0.05, str(label), transform=ax.transAxes, color='green')
    ax.set(xticks=[], yticks=[])

In [None]:
# One column per pixel, named Pixel_0 ... Pixel_63.
pixel_names = [f"Pixel_{idx}" for idx in range(64)]
digits_df = pd.DataFrame(digits.data, columns=pixel_names)

In [None]:
# Preview the first five rows of the pixel table.
digits_df.head()

In [None]:
# Heatmap of the pairwise correlations between the pixel columns.
sns.heatmap(digits_df.corr());

<br>

<big>

## Let's build a model

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# The dependent variable: list the distinct class labels present in the target.

import numpy as np
np.unique(digits.target)

<br>

<big>
    
## This is a Multiclass Classification Problem since the Dependent Variable has 10 classes

<br>

## Step 1: Visualize the Classification Boundary using PCA
---

<br>

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every pixel column before running PCA.
scaler = StandardScaler()
X_digits = scaler.fit_transform(digits.data)

In [None]:
from sklearn.decomposition import PCA
# Project the standardized digits onto their first two principal components.
pca_2 = PCA(n_components=2)
data_2comp = pca_2.fit_transform(X_digits)

In [None]:
# Sum of the explained-variance ratios of the two components, printed as a percentage.
evr = pca_2.explained_variance_ratio_.sum()
print(f"The Explained Variance is: {evr:.1%}")

In [None]:
# Scatter the two principal components, coloured by the true digit label.
pca_2_df = pd.DataFrame(data_2comp, columns=['PC1', 'PC2'])
pca_2_df.plot.scatter(
    x='PC1',
    y='PC2',
    c=digits.target,
    cmap='Spectral',
    xticks=[],
    yticks=[],
);

In [None]:
# Fit PCA once with all components. The cumulative sum of the per-component
# variance ratios is the variance explained by the first n components, so there
# is no need to refit a separate (and possibly randomized) PCA for every n.
evr_cumulative = pd.Series(PCA().fit(X_digits).explained_variance_ratio_).cumsum()
n_components_grid = list(range(2, 60, 2))
# Position n - 1 of the cumulative series holds the total for the first n components.
explained = pd.Series(evr_cumulative.iloc[[n - 1 for n in n_components_grid]].to_numpy(),
                      index=n_components_grid, name='Explained Variance')
evr_ax = explained.plot()
evr_ax.set(xlabel='Number of components', ylabel='Cumulative explained variance ratio');

In [None]:
# Keep the 25-component solution.
# NOTE(review): the original note says this explains 85% of the variance —
# confirm against the explained-variance curve plotted above.

X_pca = PCA(n_components=25).fit_transform(X_digits)

In [None]:
# Preview only the first rows of the reduced data instead of displaying the whole frame.
pd.DataFrame(X_pca).head()

---

## Dimensionality Reduction with `Isomap`

- We'd like to visualize our points within the **64-dimensional parameter space**, but it's difficult to plot points in 64 dimensions!

- Instead we'll reduce the dimensions to 2, using an unsupervised method.

- Here, we'll make use of a **manifold learning algorithm** called *Isomap*, and transform the data to two dimensions.

- This is a critical step because it guides the selection of a few candidate algorithms capable of discovering the classification boundary for this data.

In [None]:
from sklearn.manifold import Isomap, TSNE

In [None]:
# Embed the raw (unscaled) pixel data into 2 dimensions with Isomap, using 10 neighbours.
iso = Isomap(n_components=2, n_neighbors=10)
data_projected = iso.fit_transform(digits.data)
data_projected.shape

In [None]:
# Visualize the Isomap embedding in 2 dimensions, coloured by the true digit.
# The title tells this figure apart from the other 2-D projection plots in the notebook.
isomap_df = pd.DataFrame(data_projected, columns=['x1', 'x2'])
isomap_df.plot.scatter(x='x1', y='x2',
                       c=digits.target,
                       cmap='RdYlBu',
                       title='Isomap projection of the digits data');

In [None]:
# t-SNE is stochastic; fixing random_state makes the embedding reproducible.
# Note that this overwrites `data_projected`, which previously held the Isomap output.
tsne = TSNE(n_components=2, random_state=2)
data_projected = tsne.fit_transform(digits.data)
data_projected.shape

In [None]:
# Visualize the t-SNE embedding in 2 dimensions, coloured by the true digit.
# The title tells this figure apart from the other 2-D projection plots in the notebook.
tsne_df = pd.DataFrame(data_projected, columns=['x1', 'x2'])
tsne_df.plot.scatter(x='x1', y='x2',
                     c=digits.target,
                     cmap='RdYlBu',
                     title='t-SNE projection of the digits data');

<br>

<big>

We see here that the digits are fairly well-separated in the parameter space; this tells us that a supervised classification algorithm should perform fairly well. 
Let's give it a try.

---

In [None]:
# Data Splitting
# Hold out part of the data for evaluation. No train/test size is passed, so
# train_test_split's default proportions apply; random_state=2 fixes the shuffle.
# NOTE(review): Xtrain/Xtest/Ytrain/Ytest are reassigned by a second
# train_test_split call in the Logistic Regression cell further down.
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(digits.data, 
                                                digits.target, 
                                                random_state=2)

In [None]:
# Share of all samples that ended up in the training split, computed from the
# actual dataset size rather than a hard-coded row count.
round(Xtrain.shape[0] / digits.data.shape[0], 2)

---
### `Logistic Regression` with Default Hyperparameters

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Re-split with an explicit 80/20 ratio. random_state pins the shuffle so the
# accuracies reported below are reproducible across Restart & Run All.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(digits.data, digits.target,
                                                train_size=0.8, random_state=2)

# max_iter=300 sets the solver's iteration budget.
clf = LogisticRegression(max_iter=300)
clf.fit(Xtrain, Ytrain)

In [None]:
# Training Accuracy: share of training samples that the fitted classifier predicts correctly.
round(accuracy_score(Ytrain, clf.predict(Xtrain)), 3)

In [None]:
# Test Accuracy: share of held-out test samples that the fitted classifier predicts correctly.
round(accuracy_score(Ytest, clf.predict(Xtest)), 3)

In [None]:
# 10-fold cross-validated accuracy, computed on the TRAINING split only so the
# held-out test set stays untouched for the final evaluation. cross_val_score
# fits clones of `clf` per fold, so the already-fitted `clf` is not modified.
pd.Series(cross_val_score(clf, Xtrain, Ytrain, scoring='accuracy', cv=10)).describe()

In [None]:
# Confusion matrix on the test split, labelled Actual_* on the rows and Predicted_* on the columns.
row_labels = [f"Actual_{x}" for x in range(10)]
col_labels = [f"Predicted_{x}" for x in range(10)]
cm = confusion_matrix(Ytest, clf.predict(Xtest))
pd.DataFrame(data=cm, index=row_labels, columns=col_labels)

> Binary Classification Problem: Performance Metrics on the Confusion Matrix

- http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/
- https://en.wikipedia.org/wiki/Confusion_matrix

In [None]:
# Text report of per-class classification metrics for `clf` on the test split.
print(classification_report(Ytest, clf.predict(Xtest)))

---
## Task 2

Re-run the digits classification using an SVM and report the accuracy for different settings of the `kernel` parameter:
`'linear'`, `'rbf'`, `'poly'`

---

In [None]:
from sklearn.svm import SVC

# Compare SVC kernels by 10-fold cross-validated accuracy on the TRAINING split.
# The test split is deliberately left out of model comparison so it can still
# serve as an unbiased final check.
for K in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = SVC(kernel = K)
    # cross_val_score fits clones of `clf`, one per fold, on the training data.
    cv_accuracy = cross_val_score(clf, Xtrain, Ytrain, cv=10, scoring='accuracy').mean()
    # Fit on the full training split so `clf` is left as a fitted model, as before.
    clf.fit(Xtrain, Ytrain)
    print('For Kernel: ', K, ' the accuracy is: ', cv_accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Test-set accuracy of k-nearest-neighbours for k = 5, 10, 15, 20, 25.
for n in range(5, 26, 5):
    # fit() returns the estimator itself, so construction and fitting are chained.
    knn_obj = KNeighborsClassifier(n_neighbors=n).fit(Xtrain, Ytrain)
    ypred = knn_obj.predict(Xtest)
    test_accuracy = accuracy_score(Ytest, ypred)
    print('For {} neighbors, the accuracy is: {}'.format(n, test_accuracy))

---
## Using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Hyperparameter grid for the SVC search. `gamma` has no effect on the 'linear'
# kernel, so the grid is split per kernel; crossing gamma with 'linear' would
# only fit duplicate linear candidates.
par_grid = [
    {'kernel': ['poly'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# 5-fold cross-validated search scored on accuracy; verbose output is enabled.
gscv_svc = GridSearchCV(estimator=SVC(),
                        param_grid=par_grid,
                        cv=5,
                        scoring='accuracy',
                        verbose=True)

gscv_svc.fit(Xtrain, Ytrain)

In [None]:
# Tabulate the search results indexed by parameter combination, keeping the
# columns from 'mean_test_score' onwards.
pd.DataFrame(gscv_svc.cv_results_).set_index('params').loc[:, 'mean_test_score':]

In [None]:
# Parameter combination selected by the grid search.
gscv_svc.best_params_

In [None]:
# Cross-validated score of the selected parameter combination.
gscv_svc.best_score_

In [None]:
# Estimator selected by the grid search.
gscv_svc.best_estimator_

In [None]:
# Text report of per-class classification metrics for the best estimator on the test split.
print(classification_report(Ytest, gscv_svc.best_estimator_.predict(Xtest)))

In [None]:
# Training accuracy of the best estimator found by the grid search.
accuracy_score(Ytrain, gscv_svc.best_estimator_.predict(Xtrain))

# An underfitted model would have a LOW TRAINING ACCURACY.
# A high value here therefore indicates that the best model found by GSCV does not underfit.

In [None]:
# Test accuracy of the best estimator found by the grid search.
accuracy_score(Ytest, gscv_svc.best_estimator_.predict(Xtest))

# An overfitted model would have a much LOWER TEST ACCURACY than TRAIN ACCURACY.
# If both train and test accuracy are high, the model is not overfitting.