# Digits Image Classification

Hello GDSC members, and welcome to our first mini-project: digit classification with KNN.

In [None]:
import numpy as np

import matplotlib.pyplot as plt
import matplotlib
from sklearn import datasets

In [None]:
# Load the handwritten-digits dataset and pull out the feature matrix
# (one flattened image per row) and the matching label vector.
digits = datasets.load_digits()
X, y = digits.data, digits.target


# Write your code here to print the shapes of X and y and print unique values of y
##################################################################################


##################################################################################

What's the number of images in our dataset?

In [None]:
# Check for NaN values in our data. If there are any, replace them with zeros
#################################################################################


#################################################################################

Are our classes balanced?

In [None]:
# Each row of X is one image, so the row count is the dataset size
n_images = len(X)

# Check if our classes are balanced by checking the distribution of our labels
##############################################################################


##############################################################################

Let's visualize a small sample of our dataset.

In [None]:
# Define the variable indexes: an ndarray with shape (9,) containing random integers from 0 (inclusive) to n_images (exclusive)
##################################################################################################



##################################################################################################

# The rows of X are flattened images, so restore the nine sampled rows to 8 x 8 grids
sample_images = X[indexes].reshape((9, 8, 8))
sample_labels = y[indexes]

# Draw the samples on a 3 x 3 grid, one image per panel, titled with its label
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for ax, image, label in zip(axes.flat, sample_images, sample_labels):
    ax.set_title(f'Label = {label}')
    ax.imshow(image, cmap='gray')
plt.show()
    

Data visualization is hard when we have 64 features. If only there were a way to map them to only two features!

### PCA (Principal Components Analysis): unsupervised learning technique for dimensionality reduction

Principal component analysis is a popular technique for analyzing large datasets containing a high number of dimensions/features per observation, increasing the interpretability of data while preserving the maximum amount of information, and enabling the visualization of multidimensional data.

Definition source: https://en.wikipedia.org/wiki/Principal_component_analysis

![image.png](attachment:image.png)

Image source: https://towardsdatascience.com/the-mathematics-behind-principal-component-analysis-fff2d7f4b643

In [None]:
# Follow the first 5 steps of PCA to obtain the matrix W
####################################################################

####################################################################

In [None]:
# List the names of every registered colormap.
# `matplotlib.cm.cmap_d` was removed in Matplotlib 3.6; `plt.colormaps()` returns
# the same names and works on both older and current releases.
base_cmap_list = plt.colormaps()
print(base_cmap_list)

In [None]:
# Thank you jakevdp! https://gist.github.com/jakevdp/91077b0cae40f8f8244a

def discrete_cmap(n, base_cmap=None):
    """Create an n-bin discrete colormap from the specified input map.

    Parameters
    ----------
    n : int
        Number of discrete colour bins in the resulting colormap.
    base_cmap : str, Colormap or None, optional
        Colormap to sample from. A string is looked up by name, a colormap
        instance is used as-is, and None selects Matplotlib's default colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        A colormap named ``<base name><n>`` built from n colours sampled at
        evenly spaced positions across the base colormap.
    """
    # plt.get_cmap accepts a string, None, or a colormap instance.
    # (matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.)
    base = plt.get_cmap(base_cmap)

    # Sample n evenly spaced colours spanning the full range of the base map
    color_list = base(np.linspace(0, 1, n))
    cmap_name = base.name + str(n)
    return matplotlib.colors.LinearSegmentedColormap.from_list(cmap_name, color_list, n)


# Any name from base_cmap_list can be used here
base_cmap = 'tab10_r'

# Use the matrix W to transform the samples onto the new subspace
X_transformed = np.dot(X, W)

# Scatter the first two transformed features, coloured by digit label
fig, ax = plt.subplots(figsize=(9, 6))
ax.set_title('2D Vizulization of the Digits dataset using PCA')
scatter_plot = ax.scatter(
    X_transformed[:, 0],
    X_transformed[:, 1],
    c=y,
    cmap=discrete_cmap(10, base_cmap),
)
fig.colorbar(scatter_plot, ax=ax, ticks=range(10))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)


# How many images are there in our training set and in our testing set?
#################################################################################

#################################################################################

In [None]:
# Check if our training set and testing set have similar distributions
# If they are not similar, try changing the random_state parameter in the train_test_split function
#################################################################################



#################################################################################


In [None]:
# We can use sklearn's KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbours classifier and train it in one chained call
# (fit() returns the fitted estimator itself).
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Evaluate on the held-out test split and report the score as a percentage
accuracy = model.score(X_test, y_test)
print(f'Our model\'s accuracy is {accuracy*100:.2f}%')