# Adaptive Intelligence COM3240 Assignment A

## Principal Component Analysis on the reduced MNIST database

### Context

A reduced data set derived from the [MNIST database](http://yann.lecun.com/exdb/mnist/) is provided in CSV format. It is accompanied by a set of labels for the data, purely for guidance purposes.

In [None]:
# Imports & Setup

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline
# Report the library versions in use, for reproducibility
print("Numpy Version:", np.__version__)
print("MatPlotLib Version:", matplotlib.__version__)

In [None]:
# Load the training images and their labels from the CSV files
train = np.genfromtxt('digits/train.csv', delimiter=",")
train_labels = np.genfromtxt('digits/trainlabels.csv', delimiter=",", dtype=int)

image_shape = (28, 28) # every image is 28x28 pixels

# Rows of `train` are pixel values (features); columns are individual training images
feature_vector_size, training_sample_size = train.shape
print("feature_vector_size = %d pixel values" % feature_vector_size)
print("training_sample_size = %d images" % training_sample_size)

# Sanity check that the data was read correctly: take the third image column,
# fold it back into the image shape (column-major order) and display it
image = train[:, 2].reshape(image_shape, order='F')
plt.imshow(image, cmap='inferno');

In [None]:
###
### Get the eigenvectors in order of decreasing eigenvalue
###
# `train` stores one image per column, so every row is one feature (pixel).
mean = np.mean(train, axis=1) # Take the mean of each feature for the training set
centered_data = train - mean[:, None] # Indexing is required to ensure shape is (x,1) and not (x,)
cov_m = np.cov(centered_data, rowvar=True) # Covariance matrix of the features (each row is a variable)
print("Covariance Matrix Shape:", cov_m.shape) # Checkup, correct shape?

# A covariance matrix is real and symmetric, so use the symmetric solver `eigh`:
# it returns real eigenvalues and orthonormal eigenvectors. The general `eig`
# solver may return spurious complex values for such a matrix, and discarding
# their imaginary part afterwards silently distorts the result.
eig_vals_unsorted, eig_vecs_unsorted = np.linalg.eigh(cov_m)

sorted_indices = np.argsort(eig_vals_unsorted)[::-1] # Find the indices in decreasing order of eigenvalue

eig_vals = eig_vals_unsorted[sorted_indices] # Eigenvalues, largest first
# NumPy returns the eigenvectors as the COLUMNS of the matrix (column i belongs
# to eigenvalue i), so the columns must be reordered. Indexing the rows instead
# would shuffle the components inside every eigenvector.
eig_vecs = eig_vecs_unsorted[:, sorted_indices]

print("\nFirst few eigenvalues in order:") # Check they are sorted correctly.
for i in range(min(8, len(eig_vals))): # Guard against data with fewer than 8 features
    print("[", i, "]", eig_vals[i])

# Project the centered data onto the principal axes:
# row k of new_data holds every sample's coordinate along principal component k+1
new_data = np.dot(eig_vecs.T, centered_data) # Data in new transformed space

In [None]:
###
### Plot the results comparing principal components
###

# Setup a dictionary of colors! Maps each digit label to a '#RRGGBBAA' hex color.
colors = {0: '#70bf4fd0', 1: '#c750bbd0', 2: '#6286aed0', 3: '#74bb9cd0', 4: '#634fb1d0', 5: '#c7593cd0', 6: '#c6aa4fd0', 7: '#963b5bd0', 8: '#5c5d36d0', 9: '#cf96b6d0'}

# Define combinations of principal components to display (1-based component numbers)
plot_combos = [[1,2,3], [2,3,4], [1,3,4], [2,3,5], [3,4,5]]

# Lets only compare a few numbers
number_whitelist = [0, 1, 6, 7, 4]
# number_whitelist = np.arange(10)

fig = plt.figure(figsize=(12,7))
cols = 3
# Calculate req'd number of rows. The "+ 1" always leaves at least one grid slot
# free; with the current 2x3 layout the legend below is anchored over that slot.
rows = (len(plot_combos) // cols) + 1

# Create set of labels for legend
patches = [matplotlib.patches.Patch(color=color, label=num) for num, color in colors.items()]

# The sample columns and the cluster center of each digit do not depend on which
# components are being plotted, so compute them once instead of once per subplot.
digit_columns = {}
digit_centers = {}
for number in number_whitelist:
    columns = np.where(train_labels == number)[0]
    if columns.size == 0:
        # No samples carry this label: nothing to plot and no center to annotate
        continue
    digit_columns[number] = columns
    # Named `digit_centers` rather than `mean` so the per-feature training mean
    # held in the notebook-level variable `mean` is not overwritten by this cell.
    digit_centers[number] = new_data[:, columns].mean(axis=1)

for i, (first, second, third) in enumerate(plot_combos):
    # For each combination of components...
    ax = fig.add_subplot(rows, cols, i + 1, projection='3d')

    # Setup axis
    ax.grid(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])
    ax.set_xlabel('PC' + str(first))
    ax.set_ylabel('PC' + str(second))
    ax.set_zlabel('PC' + str(third))
    ax.set_title("PCs: %d, %d, %d" %(first,second,third,))

    # Component numbers are 1-based, rows of new_data are 0-based
    pc_x, pc_y, pc_z = first - 1, second - 1, third - 1

    for number in number_whitelist:
        if number not in digit_columns:
            continue
        columns = digit_columns[number]

        # Plot cluster of number
        ax.scatter(new_data[pc_x, columns], new_data[pc_y, columns], new_data[pc_z, columns], c=colors[number], depthshade=True, alpha=0.6, s=5)

    # Annotate the cluster centers only after every cluster has been drawn
    # (presumably so ax.get_proj() reflects the final state of the axes).
    for number in number_whitelist:
        if number not in digit_centers:
            continue
        center = digit_centers[number]
        x, y, _ = proj3d.proj_transform(center[pc_x], center[pc_y], center[pc_z], ax.get_proj()) # Transform 3d point to 2d screen space
        annotation = ax.annotate(number,
                 [x, y],
                 horizontalalignment='center',
                 verticalalignment='center',
                 size=10, weight='black',
                 color='#444444ee',
                 backgroundcolor='#ffffffee')
        annotation.set_zorder(10) # Keep the label above the scattered points

plt.legend(title='Legend', handles=patches,bbox_to_anchor=(0.9, 0.38),bbox_transform=plt.gcf().transFigure, ncol=2)

plt.tight_layout()
plt.show()