PROBLEM 4 : PCA for cluster visualization

In [4]:
import numpy as np
import keras
import random
from keras.datasets import mnist
from sklearn.model_selection import train_test_split

(train_images, train_labels), (test_images, test_labels) = keras.datasets.mnist.load_data()

#reshaping images
train_images = np.reshape(train_images, (-1, 784))
test_images = np.reshape(test_images, (-1, 784))

# normalize
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

random_sample_indices = random.sample(range(train_images.shape[0]), 20000)
train_images_25 = train_images[random_sample_indices]
train_labels_25 = train_labels[random_sample_indices]


train_images_final_80, validation_images_final_10, train_labels_final_80,validation_labels_final_10 = train_test_split(train_images_25, train_labels_25, test_size=0.1, random_state=42)

print("Final train dataset size: ", train_images_final_80.shape)
print("Final validation dataset size: ", validation_images_final_10.shape)

Final train dataset size:  (18000, 784)
Final validation dataset size:  (2000, 784)


A) Run KMeans on MNIST data (or a sample of it)

In [13]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(train_images_final_80)

labels = kmeans.labels_
centers = kmeans.cluster_centers_

score = accuracy_score(train_labels_final_80, labels)

print('Accuracy Score: %.3f' % score)





Accuracy Score: 0.122


B) Run PCA on same data\
C) Plot data in 3D with PCA representation with t=3 top eigen values; use shapes to to indicate truth digit label (circle, triangle, "+", stars, etc) and colors to indicate cluster ID (red blue green etc).

In [14]:

from sklearn.decomposition import PCA
import plotly.graph_objects as go

pca = PCA(n_components=3)
projection_train = pca.fit_transform(train_images_final_80)
print("evalue: ", pca.explained_variance_)

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(projection_train)

labels = kmeans.labels_
centers = kmeans.cluster_centers_

score = accuracy_score(train_labels_final_80, labels)

print('Accuracy Score: %.3f' % score)

cluster_points = dict()
for i in range(len(labels)):
    label = labels[i]
    point = projection_train[i]

    if label in cluster_points:
        cluster_points[label].append(point)
    else:
        cluster_points[label] = []

cluster_0 = np.array(cluster_points[0])
cluster_1 = np.array(cluster_points[1])
cluster_2 = np.array(cluster_points[2])
cluster_3 = np.array(cluster_points[3])
cluster_4 = np.array(cluster_points[4])
cluster_5 = np.array(cluster_points[5])
cluster_6 = np.array(cluster_points[6])
cluster_7 = np.array(cluster_points[7])
cluster_8 = np.array(cluster_points[8])
cluster_9 = np.array(cluster_points[9])

x0 = cluster_0[:, 0]
y0 = cluster_0[:, 1]
z0 = cluster_0[:, 2]

x1 = cluster_1[:, 0]
y1 = cluster_1[:, 1]
z1 = cluster_1[:, 2]

x2 = cluster_2[:, 0]
y2 = cluster_2[:, 1]
z2 = cluster_2[:, 2]

x3 = cluster_3[:, 0]
y3 = cluster_3[:, 1]
z3 = cluster_3[:, 2]

x4 = cluster_4[:, 0]
y4 = cluster_4[:, 1]
z4 = cluster_4[:, 2]

x5 = cluster_5[:, 0]
y5 = cluster_5[:, 1]
z5 = cluster_5[:, 2]

x6 = cluster_6[:, 0]
y6 = cluster_6[:, 1]
z6 = cluster_6[:, 2]

x7 = cluster_7[:, 0]
y7 = cluster_7[:, 1]
z7 = cluster_7[:, 2]

x8 = cluster_8[:, 0]
y8 = cluster_8[:, 1]
z8 = cluster_8[:, 2]

x9 = cluster_9[:, 0]
y9 = cluster_9[:, 1]
z9 = cluster_9[:, 2]


shapes = ['circle', 'circle-open', 'cross', 'diamond','diamond-open', 'square', 'square-open', 'x']

Scene = dict(xaxis = dict(title  = 'PC1'),yaxis = dict(title  = 'PC2'),zaxis = dict(title  = 'PC3'))

trace0 = go.Scatter3d(x=x0, y=y0, z=z0, mode='markers',marker=dict(color = "red", symbol=shapes[0], size= 5, line=dict(color= 'black',width = 10)))
trace1 = go.Scatter3d(x=x1, y=y1, z=z1, mode='markers',marker=dict(color = "blue", symbol=shapes[1], size= 5, line=dict(color= 'black',width = 10)))
trace2 = go.Scatter3d(x=x2, y=y2, z=z2, mode='markers',marker=dict(color = "green", symbol=shapes[2], size= 5, line=dict(color= 'black',width = 10)))
trace3 = go.Scatter3d(x=x3, y=y3, z=z3, mode='markers',marker=dict(color = "orange", symbol=shapes[3], size= 5, line=dict(color= 'black',width = 10)))
trace4 = go.Scatter3d(x=x4, y=y4, z=z4, mode='markers',marker=dict(color = "yellow", symbol=shapes[4], size= 5, line=dict(color= 'black',width = 10)))
trace5 = go.Scatter3d(x=x5, y=y5, z=z5, mode='markers',marker=dict(color = "purple", symbol=shapes[5], size= 5, line=dict(color= 'black',width = 10)))
trace6 = go.Scatter3d(x=x6, y=y6, z=z6, mode='markers',marker=dict(color = "pink", symbol=shapes[6], size= 5, line=dict(color= 'black',width = 10)))
trace7 = go.Scatter3d(x=x7, y=y7, z=z7, mode='markers',marker=dict(color = "cyan", symbol=shapes[7], size= 5, line=dict(color= 'black',width = 10)))
trace8 = go.Scatter3d(x=x8, y=y8, z=z8, mode='markers',marker=dict(color = "gray", symbol=shapes[0], size= 5, line=dict(color= 'black',width = 10)))
trace9 = go.Scatter3d(x=x9, y=y9, z=z9, mode='markers',marker=dict(color = "lightgreen", symbol=shapes[1], size= 5, line=dict(color= 'black',width = 10)))

layout = go.Layout(margin=dict(l=0,r=0,t=0,b=0),scene = Scene,height = 500,width = 600)
data = [trace0, trace1, trace2, trace3, trace4, trace5, trace6, trace7, trace8, trace9]
fig = go.Figure(data = data, layout = layout)
fig.show()

evalue:  [5.143525  3.795842  3.2821958]






Accuracy Score: 0.044


D) Select other 3 at random eigen values from top 20; redo the plot several times.

In [17]:
pca_20 = PCA(n_components=20)
projection_train = pca_20.fit_transform(train_images_final_80)

rand_eigenvalues_indices = np.random.choice(np.arange(20), 3, replace=False)

rand_3_eigenvalues = pca_20.explained_variance_[rand_eigenvalues_indices]
rand_3_eigenvectors = pca_20.components_[rand_eigenvalues_indices]

pca_20 = PCA(n_components=3)
projection_train_rand3_eigenvalues = pca_20.fit(train_images_final_80)
pca_20.explained_variance_ = rand_3_eigenvalues
pca_20.components_ = rand_3_eigenvectors
projection_train_rand3_eigenvalues = pca_20.transform(train_images_final_80)




In [18]:
x = projection_train_rand3_eigenvalues[:, 0]
y = projection_train_rand3_eigenvalues[:, 1]
z = projection_train_rand3_eigenvalues[:, 2]


shapes = ['circle', 'circle-open', 'cross', 'diamond','diamond-open', 'square', 'square-open', 'x']

Scene = dict(xaxis = dict(title  = 'PC1'),yaxis = dict(title  = 'PC2'),zaxis = dict(title  = 'PC3'))

trace = go.Scatter3d(x=x, y=y, z=z, mode='markers',marker=dict(color = labels, size= 5, line=dict(color= 'black',width = 10)))
layout = go.Layout(margin=dict(l=0,r=0,t=0,b=0),scene = Scene,height = 500,width = 600)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()