In [11]:
from keras.datasets import mnist
import numpy as np 
import matplotlib.pyplot as plt
# Read the MNIST archive: a (train, test) pair of (images, labels) tuples.
(x, y), (x_test, y_test) = mnist.load_data(path='mnist.npz')

# Count of distinct label values in the training labels.
n_clusters = np.unique(y).size

# Collapse the two image axes into one so each sample is a single row,
# then scale every pixel value by 1/255.
x = x.reshape(x.shape[0], x.shape[1] * x.shape[2]) / 255
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1] * x_test.shape[2]) / 255

In [13]:
# convert each image to 1 dimensional array (no-op when x is already 2-D)
x = x.reshape(len(x), -1).astype(float)

# normalize the data to 0 - 1
# The loading cell already divides x by 255. Dividing unconditionally a second
# time would squash the values into [0, 1/255] and make this cell change x on
# every re-run, so only rescale while the data is still in the raw 0-255 range.
if x.max() > 1.0:
    x = x / 255.

print(x.shape)
print(x[0].shape)

(60000, 784)
(784,)


In [14]:
from sklearn.cluster import MiniBatchKMeans

# One cluster per distinct label in the training labels (the split the model
# is fitted on), rather than peeking at the test labels.
n_digits = len(np.unique(y))
print(n_digits)

# Initialize KMeans model
# MiniBatchKMeans draws random initial centroids and random mini-batches; pin
# the seed so the clustering (and every score derived from it) is reproducible
# after Restart Kernel -> Run All.
kmeans = MiniBatchKMeans(n_clusters = n_digits, random_state = 42)

# Fit the model to the training data
kmeans.fit(x)


10




In [15]:
def infer_cluster_labels(kmeans, actual_labels):
    """
    Associates most probable label with each cluster in KMeans model.

    kmeans: fitted clustering model exposing ``n_clusters`` and ``labels_``
        (the cluster index assigned to each fitted sample).
    actual_labels: array-like of non-negative integer labels, aligned
        one-to-one with ``kmeans.labels_``.
    returns: dictionary {label: [cluster indices]} listing, for each label,
        the clusters in which that label is the most frequent one. Clusters
        without any member are omitted because they have no majority label.
    """

    # Flatten so column vectors of labels are accepted as well.
    actual_labels = np.asarray(actual_labels).ravel()
    assignments = np.asarray(kmeans.labels_)

    inferred_labels = {}

    for i in range(kmeans.n_clusters):

        # actual labels of the points that fell into cluster i
        members = actual_labels[assignments == i]

        # An empty cluster yields an empty histogram, and np.argmax on an
        # empty array raises ValueError -- skip it instead of crashing.
        if members.size == 0:
            continue

        # most common label; on a tie np.argmax keeps the smallest label
        majority = int(np.argmax(np.bincount(members)))

        # record cluster i under its majority label
        inferred_labels.setdefault(majority, []).append(i)

    return inferred_labels

def infer_data_labels(X_labels, cluster_labels):
    """
    Translates per-sample cluster assignments into predicted labels.

    X_labels: sequence of cluster indices, one entry per sample.
    cluster_labels: dictionary {label: [cluster indices]}, e.g. the result
        of infer_cluster_labels().
    returns: uint8 array with one predicted label per entry of X_labels.
        Entries whose cluster is not listed under any label keep the
        initial value 0.
    """

    assigned = np.asarray(X_labels)

    # start from all zeros, one slot per sample
    predicted_labels = np.zeros(len(X_labels)).astype(np.uint8)

    # One masked assignment per label. Walking the dictionary in its own
    # order means a cluster listed under several labels ends up with the
    # last of them.
    for label, clusters in cluster_labels.items():
        in_label = np.isin(assigned, list(clusters))
        predicted_labels[in_label] = label

    return predicted_labels

In [17]:
from sklearn.metrics import accuracy_score
# Exercise infer_cluster_labels() and infer_data_labels() on the training
# split: label each cluster, give every sample its cluster's label, then score
# the result against y.
cluster_labels = infer_cluster_labels(kmeans, y)
X_clusters = kmeans.predict(x)
predicted_labels = infer_data_labels(X_clusters, cluster_labels)

train_accuracy = accuracy_score(y, predicted_labels)
print('Accuracy: {}\n'.format(train_accuracy))

# first 20 predictions, then the first 20 actual labels, for eyeballing
print(predicted_labels[:20])
print(y[:20])

Accuracy: 0.5706333333333333

[8 0 4 1 7 2 1 8 1 7 3 1 3 6 1 7 2 7 6 7]
[5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9]


Evaluation with the test set (`x_test`)

In [18]:
from sklearn.metrics import accuracy_score
# test kmeans algorithm on testing dataset
# NOTE(review): the model below is re-fitted on x_test and the cluster->label
# mapping is derived from y_test, so this accuracy is not a held-out score.
# To measure generalisation, fit on x / y and only predict on x_test --
# confirm which was intended.

# convert each image to 1 dimensional array (no-op when already 2-D)
x_test = x_test.reshape(len(x_test), -1).astype(float)

# normalize the data to 0 - 1
# The loading cell already divides x_test by 255; rescale only while the data
# is still in the raw 0-255 range so the values are not squashed into
# [0, 1/255] and the cell stays idempotent on re-run.
if x_test.max() > 1.0:
    x_test = x_test / 255.

# initialize and fit KMeans algorithm on the testing data
# (seeded so the reported scores are reproducible)
kmeans = MiniBatchKMeans(n_clusters = len(np.unique(y_test)), random_state = 42)
kmeans.fit(x_test)
cluster_labels = infer_cluster_labels(kmeans, y_test)

# predict labels for testing data (cluster assignment computed once and reused)
test_clusters = kmeans.predict(x_test)
predicted_labels = infer_data_labels(test_clusters, cluster_labels)

# calculate and print accuracy
print('Accuracy: {}\n'.format(accuracy_score(y_test, predicted_labels)))



Accuracy: 0.5699



In [19]:
from sklearn.metrics import normalized_mutual_info_score

# Normalized mutual information between the actual test labels and the labels
# predicted from the clustering; left as the cell's last expression so the
# value is displayed.
normalized_mutual_info_score(labels_true=y_test, labels_pred=predicted_labels)

0.4932350087628642

In [20]:
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient of the test samples, grouped by predicted label.
# NOTE(review): this scores the predicted labels, not the raw cluster ids
# (test_clusters) -- confirm which partition is meant to be evaluated.
silhouette_avg = silhouette_score(X=x_test, labels=predicted_labels)
print("The average silhouette_score is :", silhouette_avg)

The average silhouette_score is : 0.07415776247806752
