#### import libraries

In [1]:
import matplotlib.pyplot as plt
from keras.datasets import mnist
from sklearn import metrics

#### download dataset

In [2]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#### print size of train and test

In [3]:
print('test Data:', (x_test.shape))
print('test Labels:', (y_test.shape))
print('Training Data:', (x_train.shape))
print('Training Labels:', (y_train.shape))

test Data: (10000, 28, 28)
test Labels: (10000,)
Training Data: (60000, 28, 28)
Training Labels: (60000,)


#### convert 28*28 image data to a vector

In [4]:
X = x_train.reshape(len(x_train),-1)
Y = y_train
X = X.astype(float) / 255.
print(X.shape)
print(X[0].shape)

(60000, 784)
(784,)


#### initialize and fit the train data

In [5]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
n_digits = len(np.unique(y_test))
print(n_digits)
kmeans = MiniBatchKMeans(n_clusters = n_digits)
kmeans.fit(X)

10


MiniBatchKMeans(n_clusters=10)

In [6]:
kmeans.labels_

array([2, 8, 3, ..., 5, 2, 7])

In [7]:
len(kmeans.labels_)

60000

#### Assigning Cluster Labels
K-means clustering is an unsupervised machine learning method; consequently, the labels assigned by our KMeans algorithm refer to the cluster each array was assigned randomly, so the labels are diffrent with the actual target integer. so we need to adjust the numbers.

In [8]:
def infer_cluster_labels(kmeans, actual_labels):
    inferred_labels = {}
    for i in range(kmeans.n_clusters):
        labels = []
        index = np.where(kmeans.labels_ == i)
        labels.append(actual_labels[index])
        if len(labels[0]) == 1:
            counts = np.bincount(labels[0])
        else:
            counts = np.bincount(np.squeeze(labels))
        if np.argmax(counts) in inferred_labels:
            inferred_labels[np.argmax(counts)].append(i)
        else:
            inferred_labels[np.argmax(counts)] = [i]
    return inferred_labels  

In [9]:
def infer_data_labels(X_labels, cluster_labels):
    predicted_labels = np.zeros(len(X_labels)).astype(np.uint8)    
    for i, cluster in enumerate(X_labels):
        for key, value in cluster_labels.items():
            if cluster in value:
                predicted_labels[i] = key
    return predicted_labels

In [10]:
cluster_labels = infer_cluster_labels(kmeans, Y)
X_clusters = kmeans.predict(X)
print(len(Y))
predicted_labels = infer_data_labels(X_clusters, cluster_labels)
print (predicted_labels[:10])
print (Y[:10])

60000
[0 0 4 1 7 2 1 8 1 7]
[5 0 4 1 9 2 1 3 1 4]


In [11]:
n_clusters=10
X_test = x_test.reshape(len(x_test),-1)
X_test = X_test.astype(float) / 255.
kmeans = MiniBatchKMeans(n_clusters = 20)
kmeans.fit(X)
cluster_labels = infer_cluster_labels(kmeans, Y)
test_clusters = kmeans.predict(X_test)
predicted_labels = infer_data_labels(kmeans.predict(X_test), cluster_labels)
print('Accuracy: {}\n'.format(metrics.accuracy_score(y_test, predicted_labels)))
print('Precision: {}\n'.format(metrics.precision_score(y_test, predicted_labels,average=None)))
print('Recall: {}\n'.format(metrics.recall_score(y_test, predicted_labels,average=None)))
print('Jaccard: {}\n'.format(metrics.jaccard_score(y_test, predicted_labels,average=None)))

Accuracy: 0.6826

Precision: [0.89530333 0.85813416 0.93015134 0.42059673 0.47440585 0.
 0.90985325 0.70484581 0.66929134 0.53566959]

Recall: [0.93367347 0.98061674 0.77422481 0.86534653 0.52851324 0.
 0.90605428 0.77821012 0.52361396 0.42418236]

Jaccard: [0.84176633 0.84382108 0.73168498 0.39476061 0.33333333 0.
 0.83141762 0.58694057 0.41598695 0.31014493]



  _warn_prf(average, modifier, msg_start, len(result))
