In [1]:

import sys
import sklearn
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.cluster import MiniBatchKMeans

print(f'Python: {sys.version}')
print(f'NumPy: {np.version}')

Python: 3.7.11 (default, Jul  3 2021, 18:01:19) 
[GCC 7.5.0]
NumPy: <module 'numpy.version' from '/usr/local/lib/python3.7/dist-packages/numpy/version.py'>


In [43]:
# Download the dataset 
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data(
    path='mnist.npz'
)
train_images = train_images / 255.0
test_images = test_images / 255.0

In [44]:
# Convert each image to 1 dimensional array
X = train_images.reshape(len(train_images),-1)
Y = train_labels
K = test_images.reshape(len(test_images),-1)
J = test_labels

In [45]:
# Clustering on the training dataset
n_digits = 120
kmeans = MiniBatchKMeans(n_clusters = n_digits, max_iter=10000, n_init=50)
kmeans.fit(X)


MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                init_size=None, max_iter=10000, max_no_improvement=10,
                n_clusters=120, n_init=50, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=0)

In [46]:
# Center of the clusters
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin_min
# find the closest sample to the center of each cluster
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
# Map cluster to digit by using only the label of the closest element to the center
map_pred2digits ={}
# Map cluster to digit by using the most frequent label in the cluster 
# We won't use this for the rest of the experiment as it requires all the labels
map_pred2digits_with_all_labels ={}
predictions = kmeans.predict(X)

for cidx, c in enumerate(closest):
  map_pred2digits_with_all_labels[cidx] = np.argmax(np.bincount(Y[predictions==cidx]))
  map_pred2digits[cidx] = Y[c]

# Label all the dataset based on the label of the center of the clusters
dirty_labels = np.vectorize(map_pred2digits.get)(predictions)


In [47]:
# Evaluation on validation dataset
from sklearn.metrics import accuracy_score
predictions = kmeans.predict(K)
digits_predictions = np.vectorize(map_pred2digits.get)(predictions)
print('Accuracy using the label of the center of the cluster ',accuracy_score(J,digits_predictions))
digits_predictions = np.vectorize(map_pred2digits_with_all_labels.get)(predictions)
print('Accuracy if we had all the labels ', accuracy_score(J,digits_predictions))

Accuracy using the label of the center of the cluster  0.8674
Accuracy if we had all the labels  0.8702


# DIRTY LABELS ( Use the entire training dataset labelled with the 
#                 label of the closest sample to the center of each cluster )

In [48]:
# CNN with heavy dropout
input_shape = (28, 28, 1)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(32, [3, 3], activation='relu', input_shape=input_shape))
model.add(tf.keras.layers.Conv2D(32, [3, 3], activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(64, [3, 3], activation='relu'))
model.add(tf.keras.layers.Conv2D(64, [3, 3], activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [49]:
test_images = test_images.reshape(test_images.shape[0],28,28,1)
train_images = train_images.reshape(train_images.shape[0],28,28,1)

In [50]:
# DIRTY LABELS ( Use the entire training dataset labelled with the 
#                 label of closest sample to the center of the cluster )
n_epochs = 6
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)
model.fit(train_images, dirty_labels, validation_data = (test_images,  test_labels), 
          epochs=n_epochs, verbose = 1, callbacks=[model_checkpoint_callback])
model.load_weights(checkpoint_filepath)
model.evaluate(test_images,  test_labels)


Epoch 1/6


  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


[0.3242577314376831, 0.9063000082969666]

# ONLY TRAIN WITH THE CLUSTER CENTROIDS


In [10]:
# CNN with heavy dropout
input_shape = (28, 28, 1)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(32, [3, 3], activation='relu', input_shape=input_shape))
model.add(tf.keras.layers.Conv2D(32, [3, 3], activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Conv2D(64, [3, 3], activation='relu'))
model.add(tf.keras.layers.Conv2D(64, [3, 3], activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [11]:
test_images = test_images.reshape(test_images.shape[0],28,28,1)
train_images = train_images.reshape(train_images.shape[0],28,28,1)

In [12]:
# ONLY TRAIN WITH THE CLUSTER CENTERS
labelled_indices = closest
labelled_indices.shape, np.bincount(train_labels[labelled_indices])
n_epochs = 250

In [13]:
model.fit(train_images[labelled_indices], train_labels[labelled_indices], validation_data = (test_images,  test_labels), epochs=n_epochs, verbose = 1)
model.evaluate(test_images,  test_labels)


Epoch 1/250


  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 7

[0.43556538224220276, 0.904699981212616]