[View in Colaboratory](https://colab.research.google.com/github/emjames/neural-networks/blob/master/KMeans.ipynb)

# K-Means implementation with TensorFlow

Apply K-Means to classify handwritten digit images (MNIST).

In [0]:
# SETUP
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Ops and modules related to factorization.
from tensorflow.contrib.factorization import KMeans

# Hide every GPU from TensorFlow (empty CUDA_VISIBLE_DEVICES) so the K-Means
# graph in this notebook runs on the CPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""


In [0]:
# LOAD DATA
# MNIST comes from the TensorFlow tutorial helper with one-hot labels
# (the keras.datasets.mnist loader is not used in this notebook).
from tensorflow.examples.tutorials.mnist import input_data

mnist_dir = "/tmp/data/"
mnist = input_data.read_data_sets(mnist_dir, one_hot=True)
# Training images; this array is what gets fed to the K-Means graph
full_data_x = mnist.train.images

In [0]:
# EXPLORE DATA
# Inspect the splits held by `mnist`. The x_train / x_test names belonged to
# the keras loader, which is not used here, so they are undefined.
# read_data_sets stores each 28x28 image as a flat row of 784 values.
print(mnist.train.images.shape)
# > (55000, 784)
print(mnist.test.images.shape)
# > (10000, 784)

In [0]:
# HYPERPARAMETERS
num_steps = 50           # number of training steps
batch_size = 1024        # samples per batch
k = 25                   # number of clusters (centroids)
num_classes = 10         # digits 0-9
num_features = 28 * 28   # pixels in one flattened 28x28 image

In [0]:
# Graph inputs; the leading None dimension accepts any number of samples.
# X: flattened images, one row of num_features values per sample
X = tf.placeholder(dtype=tf.float32, shape=[None, num_features])
# Y: labels, one row of num_classes values per sample
Y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])

In [0]:
# Configure K-Means on the image placeholder: k centroids, cosine distance
kmeans = KMeans(
    inputs=X,
    num_clusters=k,
    distance_metric='cosine',
)
# Build the KMeans graph; the result is a tuple of tensors and ops
training_graph = kmeans.training_graph()

[```kmeans.training_graph()```](https://www.tensorflow.org/api_docs/python/tf/contrib/factorization/KMeans)

**Returns**  
A tuple consisting of:
* all_scores: A matrix (or list of matrices) of dimensions (num_input, num_clusters) where the value is the distance of an input vector and a cluster center. 

* cluster_idx: A vector (or list of vectors). Each element in the vector corresponds to an input row in 'inp' and specifies the cluster id corresponding to the input. 

* scores: Similar to cluster_idx but specifies the distance to the assigned cluster instead. 

* cluster_centers_initialized: scalar indicating whether clusters have been initialized. 

* init_op: an op to initialize the clusters. 

* training_op: an op that runs an iteration of training.

In [0]:
# Unpack the KMeans graph.
# `training_graph` already holds the tuple returned by
# kmeans.training_graph(), so it is unpacked here rather than called again.
# The tuple is sliced from both ends so the unpacking does not depend on its
# exact length.
# NOTE(review): some TensorFlow 1.x releases reportedly insert an extra
# cluster-centers entry before init_op - confirm for the installed version.
all_scores, cluster_idx, scores, cluster_centers_initialized = training_graph[:4]
init_op, train_op = training_graph[-2:]

# cluster_idx may come back as a list of vectors; keep the first one
cluster_idx = cluster_idx[0]
# Mean distance of the samples to their assigned cluster
avg_distance = tf.reduce_mean(scores)

# Initialize the variables
init_vars = tf.global_variables_initializer()

# The session is created without a `with` block on purpose: the training and
# evaluation cells reuse `sess`, and a `with` block would close it as soon as
# this cell finished.
sess = tf.Session()
sess.run(init_vars, feed_dict={X: full_data_x})
sess.run(init_op, feed_dict={X: full_data_x})

In [0]:
# Training
# Every step runs train_op once on the full training set and fetches the
# average distance plus the cluster id assigned to each sample.
for step in range(1, num_steps + 1):
    _, d, idx = sess.run([train_op, avg_distance, cluster_idx],
                         feed_dict={X: full_data_x})
    if step % 10 == 0 or step == 1:
        print("Step %i, Avg Distance: %f" % (step, d))

# Assign a label to each centroid.
# counts[c] accumulates the one-hot labels of all training samples whose
# closest centroid (given by 'idx' from the last step) is c.
counts = np.zeros(shape=(k, num_classes))
# np.add.at is an unbuffered add, so samples sharing a centroid are all
# summed (a plain `counts[idx] += labels` would keep only one per centroid).
np.add.at(counts, idx, mnist.train.labels)
# Assign the most frequent label to the centroid.
# NOTE(review): kept as a Python list of per-row argmax values on purpose;
# passing an int64 ndarray instead would presumably produce an int64 tensor
# that no longer matches the int32 labels used at evaluation - confirm
# before changing.
labels_map = [np.argmax(c) for c in counts]
labels_map = tf.convert_to_tensor(labels_map)

In [0]:
# Evaluation ops
# Translate each sample's cluster id into the digit assigned to that centroid
cluster_label = tf.nn.embedding_lookup(labels_map, cluster_idx)
# True digit of each sample, taken from the one-hot labels fed through Y
true_label = tf.cast(tf.argmax(Y, 1), tf.int32)
# Accuracy is the fraction of samples whose centroid digit equals the true digit
correct_prediction = tf.equal(cluster_label, true_label)
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Score the model on the MNIST test split
test_x = mnist.test.images
test_y = mnist.test.labels
test_accuracy = sess.run(accuracy_op, feed_dict={X: test_x, Y: test_y})
print("Test Accuracy:", test_accuracy)