# Classification as Clustering

In [1]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
import random as rand

import numpy as np
import tensorflow as tf
from scipy.spatial.distance import pdist

from keras import backend as K
from keras.layers import Input, Dense, Lambda, Flatten, Reshape, Conv2D, Conv2DTranspose, merge
from keras.optimizers import *
from keras.models import Model, Sequential
from keras.datasets import mnist
from keras.utils import to_categorical
from keras import initializers
from keras import constraints
from keras import losses

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
# Notebook-wide configuration.
latent_dim = 2           # size of the embedding produced by the feature extractor
input_size = (28,28,1)   # image shape fed to the network (height, width, channels)
num_classes = 10

# Load MNIST, scale pixel values into [0, 1] and append the channel axis.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = (x_train.astype('float32') / 255.).reshape((-1,) + input_size)
x_test = (x_test.astype('float32') / 255.).reshape((-1,) + input_size)

In [4]:
def get_feature_extractor(input_shape=None, embedding_dim=None):
    """Build the convolutional network that maps an image to an embedding.

    Args:
        input_shape: shape of one input image; defaults to the notebook-level
            `input_size` when omitted.
        embedding_dim: number of units in the final linear layer; defaults to
            the notebook-level `latent_dim` when omitted.

    Returns:
        A new, uncompiled `Sequential` model: four ReLU convolutions, a
        flatten, a 256-unit ReLU dense layer and a linear projection to
        `embedding_dim` values.
    """
    # Resolve the defaults at call time so that edits to the config cell are
    # picked up without having to re-run this definition.
    if input_shape is None:
        input_shape = input_size
    if embedding_dim is None:
        embedding_dim = latent_dim

    return Sequential([
        Conv2D(32, (3,3), input_shape=input_shape, activation='relu', strides=(2,2), padding='same'),
        Conv2D(64, (3,3), activation='relu', strides=(2,2), padding='same'),
        Conv2D(128, (3,3), activation='relu', strides=(2,2), padding='valid'),
        Conv2D(128, (3,3), activation='relu', strides=(1,1), padding='same', name='conv4'),
        Flatten(),
        Dense(256, activation='relu'),
        # No activation: the embedding is left unconstrained.
        Dense(embedding_dim, activation=None),
    ])

### Cluster Loss ([paper](http://ydwen.github.io/papers/WenECCV16.pdf))

In [None]:
# Network Definition

input_layer = Input(input_size)
base_net = get_feature_extractor()
feature_vec = base_net(input_layer)
# Bias-free softmax classifier stacked directly on the embedding.
softmax_layer = Dense(num_classes, use_bias=False, activation='softmax')
prediction = softmax_layer(feature_vec)

# A Tensorflow variable for the class centers that we can update per iteration of training.
# One row per class; the row width follows latent_dim so it always matches the embedding.
class_centers = tf.Variable(tf.random_normal([num_classes, latent_dim], stddev=0.3))

# Here we define a loss that combines cross-entropy with the class-center distances.
# The lambda and alpha parameters are taken as arguments
def cluster_loss(labels, x, lam=0.07, alpha=0.5):
    """Softmax cross-entropy plus a weighted center loss, with a center update.

    The function is passed to `model.compile(loss=...)`, so `x` is the model
    output (the softmax prediction), not the embedding. The embedding tensor
    `feature_vec`, the `class_centers` variable and `num_classes` are read
    from the notebook namespace.

    Args:
        labels: one-hot ground-truth labels, one row per sample.
        x: softmax output of the model for the same samples.
        lam: weight of the center-loss term.
        alpha: step size of the class-center update.

    Returns:
        Per-sample cross-entropy plus `lam` times the batch center loss.
        Evaluating the returned tensor also moves `class_centers` by one
        update step, because the update op is attached to it as a control
        dependency (this happens on every evaluation of the loss, not only
        on training steps).
    """
    # cross-entropy:
    xent_loss = losses.categorical_crossentropy(labels, x)

    # Center of each sample's ground-truth class, and the center loss itself.
    classes = tf.argmax(labels, axis=1)
    c = tf.gather(class_centers, classes)
    c_dist = K.sum(K.square(feature_vec - c), axis=1)
    c_loss = 0.5 * tf.reduce_mean(c_dist)

    # How many samples per class in this batch:
    class_nums = K.sum(labels, axis=0)

    # Per-class update: sum of (center - feature) over the samples of that
    # class, divided by (1 + number of such samples).
    diffs = []
    for i in range(0, num_classes):
        in_class = tf.cast(tf.equal(tf.cast(classes, tf.int32), tf.constant(i)), tf.float32)
        # Add a trailing axis so the (batch,) mask broadcasts over the
        # embedding dimension instead of being aligned against it.
        in_class = tf.expand_dims(in_class, -1)
        diff = K.sum(in_class * (c - feature_vec), axis=0, keepdims=True)
        diffs.append(diff / (tf.gather(class_nums, i) + 1))
    diffs = tf.concat(diffs, axis=0)

    # Update: modify the variable in place. Rebinding the Python name to
    # `class_centers - alpha*diffs` would only create a new tensor and leave
    # the variable untouched for the whole training run.
    update_centers = tf.assign_sub(class_centers, alpha * diffs)
    with tf.control_dependencies([update_centers]):
        # `c` was gathered before the assignment (the assignment depends on
        # it), so the loss value still reflects the pre-update centers.
        c_loss = tf.identity(c_loss)

    return xent_loss + lam*c_loss


# End-to-end classifier (image -> class probabilities), trained with the
# combined cross-entropy / center loss defined above.
model = Model(inputs=input_layer, outputs=prediction)
model.compile(optimizer=Adam(), loss=cluster_loss)

In [None]:
# One-hot encode the integer digit labels, then train.
y_train_onehot = to_categorical(y_train)
model.fit(x_train, y_train_onehot, epochs=10, batch_size=128)

Epoch 1/10


In [None]:
# Compare the arg-max of the predicted class probabilities with the integer labels.
train_hits = model.predict(x_train).argmax(axis=1) == y_train
test_hits = model.predict(x_test).argmax(axis=1) == y_test

# Fraction of correctly classified samples in each split.
tr_acc = train_hits.sum() / float(y_train.shape[0])
te_acc = test_hits.sum() / float(y_test.shape[0])

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

In [None]:
# Image -> embedding model. The plot below calls base_net directly instead.
encoder = Model(input_layer, feature_vec)

# Split the test images by digit and embed each group.
test_by_class = [x_test[y_test == digit] for digit in range(num_classes)]
embs_by_class = [base_net.predict(val, batch_size=100) for val in test_by_class]

# One colour per digit. Every entry is visible on a white background, and the
# colours are passed through `color=` so an RGB triple can never be mistaken
# for per-point data values.
colors = ['b', 'c', 'g', (0.3, 0.3, 0.3), 'm', 'r', 'y', (1., 1., 0.5), 'k', (0.25, 1.0, 0.25)]

plt.figure()
for i, embs in enumerate(embs_by_class):
    plt.scatter(embs[:,0], embs[:,1], color=colors[i % len(colors)], label=str(i))
plt.title('Test-set embeddings by digit')
plt.xlabel('embedding dim 0')
plt.ylabel('embedding dim 1')
plt.legend(title='digit');

In [None]:
def get_random_neg_pairs(embs_by_class, num_pairs):
    """Sample distances between embeddings drawn from two different classes.

    Args:
        embs_by_class: sequence of 2-D arrays, one per class, each holding
            that class's embeddings as rows.
        num_pairs: number of pairs to sample.

    Returns:
        1-D array of length `num_pairs` with the Euclidean distance of each
        sampled pair.

    Raises:
        ValueError: if pairs are requested but fewer than two classes exist.
    """
    n_classes = len(embs_by_class)
    if num_pairs > 0 and n_classes < 2:
        raise ValueError('need at least two classes to sample a negative pair')

    dists = np.zeros((num_pairs,))
    for i in range(0, num_pairs):
        class_ind_1 = np.random.randint(0, n_classes)
        # Build the candidate list explicitly: concatenating two range()
        # objects with `+` only works on Python 2.
        other_classes = [k for k in range(n_classes) if k != class_ind_1]
        class_ind_2 = rand.sample(other_classes, 1)[0]
        sample1 = np.random.randint(0, embs_by_class[class_ind_1].shape[0])
        sample2 = np.random.randint(0, embs_by_class[class_ind_2].shape[0])

        dists[i] = np.linalg.norm(embs_by_class[class_ind_1][sample1] - embs_by_class[class_ind_2][sample2])
    return dists

def get_random_pos_pairs(embs_by_class, num_pairs):
    """Sample distances between two distinct embeddings of the same class.

    Args:
        embs_by_class: sequence of 2-D arrays, one per class, each holding
            that class's embeddings as rows.
        num_pairs: number of pairs to sample.

    Returns:
        1-D array of length `num_pairs` with the Euclidean distance of each
        sampled pair.

    Raises:
        ValueError: if pairs are requested but no class has two or more
            embeddings.
    """
    # Only classes with at least two samples can provide a pair of distinct
    # members; picking any other class would make the retry loop spin forever.
    eligible = [k for k, embs in enumerate(embs_by_class) if embs.shape[0] >= 2]
    if num_pairs > 0 and not eligible:
        raise ValueError('need a class with at least two samples to sample a positive pair')

    dists = np.zeros((num_pairs,))
    for i in range(0, num_pairs):
        class_ind = eligible[np.random.randint(0, len(eligible))]
        class_embs = embs_by_class[class_ind]
        n_samples = class_embs.shape[0]
        while True:
            sample1 = np.random.randint(0, n_samples)
            sample2 = np.random.randint(0, n_samples)
            # Compare by value: `is not` tests object identity, which treats
            # equal indices as different once they fall outside the
            # interpreter's small-integer cache.
            if sample1 != sample2:
                break
        dists[i] = np.linalg.norm(class_embs[sample1] - class_embs[sample2])
    return dists

# Overlay the distance distributions of same-class and different-class pairs.
plt.figure()
pos_dists = get_random_pos_pairs(embs_by_class, 5000)
out = plt.hist(pos_dists, bins=100)
neg_dists = get_random_neg_pairs(embs_by_class, 5000)
out = plt.hist(neg_dists, bins=100)

### Cosine Similarity Softmax ([paper](https://arxiv.org/pdf/1704.06369.pdf))
In practice we do this by L2-normalizing (and rescaling) the feature vector and by constraining each class's weight vector in the softmax layer to unit norm.

In [None]:
input_layer = Input(input_size)
base_net = get_feature_extractor()
feature_vec = base_net(input_layer)

# We're using 100 as the scale factor, which is certainly larger than the actual
# required scale for 10 classes.
# Normalize along the feature axis so every sample's embedding has unit length
# before scaling; axis=0 would normalize each feature across the batch and make
# the output of one sample depend on the others in its batch.
feature_vec = Lambda(lambda x: 100. * K.l2_normalize(x, axis=-1))(feature_vec)
# UnitNorm keeps the weight vector of every class at unit length.
softmax_layer = Dense(num_classes, use_bias=False, activation='softmax', kernel_constraint=constraints.UnitNorm())
prediction = softmax_layer(feature_vec)

model = Model(input_layer, prediction)
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [None]:
# Change batch size to change "tight-ness" of clusters
y_train_onehot = to_categorical(y_train)
model.fit(x_train, y_train_onehot, epochs=10, batch_size=512)

In [None]:
# Let's check the scatter plot and distance distributions

# Image -> scaled, normalized feature model. The plot below calls base_net
# directly, so it shows the embedding before normalization.
encoder = Model(input_layer, feature_vec)

# Split the test images by digit and embed each group.
test_by_class = [x_test[y_test == digit] for digit in range(num_classes)]
embs_by_class = [base_net.predict(val, batch_size=100) for val in test_by_class]

# One colour per digit. Every entry is visible on a white background, and the
# colours are passed through `color=` so an RGB triple can never be mistaken
# for per-point data values.
colors = ['b', 'c', 'g', (0.3, 0.3, 0.3), 'm', 'r', 'y', (1., 1., 0.5), 'k', (0.25, 1.0, 0.25)]

plt.figure()
for i, embs in enumerate(embs_by_class):
    plt.scatter(embs[:,0], embs[:,1], color=colors[i % len(colors)], label=str(i))
plt.title('Test-set embeddings by digit')
plt.xlabel('embedding dim 0')
plt.ylabel('embedding dim 1')
plt.legend(title='digit');

In [None]:
# Overlay the distance distributions of same-class and different-class pairs.
plt.figure()
pos_dists = get_random_pos_pairs(embs_by_class, 5000)
out = plt.hist(pos_dists, bins=100)
neg_dists = get_random_neg_pairs(embs_by_class, 5000)
out = plt.hist(neg_dists, bins=100)