In [1]:
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import plotly.express as px

import tensorflow as tf
from tensorflow.keras.layers import Input, BatchNormalization, Conv2D, Dense, MaxPool2D, Flatten, Activation, Dropout, Lambda,Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.python.keras.utils import tf_utils

In this notebook, I'd like to introduce *ArcFace loss*, which leads to <b>feature extraction</b> with better <b>intra-class compactness</b> and <b>inter-class discrepancy</b>.

I will train a convolutional digit-recognizer model with both **Cross Entropy Loss** and **ArcFace Loss**, and then visualize the extracted features in a 3D vector space.



# 1. Loading Data

In [2]:
# Load the digit-recognizer train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/digit-recognizer/train.csv',
                     '../input/digit-recognizer/test.csv')
)

In [3]:
# Rows before N_TRAIN are used for training, the remaining rows for validation.
N_TRAIN = 40000

# Targets: the 'label' column, split at N_TRAIN.
label = train['label'].values
y_tr = label[:N_TRAIN]
y_val = label[N_TRAIN:]

# Inputs: every column except 'label', reshaped once to (n_samples, 28, 28, 1)
# and divided by 255.0. The -1 lets NumPy infer the number of rows instead of
# relying on a hard-coded 42000, so the cell keeps working if `train` is
# subsampled or extended.
X_train = train.drop('label', axis = 1)
pixels = X_train.values.reshape((-1, 28, 28, 1)) / 255.0
X_tr = pixels[:N_TRAIN]
X_val = pixels[N_TRAIN:]

# 2. Training without ArcFace layer

## 1. Model training

In [4]:
# Baseline CNN: two conv-conv-pool stages, a 50-unit dense layer, a 3-D
# L2-normalised embedding, and a 10-way softmax classifier on top of it.
model = tf.keras.models.Sequential([
    # stage 1: 28x28 input, 10 filters
    Conv2D(input_shape=(28,28,1), filters=10, kernel_size=(3,3), padding="same", activation="relu"),
    Conv2D(filters=10, kernel_size=(3,3), padding="same", activation="relu"),
    MaxPool2D(pool_size=(2,2), strides=(2,2)),

    # stage 2: 20 filters
    Conv2D(filters=20, kernel_size=(3,3), padding="same", activation="relu"),
    Conv2D(filters=20, kernel_size=(3,3), padding="same", activation="relu"),
    MaxPool2D(pool_size=(2,2), strides=(2,2)),

    # head: flatten -> 50 -> 3-D embedding -> unit length -> class probabilities
    Flatten(),
    Dense(50, activation = 'relu'),
    Dense(3, activation = 'linear'),
    Lambda(lambda t: K.l2_normalize(t, axis=1)),
    Dense(10, activation = 'softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 28, 28, 10)        100       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 28, 28, 10)        910       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 10)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 14, 20)        1820      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 14, 20)        3620      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 7, 7, 20)          0         
_________________________________________________________________
flatten (Flatten)            (None, 980)               0

In [5]:
# Compile the baseline model with sparse categorical cross-entropy (integer
# class ids as targets), the Adam optimizer and accuracy as the reported metric.
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer = 'adam', metrics = ['accuracy'])

# Stop once validation accuracy has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model would keep the weights from the last (non-improving) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(patience = 3,
                                              monitor = 'val_accuracy',
                                              restore_best_weights = True)

# `callbacks` is documented as a list of callbacks, so pass a list.
# epochs=1000 is only an upper bound; early stopping ends training.
model.fit(X_tr, y_tr, validation_data = (X_val, y_val),
          epochs = 1000, batch_size = 32,
          callbacks = [early_stop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000


<tensorflow.python.keras.callbacks.History at 0x7f0b72ba78d0>

## 2. Feature Extraction

In [6]:
# Feature extractor for the cross-entropy model: maps an image to the output
# of the second-to-last layer (the L2-normalised 3-D embedding).
FE_CE = Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)

# Embed the validation images.
CE_feats = FE_CE.predict(X_val)

# Visualization table: embedding coordinates plus the class label, ordered by
# label and with the label stored as a string.
CE_df = (
    pd.DataFrame(np.column_stack((CE_feats, y_val)),
                 columns = ['x','y','z','label'])
    .sort_values('label', ascending = True)
    .assign(label = lambda frame: frame['label'].astype(int).astype(str))
)

In [7]:
# Class centers: column i of the last layer's weight matrix belongs to class i,
# so the transposed matrix holds one center per row.
centers = np.ascontiguousarray(model.layers[-1].weights[0].numpy().T)

# 3. Training with ArcFace Layer

## 1. Defining ArcFace layer

In [8]:
def _resolve_training(layer, training):
    """Work out the effective training flag for `layer`.

    Arguments:
      layer: object exposing a `trainable` attribute.
      training: flag passed to the layer's `call`; when None the Keras
        backend learning phase is used instead.

    Returns:
      False whenever `layer.trainable` is falsy. Otherwise the resolved flag,
      converted to `bool` when it is an `int`, and passed through unchanged
      for any other type.
    """
    # Fall back to the backend learning phase when no flag was supplied.
    resolved = K.learning_phase() if training is None else training
    # Integer flags (0 / 1) become proper booleans.
    if isinstance(resolved, int):
        resolved = bool(resolved)
    # A non-trainable layer always runs in inference mode.
    return resolved if layer.trainable else False

class ArcFace(Layer):
    """
    Implementation of ArcFace layer. Reference: https://arxiv.org/abs/1801.07698

    The layer is called on ``[embedding, label]`` and returns scaled cosine
    logits with one column per class. In training mode the angle between each
    embedding and the weight column of its ground-truth class is increased by
    ``m`` before the cosine is taken; outside training mode the plain scaled
    cosine similarity is returned and the labels are not used.

    Arguments:
      num_classes: number of classes to classify
      s: scale factor applied to the cosine values
      m: additive angular margin (radians)
      regularizer: weights regularizer; anything accepted by
        ``tf.keras.regularizers.get`` (instance, identifier, config dict) or None
      name: layer name
    """
    def __init__(self,
                 num_classes,
                 s=30.0,
                 m=0.5,
                 regularizer=None,
                 name='arcface',
                 **kwargs):

        super().__init__(name=name, **kwargs)
        self._n_classes = int(num_classes)
        self._s = float(s)
        self._m = float(m)
        # Normalising through `get` lets the value produced by `get_config`
        # (a serialized config) be passed straight back to the constructor.
        self._regularizer = tf.keras.regularizers.get(regularizer)

    def build(self, input_shapes):
        """Create the class weight matrix of shape (embedding_dim, num_classes)."""
        embedding_shape, label_shape = input_shapes
        self._w = self.add_weight(shape=(embedding_shape[-1], self._n_classes),
                                  initializer='glorot_uniform',
                                  trainable=True,
                                  regularizer=self._regularizer,
                                  name='cosine_weights')

    def call(self, inputs, training=None):
        """
        During training, requires 2 inputs: embedding (after backbone+pool+dense),
        and ground truth labels. The labels should be sparse (and use
        sparse_categorical_crossentropy as loss).

        Returns ``s * cos(theta + m)`` for the ground-truth class and
        ``s * cos(theta)`` for all other classes in training mode, and
        ``s * cos(theta)`` for every class otherwise.
        """
        embedding, label = inputs

        # Squeezing is necessary for Keras. It expands the dimension to (n, 1)
        label = tf.reshape(label, [-1], name='label_shape_correction')

        # Normalize features and weights and compute dot product
        x = tf.nn.l2_normalize(embedding, axis=1, name='normalize_prelogits')
        w = tf.nn.l2_normalize(self._w, axis=0, name='normalize_weights')
        cosine_sim = tf.matmul(x, w, name='cosine_similarity')

        training = _resolve_training(self, training)
        if not training:
            # We don't have labels if we're not in training mode
            return self._s * cosine_sim

        # tf.one_hot only accepts integer indices; cast so that labels fed
        # with a floating dtype do not fail here.
        label = tf.cast(label, tf.int32)
        one_hot_labels = tf.one_hot(label,
                                    depth=self._n_classes,
                                    name='one_hot_labels')
        # Clip away from +/-1 before acos.
        theta = tf.math.acos(K.clip(
                cosine_sim, -1.0 + K.epsilon(), 1.0 - K.epsilon()))
        # Do not add the margin where theta + m would exceed pi.
        selected_labels = tf.where(tf.greater(theta, math.pi - self._m),
                                   tf.zeros_like(one_hot_labels),
                                   one_hot_labels,
                                   name='selected_labels')
        # Add the margin only at the ground-truth class position.
        final_theta = tf.where(tf.cast(selected_labels, dtype=tf.bool),
                               theta + self._m,
                               theta,
                               name='final_theta')
        output = tf.math.cos(final_theta, name='cosine_sim_with_margin')
        return self._s * output

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and cloned."""
        config = super().get_config()
        config.update({
            'num_classes': self._n_classes,
            's': self._s,
            'm': self._m,
            'regularizer': tf.keras.regularizers.serialize(self._regularizer),
        })
        return config

## 2. Model Training

In [9]:
# CNN with the same convolutional/dense stack as the cross-entropy model, but
# with an ArcFace head, which takes the ground-truth label as a second input.
# The inputs get their own names so that neither the builtin `input` nor the
# `label` array from the data-loading cell is overwritten.
image_input = Input(shape = (28,28,1), name = 'image')
# Labels are sparse integer class ids (one value per sample), so the per-sample
# shape is (1,), not (10,): the ArcFace layer flattens the labels and one-hot
# encodes them itself.
label_input = Input(shape = (1,), dtype = 'int32', name = 'label')

x = Conv2D(filters = 10, kernel_size = (3,3), padding = 'same', activation = 'relu')(image_input)
x = Conv2D(filters = 10, kernel_size = (3,3), padding = 'same', activation = 'relu')(x)
x = MaxPool2D(pool_size=(2,2), strides=(2,2))(x)

x = Conv2D(filters = 20, kernel_size = (3,3), padding = 'same', activation = 'relu')(x)
x = Conv2D(filters = 20, kernel_size = (3,3), padding = 'same', activation = 'relu')(x)
x = MaxPool2D(pool_size=(2,2), strides=(2,2))(x)

x = Flatten()(x)
x = Dense(50, activation = 'relu')(x)
# 3-D embedding, projected onto the unit sphere
x = Dense(3, activation = 'linear')(x)
x = Lambda(lambda t: K.l2_normalize(t, axis=1))(x)
# Scaled cosine logits with angular margin, then softmax to class probabilities
x = ArcFace(num_classes = 10, s = 30.0, m = 0.8)([x, label_input])
output = Activation('softmax')(x)

model_arc = Model([image_input, label_input], output)
model_arc.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              [(None, 28, 28, 1)]  0                                            
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 28, 28, 10)   100         image[0][0]                      
__________________________________________________________________________________________________
conv2d_5 (Conv2D)               (None, 28, 28, 10)   910         conv2d_4[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)  (None, 14, 14, 10)   0           conv2d_5[0][0]                   
____________________________________________________________________________________________

In [10]:
# Compile the ArcFace model with sparse categorical cross-entropy (integer
# class ids as targets), the Adam optimizer and accuracy as the reported metric.
model_arc.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer = 'adam', metrics = ['accuracy'])

# Stop once validation accuracy has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model would keep the weights from the last (non-improving) epoch.
early_stop_arc = tf.keras.callbacks.EarlyStopping(patience = 3,
                                                  monitor = 'val_accuracy',
                                                  restore_best_weights = True)

# The labels are passed twice: as the second model input (used by the ArcFace
# layer) and as the training target. `callbacks` is documented as a list of
# callbacks, so pass a list. epochs=1000 is only an upper bound.
model_arc.fit([X_tr, y_tr], y_tr, validation_data = ([X_val, y_val], y_val),
          epochs = 1000, batch_size = 32,
          callbacks = [early_stop_arc])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000


<tensorflow.python.keras.callbacks.History at 0x7f0b638e4850>

## 3. Feature Extraction

In [11]:
# feature extractor trained with ArcFace loss
# The embedding is the first of the two tensors fed to the ArcFace layer
# (default layer name 'arcface'). Looking it up this way avoids relying on a
# negative position in `model_arc.layers`, whose order also contains the label
# input layer.
arc_embedding = model_arc.get_layer('arcface').input[0]
FE_ARC = tf.keras.models.Model(model_arc.layers[0].input, arc_embedding)

# feature extraction on validation set (only the images are needed)
ARC_feats = FE_ARC.predict(X_val)

# dataframe for visualization: embedding coordinates plus the class label,
# sorted by label, with the label stored as a string
ARC_df = pd.DataFrame(np.hstack([ARC_feats, y_val.reshape(-1,1)]), columns = ['x','y','z','label'])
ARC_df = ARC_df.sort_values('label', ascending = True)
ARC_df['label'] = ARC_df['label'].astype(int).astype(str)

# 4. Embedding Visualization

## 1. Visualization of Embeddings trained without ArcFace layer

In [12]:
# 3-D scatter of the validation embeddings from the cross-entropy model,
# coloured by class label. The title lets the figure be read on its own.
fig = px.scatter_3d(CE_df, x='x', y='y', z='z', color='label',
                    title='Validation embeddings - trained without ArcFace layer')
# Small markers keep the point cloud readable.
fig.update_traces(marker=dict(size = 1),
                  selector=dict(mode='markers'))
fig.show()

## 2. Visualization of Embeddings trained with ArcFace layer

In [13]:
# 3-D scatter of the validation embeddings from the ArcFace model,
# coloured by class label. The title lets the figure be read on its own.
fig = px.scatter_3d(ARC_df, x='x', y='y', z='z', color='label',
                    title='Validation embeddings - trained with ArcFace layer')
# Small markers keep the point cloud readable.
fig.update_traces(marker=dict(size = 1),
                  selector=dict(mode='markers'))
fig.show()