# SV-Softmax math

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import array_ops



Support vector guided softmax loss - is a novel loss function which adaptively emphasizes the mis-classified points (support vectors) to guide the discriminative features learning. It makes it close to hard negative mining and the Focal loss techniques.

Let's define a binary mask to adaptively indicate whether a sample is selected as the support vector by a specific classifier in the current stage. To the end, the binary mask is defined as follows:

$$
I_k = \left\{
        \begin{array}{ll}
            0, & \quad \cos(\theta_{w_y}, x) − \cos(\theta_{w_k}, x) \ge 0 \\
            1, & \quad \cos(\theta_{w_y}, x) − \cos(\theta_{w_k}, x) < 0
        \end{array}
      \right.
$$

where $\cos(\theta_{w_k}, x) = w_k^Tx$ is the cosine similarity and $θ_{w_k,x}$ is the angle between $w_k$ and $x$.

In [11]:
# placeholders
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)

zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
I_k = array_ops.where(logit_y >= logits, zeros, ones)

with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    binary_mask = (sess.run(I_k, feed_dict={logits: logits_array, y_true: y_true_array}))
    
print("Logits:")
print(logits_array)
print('')
print("GT:")
print(y_true_array)
print('')
print("Binary mask:")
print(binary_mask)

Logits:
[[ 2.   3.   1.  -1. ]
 [-1.   2.1  2.   6. ]
 [-2.   3.   4.  -2.1]]

GT:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]

Binary mask:
[[0. 0. 0. 0.]
 [0. 1. 0. 1.]
 [0. 1. 1. 0.]]


Let's also define indicator function $h(t, θ_{w_k}, x, I_k)$ with preset hyperparameter t:

$$h(t, θ_{w_k}, x, I_k) = e^{s(t−1)(\cos(\theta_{w_k, x})+1)I_k}$$

Obviously, when t = 1, the designed SV-Softmax loss becomes identical to the original softmax loss. Let's implement it in a naive way.

In [20]:
# placeholders
t = tf.placeholder(tf.float32)
s = tf.placeholder(tf.float32)
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)
epsilon = 1.e-9

zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
I_k = array_ops.where(logit_y >= logits, zeros, ones)

h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))


# Let's check
logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    
with tf.Session() as sess:
    h_array_12 = (sess.run(h, feed_dict={t: 1.2, s: 1, logits: logits_array, y_true: y_true_array}))

with tf.Session() as sess:
    h_array_1 = (sess.run(h, feed_dict={t: 1., s: 1, logits: logits_array, y_true: y_true_array}))
    
print("h with t=1.2:")
print(h_array_12)
print('')
print("h with t=1.0:")
print(h_array_1)

h with t=1.2:
[[1.        1.        1.        1.       ]
 [1.        1.8589282 1.        4.055201 ]
 [1.        2.2255414 2.7182825 1.       ]]

h with t=1.0:
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


Full loss is formulated:
$$\mathcal{L} = -log\frac{e^{s\cos(\theta_{w_y}, x)}}{e^{s\cos(\theta_{w_y}, x)}+\sum_{k\ne y}^Kh(t, θ_{w_k}, x, I_k)e^{s\cos(\theta_{w_k, x})}}$$

In [30]:
# placeholders
t = tf.placeholder(tf.float32)
s = tf.placeholder(tf.float32)
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)
epsilon = 1.e-9

zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
I_k = array_ops.where(logit_y >= logits, zeros, ones)

h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))


softmax = tf.exp(s * logits) / (tf.reshape(
                 tf.reduce_sum(tf.multiply(tf.exp(s * logits), h), axis=-1, keepdims=True), 
                 [-1, 1]) + epsilon)

tf_softmax = tf.nn.softmax(logits)

# Let's check softmax
logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    
with tf.Session() as sess:
    softmax_array_12 = (sess.run(softmax, feed_dict={t: 1.2, s: 1, logits: logits_array, y_true: y_true_array}))

with tf.Session() as sess:
    softmax_array_1 = (sess.run(softmax, feed_dict={t: 1., s: 1, logits: logits_array, y_true: y_true_array}))

with tf.Session() as sess:
    tf_softmax_array = (sess.run(tf_softmax, feed_dict={t: 1., s: 1, logits: logits_array, y_true: y_true_array}))
    
print("Softmax with t=1.2:")
print(softmax_array_12)
print('')
print("Softmax with t=1.0:")
print(softmax_array_1)
print('')
print("Pure softmax:")
print(tf_softmax_array)
print('')
print("Maximum absolute error between our and tf sodtmax:")
print(abs((tf_softmax_array-softmax_array_1)).max())

Softmax with t=1.2:
[[2.4178252e-01 6.5723300e-01 8.8946812e-02 1.2037643e-02]
 [2.2175812e-04 4.9225753e-03 4.4541308e-03 2.4318731e-01]
 [6.9986947e-04 1.0386984e-01 2.8234750e-01 6.3326815e-04]]

Softmax with t=1.0:
[[2.4178252e-01 6.5723300e-01 8.8946812e-02 1.2037643e-02]
 [8.7725709e-04 1.9473307e-02 1.7620180e-02 9.6202922e-01]
 [1.8058796e-03 2.6801631e-01 7.2854382e-01 1.6340276e-03]]

Pure softmax:
[[2.4178253e-01 6.5723306e-01 8.8946819e-02 1.2037644e-02]
 [8.7725703e-04 1.9473307e-02 1.7620180e-02 9.6202922e-01]
 [1.8058793e-03 2.6801628e-01 7.2854376e-01 1.6340273e-03]]

Maximum absolute error between our and tf sodtmax:
5.9604645e-08


In [35]:
# placeholders
t = tf.placeholder(tf.float32)
m = tf.placeholder(tf.float32)
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)


zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
I_k = array_ops.where(logit_y >= logits, zeros, ones)
I_k = array_ops.where(logit_y - m >= logits, zeros, ones)
I_k_ = I_k * tf.cast(tf.not_equal(y_true, 1), tf.float32)
h = tf.exp(tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))

softmax = tf.exp(logits - m * y_true) / tf.reduce_sum(tf.multiply(tf.exp(logits - m * y_true), h), 
                                                      axis=-1, keepdims=True)
# softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis=-1, keepdims=True)

# softmax = tf.nn.softmax(logits)

# ce = tf.multiply(y_true, -tf.log(softmax))
# ce = tf.reduce_sum(ce, axis=1)
# ce = tf.reduce_mean(ce)
# ce = tf.reduce_mean(-tf.reduce_sum(y_true * tf.log(softmax), reduction_indices=[1]))
# ce = tf.losses.softmax_cross_entropy(y_true, softmax)
# ce = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=logits) 
# ce = tf.reduce_mean(ce)

with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    print(sess.run(ce, feed_dict={s: 1, t: 1.2, m: 0., logits: logits_array, y_true: y_true_array}))

3.5917118


In [31]:
# placeholders
epsilon = 1.e-9
s = tf.placeholder(tf.float32)
t = tf.placeholder(tf.float32)
m = tf.placeholder(tf.float32)
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)

zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

# score of groundtruth
logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)

# binary mask for support vectors
I_k = array_ops.where(logit_y >= logits, zeros, ones)

# indicator function
h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))

softmax = tf.exp(s * logits) / (tf.reshape(
                 tf.reduce_sum(tf.multiply(tf.exp(s * logits), h), axis=-1, keepdims=True), 
                 [-1, 1]) + epsilon)
ce = tf.reduce_mean(-tf.reduce_sum(y_true * tf.log(softmax), reduction_indices=[1]))

with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    print(sess.run(ce, feed_dict={s: 1, t: 1.2, m: 0., logits: logits_array, y_true: y_true_array}))

4.3660855


In [32]:
# placeholders
epsilon = 1.e-9
s = tf.placeholder(tf.float32)
t = tf.placeholder(tf.float32)
m = tf.placeholder(tf.float32)
logits = tf.placeholder(tf.float32)
y_true = tf.placeholder(tf.float32)

zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
ones = array_ops.ones_like(logits, dtype=logits.dtype)

# score of groundtruth
logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)

# binary mask for support vectors
I_k = array_ops.where(logit_y >= logits, zeros, ones)

# indicator function
h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))

h = s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k))
ce = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=tf.add(s * logits, h))
ce = tf.reduce_mean(ce)
# softmax = tf.exp(s * logits) / (tf.reshape(
#                  tf.reduce_sum(tf.multiply(tf.exp(s * logits), h), axis=-1, keepdims=True), 
#                  [-1, 1]) + epsilon)
# ce = tf.reduce_mean(-tf.reduce_mean(y_true * tf.log(softmax), reduction_indices=[1]))

with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    print(sess.run(ce, feed_dict={s: 1, t: 1.2, m: 0., logits: logits_array, y_true: y_true_array}))

4.3660855


In [59]:
with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    print(sess.run(ce, feed_dict={s: 1, t: 1.05, m: 0., logits: logits_array, y_true: y_true_array}))

3.5917118


In [39]:
with tf.Session() as sess:
    logits_array = np.array([[2., 3., 1., -1.], [-1., 2.1, 2., 6], [-2., 3., 4, -2.1]])
    y_true_array = np.array([[0., 1., 0., 0], [0., 0., 1., 0], [1., 0., 0., 0.]])
    print(sess.run(softmax, feed_dict={s: 1, t: 1.05, m: 0., logits: logits_array, y_true: y_true_array}))

[[2.4178253e-01 6.5723306e-01 8.8946819e-02 1.2037644e-02]
 [8.7725703e-04 1.9473307e-02 1.7620180e-02 9.6202922e-01]
 [1.8058793e-03 2.6801628e-01 7.2854376e-01 1.6340273e-03]]


In [None]:
import tensorflow as tf
from tensorflow.python.ops import array_ops


def sv_softmax_loss(t=1.0, s=1):

    t = float(t)
    s = float(s)
    
    def sv_softmax_loss_fixed(y_true, logits):
        """SV-Softmax loss
        Notice: y_pred is raw logits
        Support Vector Guided Softmax Loss for Face Recognition
        https://arxiv.org/pdf/1812.11317.pdf

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:

        Returns:
            [tensor] -- loss.
        """
        epsilon = 1.e-9
        zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
        ones = array_ops.ones_like(logits, dtype=logits.dtype)
        
        logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
        I_k = array_ops.where(logit_y >= logits, zeros, ones)
        
        h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))
        
        softmax = tf.exp(s * logits) / (tf.reshape(
                         tf.reduce_sum(tf.multiply(tf.exp(s * logits), h), axis=-1, keepdims=True), 
                         [-1, 1]) + epsilon)
        
        # We add epsilon because log(0) = nan
        softmax = tf.add(softmax, epsilon)
        ce = tf.multiply(y_true, -tf.log(softmax))
        ce = tf.reduce_sum(ce, axis=1)
        return tf.reduce_mean(ce)
    
    return sv_softmax_loss_fixed


def sv_am_softmax_loss(t=1.0, s=1, m=0.35):

    t = float(t)
    s = float(s)
    m = float(m)
    
    def sv_am_softmax_loss_fixed(y_true, logits):
        """Softmax loss for multi-classification
        Notice: y_pred is raw logits
        Support Vector Guided Softmax Loss for Face Recognition
        https://arxiv.org/pdf/1812.11317.pdf

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:

        Returns:
            [tensor] -- loss.
        """
        epsilon = 1.e-9
        zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
        ones = array_ops.ones_like(logits, dtype=logits.dtype)
        
        logit_y = tf.reduce_sum(tf.multiply(y_true, logits), axis=-1, keepdims=True)
        I_k = array_ops.where(logit_y - m >= logits, zeros, ones)
        
        # I_k should be zero for GT score
        I_k = I_k * tf.cast(tf.not_equal(y_true, 1), tf.float32)
        # h = tf.exp(s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k)))

#         logits = logits - m * y_true
#         softmax = tf.exp(logits- m * y_true) / tf.reduce_sum(tf.multiply(tf.exp(logits- m * y_true), h), 
#                                                  axis=-1, keepdims=True)

#         softmax = tf.exp(s * (logits- m * y_true)) / (tf.reshape(
#                   tf.reduce_sum(tf.multiply(tf.exp(s * (logits- m * y_true)), h), axis=-1, keepdims=True), 
#                   [-1, 1]) + epsilon)
        
        # We add epsilon because log(0) = nan
#         softmax = tf.add(softmax, epsilon)
#         ce = tf.multiply(y_true, -tf.log(softmax))
#         ce = tf.reduce_sum(ce, axis=1)

        h = s * tf.multiply(t - 1., tf.multiply(logits + 1., I_k))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true, logits=tf.add(s * (logits- m * y_true), h))
        # ce = tf.reduce_mean(ce)
        return tf.reduce_mean(ce)
    
    return sv_am_softmax_loss_fixed


def focal_loss(gamma=2., alpha=4.):

    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-classification
        FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
        Notice: y_pred is probability after softmax
        gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
        d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)
        Focal Loss for Dense Object Detection
        https://arxiv.org/abs/1708.02002

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:
            gamma {float} -- (default: {2.0})
            alpha {float} -- (default: {4.0})

        Returns:
            [tensor] -- loss.
        """
        epsilon = 1.e-9
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)
        y_pred = tf.nn.softmax(y_pred)
        
        model_out = tf.add(y_pred, epsilon)
        ce = tf.multiply(y_true, -tf.log(model_out))
        weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
        fl = tf.multiply(alpha, tf.multiply(weight, ce))
        reduced_fl = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(reduced_fl)
    return focal_loss_fixed
