# Softmax activation and CrossEntropy loss

In this notebook we're going to go over Softmax as an activation function (which returns probabilities) and CrossEntropy as a loss function. <br>

Softmax is an activation function that is commonly used for classification tasks. It is applied to the raw scores produced by the output layer, squashing them into values between 0 and 1 that sum to 1 so they can be read as a list of probabilities; the class with the highest probability is taken as the answer for classification. <br>

CrossEntropy loss measures the performance of a classification model whose output is a probability between 0 and 1, and it can be used in multi-class problems. The loss increases as the predicted probability diverges from the actual label, so the better the prediction, the lower the loss.

In [1]:
# Imports
import torch
import torch.nn as nn
import numpy as np

In [2]:
# A numpy implementation of softmax
def softmax(x):
    """Return the softmax of ``x`` computed along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. The
    common factor cancels in the ratio, so the result is mathematically
    unchanged, but ``np.exp`` can no longer overflow to ``inf`` (which
    would turn the ratio into ``nan``) when the scores are large.

    Args:
        x: Array-like of raw scores; normalization runs over the first axis.

    Returns:
        A NumPy array with the same shape as ``x`` whose entries along
        axis 0 sum to 1.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        # np.max rejects empty reductions; there is nothing to normalize.
        return np.exp(scores)
    # Every shifted score is <= 0, so each exponential is at most 1.
    exps = np.exp(scores - np.max(scores, axis=0))
    return exps / np.sum(exps, axis=0)

# Raw scores fed to both implementations below
raw_scores = [2.0, 1.0, 0.1]

# NumPy: run the hand-written softmax
x = np.array(raw_scores)
outputs = softmax(x)
print("Softmax numpy: ", outputs)

# PyTorch: the built-in softmax on the same scores
x = torch.tensor(raw_scores)
outputs = torch.softmax(x, dim=0)  # dim=0 is the only axis of this 1-D tensor
print("Softmax torch: ", outputs)

Softmax numpy:  [0.65900114 0.24243297 0.09856589]
Softmax torch:  tensor([0.6590, 0.2424, 0.0986])


In [8]:
# A numpy implementation of cross entropy
def cross_entropy(actual, predicted, eps=1e-15):
    """Return the summed cross-entropy between one-hot labels and probabilities.

    Computes ``-sum(actual * log(predicted))``. Predicted probabilities are
    floored at ``eps`` first: a probability of exactly 0 would make
    ``np.log`` return ``-inf``, and the product ``0 * -inf`` would turn the
    whole loss into ``nan`` even for a perfect prediction.

    Args:
        actual: One-hot encoded target vector (array-like).
        predicted: Predicted probabilities, same shape as ``actual``.
        eps: Smallest probability that is passed to the logarithm.

    Returns:
        The loss as a scalar. It is a plain sum and is not normalized;
        divide by ``predicted.shape[0]`` to normalize it.
    """
    targets = np.asarray(actual)
    # Floor the probabilities so log() never sees 0.
    probs = np.maximum(np.asarray(predicted, dtype=np.float64), eps)
    return -np.sum(targets * np.log(probs))

# NumPy demo: cross_entropy expects the target as a one-hot vector
y = np.array([1, 0, 0])  # 1 marks the true class, 0 every other class
y_pred_good = np.array([0.7, 0.2, 0.1])
y_pred_bad = np.array([0.1, 0.3, 0.6])
l1, l2 = (cross_entropy(y, guess) for guess in (y_pred_good, y_pred_bad))
print(f"Cross entropy numpy good: {l1:.4f}")
print(f"Cross entropy numpy bad:  {l2:.4f}")

# PyTorch demo
# nn.CrossEntropyLoss applies softmax itself, so the predictions passed to it
# are raw scores and the model must not apply softmax on its own.
# The targets stay plain class labels; no one-hot encoding is needed.
loss = nn.CrossEntropyLoss()

# Three samples, one class label each (not one-hot encoded)
y = torch.tensor([2, 0, 1])
# Raw scores without softmax, size n_samples x n_classes = 3 x 3
y_pred_good = torch.tensor([
    [0.1, 1.0, 2.1],
    [2.0, 1.0, 0.1],
    [0.1, 3.0, 0.1],
])
y_pred_bad = torch.tensor([
    [2.1, 1.0, 0.1],
    [0.1, 1.0, 2.1],
    [0.1, 3.0, 0.1],
])

l1 = loss(y_pred_good, y)
l2 = loss(y_pred_bad, y)
print(f"Cross entropy torch good: {l1.item():.4f}")
print(f"Cross entropy torch bad:  {l2.item():.4f}")

# The predicted class is the index of the largest score in each row,
# i.e. the maximum taken over dim 1 (the class dimension)
predictions1 = torch.max(y_pred_good, dim=1).indices
predictions2 = torch.max(y_pred_bad, dim=1).indices
print(f"Predictions good: {predictions1}")
print(f"Predictions bad:  {predictions2}")

# NOTE: as shown above, the PyTorch loss handles several samples in one call


Cross entropy numpy good: 0.3567
Cross entropy numpy bad:  2.3026
Cross entropy torch good: 0.3018
Cross entropy torch bad:  1.6242
Predictions good: tensor([2, 0, 1])
Predictions bad:  tensor([0, 2, 1])


In [9]:
# This is how a model might be implemented for multiclass problems
# Keeping in mind not to use softmax in the model
class NeuralNet(nn.Module):
    """Two-layer fully connected network that returns raw class scores.

    ``forward`` deliberately applies no softmax: the scores are meant to be
    passed to ``nn.CrossEntropyLoss``, which applies softmax itself.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # input -> hidden
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        # hidden -> one score per class (output size 1 for binary classification)
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the un-normalized scores for ``x`` (no softmax applied)."""
        # For binary classification one would instead return
        # torch.sigmoid(...) of the final linear output.
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

# Three-class model: input_size is 28*28 = 784, with 5 hidden units
model = NeuralNet(
    input_size=28 * 28,
    hidden_size=5,
    num_classes=3,
)
# CrossEntropyLoss applies softmax itself, so the model returns raw scores
loss = nn.CrossEntropyLoss()
# Binary classification would instead use a single sigmoid output together
# with binary cross entropy: loss = nn.BCELoss()
