# Playground for PyTorch Loss Functions

In [1]:
import os

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

## Import data from a CSV file using Pandas
Download link: https://archive.ics.uci.edu/ml/datasets/Iris

In [2]:
# Read data from CSV file
file_path = os.path.join('data', 'iris.data')
df = pd.read_csv(
    file_path,
    header=None,
    names=['SepalLength', 'SepalWidth',
           'PetalLength', 'PetalWidth', 'class'],
    dtype={'SepalLength': np.float32,
           'SepalWidth': np.float32,
           'PetalLength': np.float32,
           'PetalWidth': np.float32,
           'class': np.str},
)
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Transform Categorical Attribute into Integer Encoding

In [3]:
df['class'].astype('category')

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 150, dtype: category
Categories (3, object): [Iris-setosa, Iris-versicolor, Iris-virginica]

In [4]:
df['class'] = df['class'].astype('category')
df['class'] = df['class'].cat.codes
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Split the Data into Train Set and Test Set

In [5]:
# Shuffle data
n = len(df.index)
shuffled_indices = np.random.permutation(n)
df = df.iloc[shuffled_indices]
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
126,6.2,2.8,4.8,1.8,2
105,7.6,3.0,6.6,2.1,2
32,5.2,4.1,1.5,0.1,0
146,6.3,2.5,5.0,1.9,2
76,6.8,2.8,4.8,1.4,1


In [6]:
# Split train/test sets
n = len(df.index)
num_train = int(n * 0.6)
num_test = n - num_train
x_train = df.iloc[:num_train, :4].values
y_train = df.iloc[:num_train, -1].values.astype(np.long)
x_test = df.iloc[-num_test:, :4].values
y_test = df.iloc[-num_test:, -1].values.astype(np.long)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_train.dtype, x_test.dtype)

(90, 4) (90,)
(60, 4) (60,)
float32 float32


## From Numpy Array to PyTorch DataLoader

In [7]:
class NpDataset(Dataset):
    """Convert Numpy array into PyTorch Dataset"""
    def __init__(self, data, label):
        self.data = torch.from_numpy(data)
        self.label = torch.from_numpy(label)
    
    def __getitem__(self, index):
        return self.data[index], self.label[index]
    
    def __len__(self):
        return len(self.data)

In [8]:
# Prepare DataLoader for PyTorch
train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=False,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
)
x, y = next(iter(test_dataloader))
print(x.size(), y.size())

torch.Size([60, 4]) torch.Size([60])


## Define the Neural Network Model

In [9]:
class IrisNN(nn.Module):
    def __init__(self):
        super(IrisNN, self).__init__()
        self.fn1 = nn.Linear(4, 6)
        self.fn2 = nn.Linear(6, 3)
    
    def forward(self, x):
        x = F.relu(self.fn1(x))
        x = self.fn2(x)
#         x = F.log_softmax(x, dim=1)
        return x
        
model = IrisNN()

In [10]:
score = model(x[:2])
score

tensor([[ 1.0681, -1.3097,  1.5607],
        [ 0.8679, -1.1022,  1.3850]], grad_fn=<AddmmBackward>)

In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

cuda:0


IrisNN(
  (fn1): Linear(in_features=4, out_features=6, bias=True)
  (fn2): Linear(in_features=6, out_features=3, bias=True)
)

## Choosing Loss Function and Optimizer

In [12]:
# loss_fn = nn.NLLLoss()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

## Backpropagation for Updating the Parameters

In [13]:
def l2_reg(params):
    l2 = torch.tensor(0.).to(device)
    for param in params:
        l2 += param.norm()
    return l2

In [14]:
def train():
    model.train()
    
    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)
        n = x.size(0)
        
        optimizer.zero_grad()
        score = model(x)
        loss = loss_fn(score, y) + 0.01*l2_reg(model.parameters())
#         loss = loss_fn(score, y)
        
        
        loss.backward()
        optimizer.step()
        
        predictions = score.max(1, keepdim=True)[1]
        num_correct = predictions.eq(y.view_as(predictions)).sum().item()
    
    acc = num_correct / n
    return loss, acc

## Passing Forward for Evaluation

In [15]:
def evaluate():
    model.eval()
    
    with torch.no_grad():
        for x, y in test_dataloader:
            x = x.to(device)
            y = y.to(device)
            n = x.size(0)
            
            score = model(x)
            loss = loss_fn(score, y)
            predictions = score.max(1, keepdim=True)[1]
            num_correct = predictions.eq(y.view_as(predictions)).sum().item()
    
    acc = num_correct / n
    return loss, acc

## Let's Start Training

In [16]:
seed = 4096
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
            
max_epochs = 100
for epoch in range(max_epochs):
    tr_loss, tr_acc = train()
    eva_loss, eva_acc = evaluate()
    print(f'[{epoch}/{max_epochs}] Train loss: {tr_loss:.4f} acc: {tr_acc*100:.2f} - Test loss: {eva_loss:.4f} acc: {eva_acc*100:.2f}')
    
# [0/100] Train loss: 1.3462 acc: 27.78 - Test loss: 1.2278 acc: 41.67    
# [99/100] Train loss: 0.2368 acc: 97.78 - Test loss: 0.2155 acc: 96.67

[0/100] Train loss: 1.6782 acc: 28.89 - Test loss: 1.3612 acc: 40.00
[1/100] Train loss: 1.6101 acc: 28.89 - Test loss: 1.2876 acc: 40.00
[2/100] Train loss: 1.4964 acc: 28.89 - Test loss: 1.2096 acc: 40.00
[3/100] Train loss: 1.3671 acc: 26.67 - Test loss: 1.1521 acc: 15.00
[4/100] Train loss: 1.2534 acc: 13.33 - Test loss: 1.1364 acc: 1.67
[5/100] Train loss: 1.1810 acc: 0.00 - Test loss: 1.1701 acc: 28.33
[6/100] Train loss: 1.1622 acc: 36.67 - Test loss: 1.2391 acc: 28.33
[7/100] Train loss: 1.1871 acc: 36.67 - Test loss: 1.3121 acc: 28.33
[8/100] Train loss: 1.2287 acc: 36.67 - Test loss: 1.3588 acc: 28.33
[9/100] Train loss: 1.2593 acc: 36.67 - Test loss: 1.3640 acc: 28.33
[10/100] Train loss: 1.2638 acc: 36.67 - Test loss: 1.3302 acc: 28.33
[11/100] Train loss: 1.2426 acc: 36.67 - Test loss: 1.2726 acc: 28.33
[12/100] Train loss: 1.2063 acc: 36.67 - Test loss: 1.2089 acc: 28.33
[13/100] Train loss: 1.1686 acc: 36.67 - Test loss: 1.1528 acc: 28.33
[14/100] Train loss: 1.1387 acc:

## Expected Initial Loss

In [17]:
expect_loss = np.log(3)
expect_loss

1.0986122886681098

## Print parameters
Don't do it on large nerual network!

In [19]:
for p in model.parameters():
    print(p)

# Parameter containing:
# tensor([[-0.3159, -0.4066, -0.2228,  0.3912],
#         [-0.4637, -0.0598,  0.1091, -0.4652],
#         [-0.1581, -0.4901, -0.4268, -0.3081],
#         [ 0.6481,  0.5173, -0.7721, -0.1273],
#         [ 0.6848,  0.0882, -0.2875, -0.6507],
#         [ 0.0285, -0.5087,  1.1340,  0.8644]], device='cuda:0',
#        requires_grad=True)
# Parameter containing:
# tensor([ 0.4440, -0.0441, -0.3559,  0.4312,  0.0881, -0.2388], device='cuda:0',
#        requires_grad=True)
# Parameter containing:
# tensor([[ 0.2153,  0.3459, -0.1325,  0.8138,  0.3868, -0.9468],
#         [ 0.3085,  0.2449, -0.1748,  0.1355,  0.2751,  0.4146],
#         [-0.0276,  0.2333, -0.1011, -0.8747, -0.6208,  1.1263]],
#        device='cuda:0', requires_grad=True)
# Parameter containing:
# tensor([ 0.3609, -0.1801, -0.1377], device='cuda:0', requires_grad=True)

Parameter containing:
tensor([[ 6.4940e-01,  1.6931e-01, -5.1571e-01, -5.1580e-01],
        [ 4.6394e-01,  7.8318e-01, -5.5839e-01, -8.0084e-01],
        [-2.8041e-01, -4.3104e-01, -1.5579e-02, -4.1927e-02],
        [-4.4076e-01,  8.7070e-02,  3.8718e-01, -1.3657e-01],
        [-3.8608e-01, -4.5914e-01,  4.0574e-01,  2.9218e-01],
        [ 7.4699e-02, -3.2551e-01,  1.1816e+00,  9.3453e-04]], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([-0.2536,  0.0536, -0.3241, -0.1278,  0.3421, -0.0803], device='cuda:0',
       requires_grad=True)
Parameter containing:
tensor([[ 0.4034,  0.6951,  0.1387,  0.2628,  0.0015, -0.7449],
        [ 0.1327, -0.1364, -0.3493,  0.3160,  0.2932,  0.4585],
        [-0.7156, -1.0724,  0.3656, -0.1596, -0.2651,  0.7777]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([ 0.1948, -0.1498, -0.0759], device='cuda:0', requires_grad=True)


## Delving into Loss Functions for Classification Problems

Using the toy example from Lecture 3 - Loss Functions and Optimization from Stanford University on YouTube  
Link: https://youtu.be/h7iBpEHGVNc?t=502


- Suppose: 3 training examples within 3 classes
- 1 cat, 1 car, 1 frog

### Breakdown into 4 Steps

1. What
1. When
1. How
1. Why

### Negative Log Likelihood Loss - `NLLLoss`

#### What

- Same as **Sparse Categorical Cross Entropy** in Tensorflow
- The most commonly used loss function for classification problem in neural network

#### When

- Y is in integer encoding. eg: [0, 1, 2, 3]
- Multi-classes
- Single label - Each train sample is assigned to only 1 label.
- Very common, but not the goto loss function in PyTorch

#### How

- Y is in integer encoding. It starts from 0.  
    eg: Y = [0, 2, 1] coresponding to x1 has Class 0, x2 has Class 2, x3 has Class 1.
- Use `log_softmax` function on the output layer
- The # of nodes from output score is same as the # of labels

In [None]:
score = torch.FloatTensor([
        [3.2, 5.1, -1.7],
        [1.3, 4.9, 2],
        [2.2, 2.5, -3.1],
])
y = torch.LongTensor([0, 1, 2])

In [None]:
s = F.log_softmax(score, dim=1)
print(s, '\n')

loss_fn = nn.NLLLoss()
loss = loss_fn(s, y)
print(loss)

#### Why

In [None]:
# rescale arbitary values into probability in range [0, 1]
s = F.softmax(score, dim=1)
print(x, '\n')

# logarithm is a monotonic decrease function. log(x) is negative when 0= < x <= 0.
# So we add negative sign to inverse that.
s = -torch.log(s)
print(x, '\n')

# For each train example, we pick the score based on the index of the label.
# And then compute it's mean.
loss = (s[0][y[0]] + s[1][y[1]] + s[2][y[2]]) / 3
print(loss)

#### Properties

- The expected inital value is `-log(1/num_classes) == log(num_classes)` All nodes from score has the same probability, 1/num_classes.
- Minimal: 0
- Maximal: inf

### Cross Entropy Loss - `CrossEntropyLoss`

#### What

Combines `LogSoftmax` and `NLLoss` into a single function.

#### When

- It applies to the same train set as `NLLoss`.
- When you don't need to output probability of a prediction.

#### How

Same as `NLLoss`, but it does NOT require `log_softmax` activation function

#### Why

- It saves 1 step when computing forward prediction.
- It's easier for testing the resistance of your model against adversarial attacks.
- There's no extra parameters to train, so `CrossEntropyLoss` and `NLLoss` are interchangeable

In [None]:
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(score, y)
print(loss)

### Multi-Class Classification Hinge loss
Hinge loss function for SVM

#### What

#### When

#### How 

#### Why

In [None]:
score = torch.FloatTensor([
        [3.2, 5.1, -1.7],
        [1.3, 4.9, 2],
        [2.2, 2.5, -3.1],
])
y = torch.LongTensor([0, 1, 2])

In [None]:
loss_fn = nn.MultiMarginLoss(p=1, margin=1.0, reduction='none')
loss = loss_fn(score, y)
print(loss)

loss_fn = nn.MultiMarginLoss(p=1, margin=1.0)
loss = loss_fn(score, y)
print(loss)

In [None]:
s_diag = torch.diag(score).unsqueeze(1)
print(s_diag, '\n')

s_y = torch.mm(s_diag, torch.ones(1,3))
margin = score - s_y + 1
margin = margin - torch.eye(3)
print(margin, '\n')

z = torch.zeros(3, 3)
margin = torch.max(margin, z)
print(margin.sum(dim=1), '\n')

loss = margin.sum() / 9
print(loss)

In [None]:
# min
y_mat = -torch.ones(3, 3) + 2*torch.eye(3)
print(y_mat, '\n')

x = 1e10 * y_mat
print(x, '\n')

print(loss_fn(x, y))

In [None]:
x = -1e10 * y_mat
print(x, '\n')

print(loss_fn(x, y))

### Multi-class multi-classification hinge loss
Use this method when a train example can be signed with multiple labels.

In [None]:
score = torch.FloatTensor([
        [3.2, 5.1, -1.7],
        [1.3, 4.9, 2],
        [2.2, 2.5, -3.1],
])
y = torch.LongTensor([0, 1, 2])

In [None]:
y_multi_hinge = torch.LongTensor(range(3)).unsqueeze(dim=1).mm(torch.ones(1, 3, dtype=torch.long))
print(y_multi_hinge, '\n')

loss_fn = nn.MultiLabelMarginLoss()
loss = loss_fn(score, y_multi_hinge)
print(loss)

In [None]:
s_y = torch.mm(torch.diag(score).unsqueeze(1), torch.ones(1,3))
print(s_y, '\n')

s_y = torch.mm(s_diag, torch.ones(1,3))
margin = score - s_y + 1
margin = margin - torch.eye(3)
print(margin, '\n')

z = torch.zeros(3, 3)
margin = torch.max(margin, z)
print(margin.sum(dim=1), '\n')

loss = margin.sum() / 3
print(loss)

### Binary Cross Entropy Loss
Using this method when the last layer is `Sigmoid` function and labels are using **one-hot encoding**.

In [None]:
x = torch.sigmoid(score)
print(x, '\n')
# Binary Cross Entropy Loss
y_oh = torch.eye(3)
print(y_oh, '\n')
loss_fn = nn.BCELoss()
loss = loss_fn(x, y_oh)
print(loss)

In [None]:
loss = -(y_oh*torch.log(x) + (1-y_oh)*torch.log(1-x))
loss = loss.mean()
print(loss)

### Binary Cross Entropy Loss with Logits
Combine the `Sigmoid` layer and the `BCELoss` into one function.

In [None]:
loss_fn = nn.BCEWithLogitsLoss()
loss = loss_fn(score, y_oh)
print(loss)

### Soft Margin Loss
- It's similar to hinge loss but with smooth curve.
- Adventage: Differentiable!
- y in {-1, 1}

In [None]:
score = torch.FloatTensor([
        [3.2, 5.1, -1.7],
        [1.3, 4.9, 2],
        [2.2, 2.5, -3.1],
])
y = -torch.ones(3, 3) + 2*torch.eye(3)
print(y)

In [None]:
loss_fn = nn.SoftMarginLoss()
loss = loss_fn(score, y)
print(loss)

In [None]:
torch.log(1 + torch.exp(-y * score))

In [None]:
torch.log(1 + torch.exp(-y * score)).mean()

In [None]:
# min
x = 1e10 * y
print(x, '\n')
loss_fn(x, y)

In [None]:
# max
x = -1e10 * y
print(x, '\n')
loss_fn(x, y)

### Multi-Label Soft Margin Loss

- Multi-label one-versus-all
- y in {0, 1} 

In [None]:
score = torch.FloatTensor([
        [3.2, 5.1, -1.7],
        [1.3, 4.9, 2],
        [2.2, 2.5, -3.1],
])
y = torch.eye(3)
print(y)

In [None]:
loss_fn = nn.MultiLabelSoftMarginLoss()
loss = loss_fn(score, y)
print(loss)

In [None]:
# min
x = 1e10 * (-torch.ones(3) + 2*torch.eye(3))
print(x, '\n')
loss_fn(x, y)

In [None]:
# max
x = -1e10 * (-torch.ones(3) + 2*torch.eye(3))
loss_fn(x, y)