# Playground for PyTorch Loss Functions

In [1]:
import os

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

## Import data from a CSV file using Pandas
Download link: https://archive.ics.uci.edu/ml/datasets/Iris

In [2]:
# Read the Iris measurements from the CSV file. header=None tells pandas the
# file has no header row, so the column names are supplied explicitly.
file_path = os.path.join('data', 'iris.data')
df = pd.read_csv(
    file_path,
    header=None,
    names=['SepalLength', 'SepalWidth',
           'PetalLength', 'PetalWidth', 'class'],
    # The four measurements are stored as float32; the class name stays a string.
    # The builtin `str` is used because the `np.str` alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    dtype={'SepalLength': np.float32,
           'SepalWidth': np.float32,
           'PetalLength': np.float32,
           'PetalWidth': np.float32,
           'class': str},
)
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Transform Categorical Attribute into Integer Encoding

In [3]:
# Preview the label column as a pandas categorical (displayed only, not stored back into df)
df['class'].astype('category')

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 150, dtype: category
Categories (3, object): [Iris-setosa, Iris-versicolor, Iris-virginica]

In [4]:
# Replace every class name by the integer code of its category
df['class'] = df['class'].astype('category').cat.codes
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Split the Data into Train Set and Test Set

In [5]:
# Shuffle data.
# A dedicated, fixed-seed generator is used so that the row order (and hence the
# train/test split taken from it) is the same on every fresh run of the notebook.
shuffle_seed = 4096
n = len(df.index)
shuffled_indices = np.random.RandomState(shuffle_seed).permutation(n)
df = df.iloc[shuffled_indices]
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,class
71,6.1,2.8,4.0,1.3,1
60,5.0,2.0,3.5,1.0,1
96,5.7,2.9,4.2,1.3,1
9,4.9,3.1,1.5,0.1,0
51,6.4,3.2,4.5,1.5,1


In [6]:
# Split train/test sets: the first 60% of the rows become the train set and the
# remaining rows become the test set.
n = len(df.index)
num_train = int(n * 0.6)
num_test = n - num_train

# Columns 0-3 are the features, the last column is the integer class code.
# Labels are cast to np.int64 explicitly: the `np.long` alias was removed in
# NumPy 1.24, and a fixed 64-bit type converts to a torch LongTensor on every platform.
x_train = df.iloc[:num_train, :4].values
y_train = df.iloc[:num_train, -1].values.astype(np.int64)
# Slice from num_train onward rather than with -num_test: a negative-zero slice
# (num_test == 0) would select every row instead of none.
x_test = df.iloc[num_train:, :4].values
y_test = df.iloc[num_train:, -1].values.astype(np.int64)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_train.dtype, x_test.dtype)

(90, 4) (90,)
(60, 4) (60,)
float32 float32


## From Numpy Array to PyTorch DataLoader

In [7]:
class NpDataset(Dataset):
    """Wrap a pair of NumPy arrays (samples and labels) as a PyTorch Dataset.

    Args:
        data: NumPy array of samples, indexed along its first axis.
        label: NumPy array of labels with the same length as ``data``.

    Raises:
        ValueError: If ``data`` and ``label`` do not have the same length.
    """
    def __init__(self, data, label):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError in the middle of iterating over a DataLoader.
        if len(data) != len(label):
            raise ValueError(
                f'data and label must have the same length, '
                f'got {len(data)} and {len(label)}'
            )
        # torch.from_numpy creates tensors that share memory with the arrays.
        self.data = torch.from_numpy(data)
        self.label = torch.from_numpy(label)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        return self.data[index], self.label[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

In [8]:
# Wrap both splits as Datasets, then build one DataLoader per split with the
# same settings (batches of up to 128 samples, served in stored order).
train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)
train_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=128, shuffle=False)
    for dataset in (train_dataset, test_dataset)
)

# Pull the first test batch to check the tensor shapes
x, y = next(iter(test_dataloader))
print(x.size(), y.size())

torch.Size([60, 4]) torch.Size([60])


## Define the Neural Network Model

In [9]:
class IrisNN(nn.Module):
    """Two-layer fully connected net: 4 features -> 6 hidden units (ReLU) -> 3 class scores."""

    def __init__(self):
        super().__init__()
        self.fn1 = nn.Linear(4, 6)
        self.fn2 = nn.Linear(6, 3)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch ``x``."""
        hidden = F.relu(self.fn1(x))
        # The scores are returned as-is; F.log_softmax(scores, dim=1) would be
        # applied here if the model were paired with nn.NLLLoss instead.
        return self.fn2(hidden)


model = IrisNN()

In [10]:
# Sanity check: run the untrained model on the first two samples of the test batch
score = model(x[:2])
score

tensor([[ 0.1231, -1.0510,  1.0529],
        [ 0.2989, -1.3437,  1.1933]], grad_fn=<AddmmBackward>)

In [11]:
# Use the first CUDA GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)
model.to(device)

cuda:0


IrisNN(
  (fn1): Linear(in_features=4, out_features=6, bias=True)
  (fn2): Linear(in_features=6, out_features=3, bias=True)
)

## Choosing Loss Function and Optimizer

In [12]:
# Cross-entropy is applied to the raw scores returned by the model
# (nn.NLLLoss would be the alternative if the model emitted log-probabilities).
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)

## Backpropagation for Updating the Parameters

In [13]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``. Loss and accuracy are accumulated over every batch, so the
    result stays correct when the loader yields more than one batch.

    Returns:
        tuple: ``(loss, acc)`` - the mean training loss per sample as a Python
        float, and the fraction of correctly classified samples in ``[0, 1]``.

    Raises:
        ValueError: If ``train_dataloader`` yields no samples.
    """
    model.train()

    total_loss = 0.0
    num_correct = 0
    num_samples = 0
    for x, y in train_dataloader:
        x = x.to(device)
        y = y.to(device)
        n = x.size(0)

        optimizer.zero_grad()
        score = model(x)
        loss = loss_fn(score, y)

        loss.backward()
        optimizer.step()

        # Assumes loss_fn averages over the batch (the default reduction of
        # nn.CrossEntropyLoss), so weight by batch size before summing.
        # .item() detaches the value so no autograd graph is kept alive.
        total_loss += loss.item() * n
        predictions = score.argmax(dim=1)
        num_correct += (predictions == y).sum().item()
        num_samples += n

    if num_samples == 0:
        raise ValueError('train_dataloader yielded no samples')

    return total_loss / num_samples, num_correct / num_samples

## Passing Forward for Evaluation

In [14]:
def evaluate():
    """Evaluate the model on every batch of ``test_dataloader`` without gradients.

    Relies on the notebook-level ``model``, ``loss_fn`` and ``device``. Loss
    and accuracy are accumulated over every batch, so the result stays correct
    when the loader yields more than one batch.

    Returns:
        tuple: ``(loss, acc)`` - the mean test loss per sample as a Python
        float, and the fraction of correctly classified samples in ``[0, 1]``.

    Raises:
        ValueError: If ``test_dataloader`` yields no samples.
    """
    model.eval()

    total_loss = 0.0
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_dataloader:
            x = x.to(device)
            y = y.to(device)
            n = x.size(0)

            score = model(x)
            loss = loss_fn(score, y)

            # Assumes loss_fn averages over the batch (the default reduction of
            # nn.CrossEntropyLoss), so weight by batch size before summing.
            total_loss += loss.item() * n
            predictions = score.argmax(dim=1)
            num_correct += (predictions == y).sum().item()
            num_samples += n

    if num_samples == 0:
        raise ValueError('test_dataloader yielded no samples')

    return total_loss / num_samples, num_correct / num_samples

## Let's Start Training

In [15]:
# Seed PyTorch's CPU generator and, when CUDA is present, the generator of every GPU.
# NOTE(review): `model` was instantiated in an earlier cell, so this seed is set
# after the initial weights were drawn and does not influence them.
seed = 4096
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# One pass over the train set followed by one evaluation pass per epoch
max_epochs = 100
for epoch in range(max_epochs):
    (tr_loss, tr_acc), (eva_loss, eva_acc) = train(), evaluate()
    print(f'[{epoch}/{max_epochs}] Train loss: {tr_loss:.4f} acc: {tr_acc*100:.2f} - Test loss: {eva_loss:.4f} acc: {eva_acc*100:.2f}')

# Reference lines recorded by the author (presumably from a different run):
# [0/100] Train loss: 1.3462 acc: 27.78 - Test loss: 1.2278 acc: 41.67
# [99/100] Train loss: 0.2368 acc: 97.78 - Test loss: 0.2155 acc: 96.67

[0/100] Train loss: 1.5639 acc: 26.67 - Test loss: 1.2292 acc: 43.33
[1/100] Train loss: 1.4761 acc: 26.67 - Test loss: 1.1796 acc: 43.33
[2/100] Train loss: 1.3510 acc: 26.67 - Test loss: 1.1676 acc: 18.33
[3/100] Train loss: 1.2470 acc: 8.89 - Test loss: 1.2118 acc: 5.00
[4/100] Train loss: 1.1975 acc: 4.44 - Test loss: 1.2992 acc: 26.67
[5/100] Train loss: 1.2015 acc: 37.78 - Test loss: 1.3967 acc: 26.67
[6/100] Train loss: 1.2342 acc: 37.78 - Test loss: 1.4681 acc: 26.67
[7/100] Train loss: 1.2642 acc: 37.78 - Test loss: 1.4903 acc: 26.67
[8/100] Train loss: 1.2701 acc: 37.78 - Test loss: 1.4599 acc: 26.67
[9/100] Train loss: 1.2468 acc: 37.78 - Test loss: 1.3903 acc: 26.67
[10/100] Train loss: 1.2032 acc: 37.78 - Test loss: 1.3021 acc: 26.67
[11/100] Train loss: 1.1540 acc: 37.78 - Test loss: 1.2145 acc: 26.67
[12/100] Train loss: 1.1122 acc: 37.78 - Test loss: 1.1400 acc: 23.33
[13/100] Train loss: 1.0846 acc: 37.78 - Test loss: 1.0837 acc: 11.67
[14/100] Train loss: 1.0716 acc: 

## Expected Initial Loss

In [16]:
# Cross-entropy of a uniform guess over the 3 classes: -log(1/3) = log(3)
expect_loss = np.log(3)
expect_loss

1.0986122886681098

## Delving into Loss Functions for Classification Problems

This section uses the toy example from Lecture 3 - Loss Functions and Optimization from Stanford University on YouTube.  
Link: https://youtu.be/h7iBpEHGVNc?t=502


- Suppose: 3 training examples within 3 classes
- 1 cat, 1 car, 1 frog

### Multi-Class Classification Hinge loss
Hinge loss function for SVM

In [64]:
# Raw class scores: one row per training example, one column per class
score = torch.tensor(
    [[3.2, 5.1, -1.7],
     [1.3, 4.9, 2],
     [2.2, 2.5, -3.1]],
    dtype=torch.float32,
)
# Index of the correct class for each example
y = torch.tensor([0, 1, 2], dtype=torch.long)

In [65]:
# Per-example hinge losses (no reduction)
loss = nn.MultiMarginLoss(p=1, margin=1.0, reduction='none')(score, y)
print(loss)

# Same criterion with the default reduction, giving a single scalar
loss_fn = nn.MultiMarginLoss(p=1, margin=1.0)
loss = loss_fn(score, y)
print(loss)

tensor([0.9667, 0.0000, 4.3000])
tensor(1.7556)


In [66]:
# Score of the correct class of each example as a column vector; with
# y = [0, 1, 2] these are the entries on the main diagonal of `score`.
s_diag = score.diag().unsqueeze(1)
print(s_diag, '\n')

# Repeat the correct-class score across its row, then form
# s_j - s_y + 1 and cancel the j == y entries (which equal the margin 1).
s_y = s_diag.expand(3, 3)
margin = (score - s_y + 1) - torch.eye(3)
print(margin, '\n')

# Hinge: keep only the positive margins
z = torch.zeros(3, 3)
margin = torch.max(margin, z)
print(margin.sum(dim=1), '\n')

# Divide by the number of score entries (3 examples x 3 classes = 9)
loss = margin.sum() / score.numel()
print(loss)

tensor([[ 3.2000],
        [ 4.9000],
        [-3.1000]]) 

tensor([[ 0.0000,  2.9000, -3.9000],
        [-2.6000,  0.0000, -1.9000],
        [ 6.3000,  6.6000,  0.0000]]) 

tensor([ 2.9000,  0.0000, 12.9000]) 

tensor(1.7556)


In [68]:
# min
y_mat = -torch.ones(3, 3) + 2*torch.eye(3)
print(y_mat, '\n')

x = 1e10 * y_mat
print(x, '\n')

print(loss_fn(x, y))

tensor([[ 1., -1., -1.],
        [-1.,  1., -1.],
        [-1., -1.,  1.]]) 

tensor([[ 1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-1.0000e+10,  1.0000e+10, -1.0000e+10],
        [-1.0000e+10, -1.0000e+10,  1.0000e+10]]) 

tensor(0.)


In [69]:
# max: flip the signs, so the correct class gets a huge negative score and the others a huge positive one
x = -1e10 * y_mat
print(x, '\n')

print(loss_fn(x, y))

tensor([[-1.0000e+10,  1.0000e+10,  1.0000e+10],
        [ 1.0000e+10, -1.0000e+10,  1.0000e+10],
        [ 1.0000e+10,  1.0000e+10, -1.0000e+10]]) 

tensor(1.3333e+10)


### Multi-Class Multi-Label Hinge Loss
Use this method when a training example can be assigned multiple labels.

In [35]:
# Raw class scores: one row per training example, one column per class
score = torch.tensor(
    [[3.2, 5.1, -1.7],
     [1.3, 4.9, 2],
     [2.2, 2.5, -3.1]],
    dtype=torch.float32,
)
# Index of the correct class for each example
y = torch.tensor([0, 1, 2], dtype=torch.long)

In [26]:
# Target matrix whose row i holds the class index i repeated three times
y_multi_hinge = torch.arange(3, dtype=torch.long).unsqueeze(dim=1).repeat(1, 3)
print(y_multi_hinge, '\n')

loss_fn = nn.MultiLabelMarginLoss()
loss = loss_fn(score, y_multi_hinge)
print(loss)

tensor([[0, 0, 0],
        [1, 1, 1],
        [2, 2, 2]]) 

tensor(5.2667)


In [28]:
# Correct-class score of every example, repeated across its row
# (with y = [0, 1, 2] these are the diagonal entries of `score`).
s_y = torch.mm(torch.diag(score).unsqueeze(1), torch.ones(1,3))
print(s_y, '\n')

# Reuse the s_y computed above instead of rebuilding it from a variable
# defined in another section's cell, so this cell depends only on `score`.
# s_j - s_y + 1, with the j == y entries (equal to the margin 1) cancelled.
margin = score - s_y + 1
margin = margin - torch.eye(3)
print(margin, '\n')

# Hinge: keep only the positive margins
z = torch.zeros(3, 3)
margin = torch.max(margin, z)
print(margin.sum(dim=1), '\n')

# Divisor 3 reproduces the nn.MultiLabelMarginLoss value printed for `y_multi_hinge`
loss = margin.sum() / 3
print(loss)

tensor([[ 3.2000,  3.2000,  3.2000],
        [ 4.9000,  4.9000,  4.9000],
        [-3.1000, -3.1000, -3.1000]]) 

tensor([[ 0.0000,  2.9000, -3.9000],
        [-2.6000,  0.0000, -1.9000],
        [ 6.3000,  6.6000,  0.0000]]) 

tensor([ 2.9000,  0.0000, 12.9000]) 

tensor(5.2667)


### Negative Log Likelihood Loss

In [36]:
# Raw class scores: one row per training example, one column per class
score = torch.tensor(
    [[3.2, 5.1, -1.7],
     [1.3, 4.9, 2],
     [2.2, 2.5, -3.1]],
    dtype=torch.float32,
)
# Index of the correct class for each example
y = torch.tensor([0, 1, 2], dtype=torch.long)

In [37]:
# nn.NLLLoss is fed log-probabilities, so normalise the scores per row first
s = score.log_softmax(dim=1)
print(s, '\n')

loss_fn = nn.NLLLoss()
loss = loss_fn(s, y)
print(loss)

tensor([[-2.0404, -0.1404, -6.9404],
        [-3.6791, -0.0791, -2.9791],
        [-0.8565, -0.5565, -6.1565]]) 

tensor(2.7587)


In [39]:
# Manual version: per-row softmax probabilities ...
x = score.softmax(dim=1)
print(x, '\n')
# ... their negative logarithm ...
x = -x.log()
print(x, '\n')
# ... and the average of the entries at the correct classes (the diagonal, as y = [0, 1, 2])
loss = (x[0, 0] + x[1, 1] + x[2, 2]) / 3
print(loss)

tensor([[0.1300, 0.8690, 0.0010],
        [0.0252, 0.9239, 0.0508],
        [0.4247, 0.5732, 0.0021]]) 

tensor([[2.0404, 0.1404, 6.9404],
        [3.6791, 0.0791, 2.9791],
        [0.8565, 0.5565, 6.1565]]) 

tensor(2.7587)


### Cross Entropy Loss
Combines `LogSoftmax` and `NLLLoss` into a single function.

In [43]:
# Applied to the raw scores directly; no explicit log-softmax step beforehand
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(score, y)
print(loss)

tensor(2.7587)


### Binary Cross Entropy Loss
Use this method when the last layer is a `Sigmoid` function and the labels use **one-hot encoding**.

In [48]:
# Squash every score into (0, 1) independently
x = score.sigmoid()
print(x, '\n')

# One-hot targets: example i belongs to class i
y_oh = torch.eye(3)
print(y_oh, '\n')

# Binary Cross Entropy Loss on the sigmoid outputs
loss_fn = nn.BCELoss()
loss = loss_fn(x, y_oh)
print(loss)

tensor([[0.9608, 0.9939, 0.1545],
        [0.7858, 0.9926, 0.8808],
        [0.9002, 0.9241, 0.0431]]) 

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]) 

tensor(1.8908)


In [49]:
# Manual BCE: -(y*log(p) + (1 - y)*log(1 - p)) per element, averaged over all elements
loss = (-(y_oh * x.log() + (1 - y_oh) * (1 - x).log())).mean()
print(loss)

tensor(1.8908)


### Binary Cross Entropy Loss with Logits
Combine the `Sigmoid` layer and the `BCELoss` into one function.

In [52]:
# Takes the raw scores directly; no explicit sigmoid is applied beforehand
loss_fn = nn.BCEWithLogitsLoss()
loss = loss_fn(score, y_oh)
print(loss)

tensor(1.8908)


### Soft Margin Loss
- It's similar to hinge loss but with smooth curve.
- Advantage: Differentiable!
- y in {-1, 1}

In [77]:
# Raw class scores: one row per training example, one column per class
score = torch.tensor(
    [[3.2, 5.1, -1.7],
     [1.3, 4.9, 2],
     [2.2, 2.5, -3.1]],
    dtype=torch.float32,
)
# Targets in {-1, +1}: +1 on the diagonal (correct class), -1 everywhere else
y = 2*torch.eye(3) - torch.ones(3, 3)
print(y)

tensor([[ 1., -1., -1.],
        [-1.,  1., -1.],
        [-1., -1.,  1.]])


In [78]:
# Soft margin loss between the raw scores and the {-1, +1} targets
loss_fn = nn.SoftMarginLoss()
loss = loss_fn(score, y)
print(loss)

tensor(1.8908)


In [79]:
# Manual element-wise form: log(1 + exp(-y * score))
torch.log(1 + torch.exp(-y * score))

tensor([[0.0400, 5.1061, 0.1678],
        [1.5410, 0.0074, 2.1269],
        [2.3051, 2.5789, 3.1441]])

In [80]:
# Mean of the element-wise terms over all entries
torch.log(1 + torch.exp(-y * score)).mean()

tensor(1.8908)

In [89]:
# min
x = 1e10 * y
print(x, '\n')
loss_fn(x, y)

tensor([[ 1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-1.0000e+10,  1.0000e+10, -1.0000e+10],
        [-1.0000e+10, -1.0000e+10,  1.0000e+10]]) 



tensor(0.)

In [88]:
# max
x = -1e10 * y
print(x, '\n')
loss_fn(x, y)

tensor([[-1.0000e+10,  1.0000e+10,  1.0000e+10],
        [ 1.0000e+10, -1.0000e+10,  1.0000e+10],
        [ 1.0000e+10,  1.0000e+10, -1.0000e+10]]) 



tensor(inf)

### Multi-Label Soft Margin Loss

- Multi-label one-versus-all
- y in {0, 1} 

In [70]:
# Raw class scores: one row per training example, one column per class
score = torch.tensor(
    [[3.2, 5.1, -1.7],
     [1.3, 4.9, 2],
     [2.2, 2.5, -3.1]],
    dtype=torch.float32,
)
# Targets in {0, 1}: example i is labelled with class i only
y = torch.eye(3)
print(y)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [71]:
# Multi-label soft margin loss between the raw scores and the {0, 1} targets
loss_fn = nn.MultiLabelSoftMarginLoss()
loss = loss_fn(score, y)
print(loss)

tensor(1.8908)


In [74]:
# min
x = 1e10 * (-torch.ones(3) + 2*torch.eye(3))
print(x, '\n')
loss_fn(x, y)

tensor([[ 1.0000e+10, -1.0000e+10, -1.0000e+10],
        [-1.0000e+10,  1.0000e+10, -1.0000e+10],
        [-1.0000e+10, -1.0000e+10,  1.0000e+10]]) 



tensor(0.)

In [75]:
# max
x = -1e10 * (-torch.ones(3) + 2*torch.eye(3))
loss_fn(x, y)

tensor(1.0000e+10)