# Validation dataset

목적: overfitting을 방지하기 위한 방편으로 validation dataset을 사용하는 torch코드를 정리한다. (keras를 이용하면 더 손쉽게 동일 목적을 달성할 수 있다.)

validation dataset 분리: torch.utils.data.random_split를 이용하여 train dataset의 일부를 validation dataset으로 떼어둔다.

epochs: 충분히 긴 epochs를 이용하여 validation loss가 saturation되는 epochs를 확인하고, 이 값을 가지고 다시 학습을 진행한다.

결과: 마지막으로 test datasets에서 결과를 검토하였다. validation loss와 비슷한 test loss를 확인할 수 있다. 단, 일반적으로 test datasets에 대한 결과를 알 수 없다는 점을 염두에 두어야 한다.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [2]:
# Preprocessing: convert each image to a tensor, then normalize it with
# mean 0.1307 and standard deviation 0.3081.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
img_transforms = transforms.Compose([to_tensor, normalize])

# MNIST training split, stored under ./data (downloaded when requested).
train = datasets.MNIST(
    './data', train=True, download=True, transform=img_transforms
)

# Reserve 30% of the training samples for validation; the remaining
# samples stay in the training subset. Order is [train size, val size].
validation_split = 0.3
n_val = int(len(train) * validation_split)
lengths = [len(train) - n_val, n_val]
# Notebook display: the two subset sizes and their total.
lengths, sum(lengths)

([42000, 18000], 60000)

In [3]:
BATCH_SIZE = 128

# Randomly partition the training dataset into a training subset and a
# validation subset whose sizes are given by `lengths`.
# NOTE: `train` is rebound here from the full dataset to the training subset.
train, val = torch.utils.data.random_split(train, lengths)

# Only the training batches are shuffled; validation and test batches are
# read in a fixed order.
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val, batch_size=BATCH_SIZE, shuffle=False)

# MNIST test split (train=False), preprocessed with the same transforms.
test_set = datasets.MNIST(
    './data', train=False, download=True, transform=img_transforms
)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

### model and evaluation from 

https://github.com/pytorch/examples/blob/master/mnist/main.py

In [4]:
# https://github.com/pytorch/examples/blob/master/mnist/main.py

class Net(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Architecture: two 3x3 convolutions (1 -> 32 -> 64 channels) with ReLU,
    a 2x2 max-pool, dropout, and two fully connected layers (9216 -> 128
    -> 10). The forward pass returns per-class log-probabilities, which is
    the input expected by the negative log-likelihood loss.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # Channel-wise dropout applied to the 4-D pooled feature maps.
        self.dropout1 = nn.Dropout2d(0.25)
        # This layer receives the 2-D (batch, 128) output of fc1. Dropout2d
        # is meant for inputs with spatial dimensions and warns on 2-D
        # input, so element-wise Dropout is the appropriate layer here.
        self.dropout2 = nn.Dropout(0.5)
        # 9216 = 64 channels * 12 * 12 (28 -> 26 -> 24 after the two 3x3
        # convolutions, then 12 after the 2x2 max-pool).
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for input `x`.

        `x` must be shaped (batch, 1, 28, 28) so that the flattened
        feature map matches the 9216 inputs of `fc1`.
        """
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension and flatten everything else.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

In [5]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            
    test_loss /= len(test_loader.dataset)
    correct /= len(test_loader.dataset)
    
    return test_loss, correct

## train

the negative log-likelihood loss

https://ljvmiranda921.github.io/notebook/2017/08/13/softmax-and-the-negative-log-likelihood/

In [6]:
import numpy as np
import pandas as pd
import altair as alt

In [7]:
# Exploratory run: train for many epochs and record the per-epoch losses so
# the point where the validation loss levels off can be read from them.
EPOCHS = 30
lr = 1e-4

model = Net().to(device)
# Negative log-likelihood, summed over each batch so that the epoch total
# can be divided by the dataset size afterwards.
criterion = nn.NLLLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

# Per-epoch mean loss for the training and validation subsets.
history = {'train': [], 'val': []}

for epoch in range(EPOCHS):
    # Training mode (dropout active) for the optimization pass.
    model.train()
    epoch_total = 0.0
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        epoch_total += loss.item()
        loss.backward()
        optimizer.step()

    # Mean training loss per sample, then validation metrics from `test`,
    # which switches the model to evaluation mode.
    train_loss = epoch_total / len(train_loader.dataset)
    val_loss, val_correct = test(model, device, val_loader)
    history['train'].append(train_loss)
    history['val'].append(val_loss)

    print(f'epoch: {epoch + 1:3d}, loss: {train_loss:.4f},'
          f' val_loss: {val_loss:.4f}, val_correct: {val_correct:.4f}')

epoch:   1, loss: 0.5842, val_loss: 0.1998, val_correct: 0.9416
epoch:   2, loss: 0.2357, val_loss: 0.1252, val_correct: 0.9628
epoch:   3, loss: 0.1651, val_loss: 0.0883, val_correct: 0.9740
epoch:   4, loss: 0.1287, val_loss: 0.0708, val_correct: 0.9782
epoch:   5, loss: 0.1081, val_loss: 0.0615, val_correct: 0.9809
epoch:   6, loss: 0.0949, val_loss: 0.0564, val_correct: 0.9820
epoch:   7, loss: 0.0864, val_loss: 0.0518, val_correct: 0.9841
epoch:   8, loss: 0.0760, val_loss: 0.0488, val_correct: 0.9847
epoch:   9, loss: 0.0708, val_loss: 0.0463, val_correct: 0.9858
epoch:  10, loss: 0.0642, val_loss: 0.0458, val_correct: 0.9858
epoch:  11, loss: 0.0578, val_loss: 0.0423, val_correct: 0.9868
epoch:  12, loss: 0.0549, val_loss: 0.0412, val_correct: 0.9868
epoch:  13, loss: 0.0518, val_loss: 0.0410, val_correct: 0.9868
epoch:  14, loss: 0.0484, val_loss: 0.0399, val_correct: 0.9874
epoch:  15, loss: 0.0438, val_loss: 0.0379, val_correct: 0.9879
epoch:  16, loss: 0.0423, val_loss: 0.03

In [8]:
import altair as alt

# Long-format loss table with one row per (dataset, epoch). After
# `reset_index`, the `index` column holds the zero-based epoch number,
# `dataset` is 'train' or 'val', and `loss` is the recorded loss.
df = pd.concat(
    [
        pd.DataFrame({
            # Size each label column from its own loss list, so the frame
            # stays valid even if the two histories differ in length.
            'dataset': [split] * len(history[split]),
            'loss': history[split],
        }).reset_index()
        for split in ('train', 'val')
    ],
    ignore_index=True,
)

# df

In [12]:
# Line chart of the loss per epoch, one colored line per dataset
# ('train' and 'val').
encodings = dict(x='index', y='loss', color='dataset')
chart = alt.Chart(df).mark_line().encode(**encodings)
# Notebook display of the chart.
chart

In [17]:
# GitHub won't render the Altair chart, so the loss-history chart is saved
# as a PNG file and that image is embedded in the notebook instead.
chart.save('mnist_split_history.png')

![train loss history](mnist_split_history.png)

## Final retrain, epochs = 8

In [10]:
# Final run: retrain a freshly initialized model for the reduced number of
# epochs, using the same optimizer settings as the exploratory run.
EPOCHS = 8
lr = 1e-4

model = Net().to(device)
# Summed negative log-likelihood; normalized by the dataset size per epoch.
criterion = nn.NLLLoss(reduction='sum')
adam_options = dict(lr=lr, weight_decay=1e-5)
optimizer = torch.optim.Adam(model.parameters(), **adam_options)

# Per-epoch mean loss for the training and validation subsets.
history = {'train': [], 'val': []}

for epoch in range(EPOCHS):
    model.train()
    summed_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        # One optimization step on the current batch.
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        summed_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Convert the summed loss into a per-sample mean for this epoch.
    train_loss = summed_loss / len(train_loader.dataset)
    val_loss, val_correct = test(model, device, val_loader)
    history['train'].append(train_loss)
    history['val'].append(val_loss)

    print(f'epoch: {epoch + 1:3d}, loss: {train_loss:.4f},'
          f' val_loss: {val_loss:.4f}, val_correct: {val_correct:.4f}')

epoch:   1, loss: 0.6252, val_loss: 0.2356, val_correct: 0.9292
epoch:   2, loss: 0.2604, val_loss: 0.1410, val_correct: 0.9581
epoch:   3, loss: 0.1830, val_loss: 0.1022, val_correct: 0.9685
epoch:   4, loss: 0.1417, val_loss: 0.0765, val_correct: 0.9772
epoch:   5, loss: 0.1167, val_loss: 0.0656, val_correct: 0.9794
epoch:   6, loss: 0.1032, val_loss: 0.0600, val_correct: 0.9811
epoch:   7, loss: 0.0892, val_loss: 0.0540, val_correct: 0.9836
epoch:   8, loss: 0.0788, val_loss: 0.0499, val_correct: 0.9849


In [11]:
# Evaluate the retrained model on the test loader. `test` returns the loss
# averaged over the dataset and the accuracy as a fraction of samples.
test_loss, test_correct = test(model, device, test_loader)
print(f'test_loss: {test_loss}, test_correct: {test_correct}')

test_loss: 0.04348286626338959, test_correct: 0.9847
