# Resuming the training on an already trained model

This notebook performs training across multiple sessions, resuming from the previously saved state at the start of each session.

In [1]:
#importing essential modules
import pandas as pd
import numpy as np
from random import seed, choice
import os
import matplotlib.pyplot as plt
from os import path

In [2]:
#Setting train, validation and test set target folder
#target folder - train
trainfolder = "../data/food-101/train_img/"

#target folder - validation
validfolder = "../data/food-101/valid_img/"

#target folder -test
testfolder = "../data/food-101/test_img/"

> ### Transforms set-up for train, validation and test data

In [3]:
#Norm values
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

#Img size parameters
img_short_side_resize = 256
img_input_size = 224

import os
from torchvision import datasets
import torchvision.transforms as transforms


transform_train = transforms.Compose([
                    transforms.Resize(img_short_side_resize),
                    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                    transforms.RandomHorizontalFlip(),
                    transforms.RandomResizedCrop(img_input_size, scale=(0.08,1), ratio=(1,1)), 
                    transforms.ToTensor(),
                    transforms.Normalize(mean = norm_mean, std = norm_std)])
transform_test = transforms.Compose([
                    transforms.Resize(img_input_size),  
                    transforms.FiveCrop(img_input_size),
                    transforms.Lambda(lambda crops: torch.stack([transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize(mean = norm_mean, std = norm_std)])(crop) for crop in crops]))])

train_data = datasets.ImageFolder(trainfolder, transform_train)
valid_data = datasets.ImageFolder(validfolder, transform_test)
test_data = datasets.ImageFolder(testfolder, transform_test)

> ### Data loaders

In [4]:
import torch

shuffle = True
num_workers = 16
batch_size = 64

# Create the data loaders
data = {"train" : train_data, "val":valid_data, "test" : test_data}

train_loader = torch.utils.data.DataLoader(data["train"], batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, pin_memory=True)

#### --- NOTE on num_workers if using 5crop and batch_size for testing --- ###
# If using the 5crop test time augmentation, num_workers = 0 (an error is raised otherwise) 
# batch_size needs to be reduced during testing due to memory requirements
valid_loader = torch.utils.data.DataLoader(data["val"], batch_size=int(np.floor(batch_size/5)), num_workers=0, shuffle=shuffle, pin_memory=True)

test_loader = torch.utils.data.DataLoader(data["test"], batch_size=int(np.floor(batch_size/5)), num_workers=0, shuffle=shuffle, pin_memory=True)
loaders_transfer = {"train" : train_loader, "val":valid_loader, "test" : test_loader}

> ### CPU vs GPU 

In [5]:
#setting CPU vs GPU

use_cuda = torch.cuda.is_available()
if not use_cuda:
    print('CUDA is not available.  Training on CPU ...')
    device = "cpu:1"
else:
    print('CUDA is available!  Training on GPU ...')
    device = torch.device("cuda")
    print("Using",torch.cuda.get_device_name(device))

CUDA is available!  Training on GPU ...
Using GeForce GTX 960M


> ### Loading the trained ResNet to continue training

In [6]:
import torchvision.models as models
import torch.nn as nn

model_transfer = models.resnet50(pretrained=True)

#freezing the parameters
for param in model_transfer.parameters():
    param.requires_grad = False
    
n_classes = len(train_data.classes)

# Replacing the last layer with a fully connected layer to retrain
model_transfer.fc = nn.Linear(model_transfer.fc.in_features,n_classes)

# Initialize the weights of the new layer
nn.init.kaiming_normal_(model_transfer.fc.weight, nonlinearity='relu')

# Transfer to GPU 
model_transfer = model_transfer.to(device)

In [7]:
#specify loss function and optimizer
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

criterion_transfer = nn.CrossEntropyLoss()
optimizer_transfer = optim.Adam(model_transfer.parameters(),3e-4) 
scheduler_transfer = ReduceLROnPlateau(optimizer_transfer, 'min', verbose = True, factor = 0.5, patience = 7)

In [8]:
from pytorch_utils import load_checkpoint

model_retr, optimizer, scheduler, start_epoch, losslogger, train_loss_tracker, valid_loss_tracker = load_checkpoint(model_transfer, optimizer_transfer, scheduler_transfer, criterion_transfer)
model_retr = model_retr.to(device)

# now individually transfer the optimizer parts...
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

=> loading checkpoint 'models/model_res_101cat.pt.tar'
=> loaded checkpoint 'models/model_res_101cat.pt.tar' (epoch 131)
Starting from the best model trained so far (based on validation results)


> ## Training

In [9]:
from pytorch_utils import train

# train the model
n_epochs = 8

nc = 101

model_resumed = train(n_epochs, 
                       loaders_transfer, 
                       model_retr, 
                       optimizer_transfer, 
                       criterion_transfer, 
                       device, 
                       'models/model_res_{}cat.pt'.format(nc), 
                       fivecrop = "mean", 
                       lr_scheduler = scheduler_transfer,
                       valid_loss_min = losslogger,
                       start_epoch = start_epoch + 1,
                     train_loss=train_loss_tracker,
                     valid_loss=valid_loss_tracker)

Epoch 132 done in 42.0 minutes and 51 seconds at 0:38:19. 	Training Loss: 1.721 	Validation Loss: 1.220
Epoch 133 done in 46.0 minutes and 45 seconds at 1:25:4. 	Training Loss: 1.725 	Validation Loss: 1.217
Last epoch was not saved in {} since both validation and training loss didn't improve.
Epoch 134 done in 45.0 minutes and 50 seconds at 2:10:54. 	Training Loss: 1.733 	Validation Loss: 1.193
Last epoch was not saved in {} since both validation and training loss didn't improve.
Epoch 135 done in 45.0 minutes and 36 seconds at 2:56:30. 	Training Loss: 1.719 	Validation Loss: 1.204
Epoch 136 done in 44.0 minutes and 34 seconds at 3:41:4. 	Training Loss: 1.730 	Validation Loss: 1.194
Last epoch was not saved in {} since both validation and training loss didn't improve.
Epoch 137 done in 45.0 minutes and 25 seconds at 4:26:29. 	Training Loss: 1.722 	Validation Loss: 1.202
Last epoch was not saved in {} since both validation and training loss didn't improve.
Epoch 138 done in 43.0 minutes

# 