# Resuming the training on an already trained model

After 150 epochs of SageMaker training (almost 3 days long), the model reached an accuracy of 66% on the full test set.  
This notebook is an attempt to continue the training locally by adding more epochs.

In [1]:
#importing essential modules
import pandas as pd
import numpy as np
from random import seed, choice
import os
import matplotlib.pyplot as plt
from os import path

In [2]:
# Target folders holding the train, validation and test images (in that order).
trainfolder, validfolder, testfolder = (
    "../data/food-101/train_img/",
    "../data/food-101/valid_img/",
    "../data/food-101/test_img/",
)

> ### Transformers set-up for train, validation and test data

In [3]:
# Per-channel normalisation statistics (the standard ImageNet values used with
# torchvision's pretrained models).
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Image size parameters
img_short_side_resize = 256
img_input_size = 224

import os
import torch  # used by the five-crop lambda below; imported here so this cell does not rely on a later cell
from torchvision import datasets
import torchvision.transforms as transforms


# Training pipeline: resize the short side, apply colour/flip augmentation, take a
# random square crop of the network input size, then convert to a normalised tensor.
transform_train = transforms.Compose([
                    transforms.Resize(img_short_side_resize),
                    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                    transforms.RandomHorizontalFlip(),
                    transforms.RandomResizedCrop(img_input_size, scale=(0.08,1), ratio=(1,1)), 
                    transforms.ToTensor(),
                    transforms.Normalize(mean = norm_mean, std = norm_std)])

# Tensor conversion + normalisation applied to every single crop. Built once here
# instead of being re-created for each crop inside the lambda.
_crop_to_tensor = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize(mean = norm_mean, std = norm_std)])

# Evaluation pipeline (test-time augmentation): five crops per image, stacked into a
# single tensor with a leading "crop" dimension.
# NOTE(review): the short side is resized to img_input_size (not img_short_side_resize),
# so for square images the five img_input_size crops all cover the whole image -
# confirm this is intended.
transform_test = transforms.Compose([
                    transforms.Resize(img_input_size),  
                    transforms.FiveCrop(img_input_size),
                    transforms.Lambda(lambda crops: torch.stack([_crop_to_tensor(crop) for crop in crops]))])

# Only the training set gets the random augmentation; validation and test share
# the deterministic five-crop pipeline.
train_data = datasets.ImageFolder(trainfolder, transform_train)
valid_data = datasets.ImageFolder(validfolder, transform_test)
test_data = datasets.ImageFolder(testfolder, transform_test)

> ### Data loaders

In [4]:
import torch

shuffle = True
num_workers = 16
batch_size = 64

# Datasets keyed by split name
data = {"train" : train_data, "val":valid_data, "test" : test_data}

# Training loader: full batch size, multi-process loading.
train_loader = torch.utils.data.DataLoader(
    data["train"],
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=shuffle,
    pin_memory=True,
)

#### --- NOTE on num_workers if using 5crop and batch_size for testing --- ###
# With the 5crop test time augmentation num_workers must be 0 (an error is raised otherwise).
# Every sample carries 5 crops, so the batch size is divided by 5 to keep memory in check.
# NOTE(review): the evaluation loaders are shuffled as well; shuffling is not needed to
# compute an average loss - consider shuffle=False for a reproducible sample order.
eval_loader_kwargs = dict(
    batch_size=batch_size // 5,
    num_workers=0,
    shuffle=shuffle,
    pin_memory=True,
)
valid_loader = torch.utils.data.DataLoader(data["val"], **eval_loader_kwargs)
test_loader = torch.utils.data.DataLoader(data["test"], **eval_loader_kwargs)

loaders_transfer = {"train" : train_loader, "val":valid_loader, "test" : test_loader}

> ### CPU vs GPU 

In [5]:
#setting CPU vs GPU
# `device` is always a torch.device: the GPU when CUDA is available, the CPU otherwise.

use_cuda = torch.cuda.is_available()
if not use_cuda:
    print('CUDA is not available.  Training on CPU ...')
    # There is a single CPU device in PyTorch; an indexed string such as "cpu:1"
    # is not a valid target for .to(), so use the plain CPU device.
    device = torch.device("cpu")
else:
    print('CUDA is available!  Training on GPU ...')
    device = torch.device("cuda")
    print("Using",torch.cuda.get_device_name(device))

CUDA is available!  Training on GPU ...
Using GeForce GTX 960M


> ### Loading the trained ResNet to continue the training

In [6]:
import torchvision.models as models
import torch.nn as nn

# Number of output classes = number of class folders found in the training set
n_classes = len(train_data.classes)

# Backbone: ResNet-50 with torchvision's pretrained weights
model_transfer = models.resnet50(pretrained=True)

# Freeze every backbone parameter so that no gradients are computed for them
for param in model_transfer.parameters():
    param.requires_grad_(False)

# Swap the final fully connected layer for a fresh one sized for our classes
# (created after the freeze, so its parameters remain trainable) and
# initialise its weights with Kaiming-normal.
fc_in_features = model_transfer.fc.in_features
model_transfer.fc = nn.Linear(fc_in_features, n_classes)
nn.init.kaiming_normal_(model_transfer.fc.weight, nonlinearity='relu')

# Move the whole network to the selected device (GPU when available)
model_transfer = model_transfer.to(device)

In [7]:
# Loss function, optimizer and learning-rate scheduler
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Multi-class classification loss
criterion_transfer = nn.CrossEntropyLoss()

# Adam receives every model parameter (frozen ones included).
# NOTE(review): presumably the checkpoint's optimizer state was saved with this same
# parameter list - keep it unchanged so the state can be restored; verify before filtering.
optimizer_transfer = optim.Adam(model_transfer.parameters(), lr=3e-4)

# Halve the learning rate when the monitored value has not decreased for 7 epochs
scheduler_transfer = ReduceLROnPlateau(
    optimizer_transfer,
    mode='min',
    factor=0.5,
    patience=7,
    verbose=True,
)

In [8]:
def load_checkpoint(model, optimizer, scheduler, losslogger, filename='models/model_res_101cat.pt.tar'):
    """
    Restore a training checkpoint into already-constructed objects.

    The input model, optimizer and scheduler must be pre-defined; this routine
    only updates their states in place.

    Parameters
    ----------
    model, optimizer, scheduler :
        Objects whose ``load_state_dict`` is called with the matching checkpoint
        entry. ``scheduler`` may be None, and is left untouched when the
        checkpoint holds no 'scheduler' entry.
    losslogger :
        Fallback for the minimum validation loss; returned unchanged when no
        checkpoint file is found, otherwise replaced by the checkpoint's
        'valid_loss_min'.
    filename : str
        Path of the checkpoint file.

    Returns
    -------
    tuple
        ``(model, optimizer, scheduler, start_epoch, losslogger,
        train_loss_tracker, valid_loss_tracker)``. Without a checkpoint,
        ``start_epoch`` is 0 and both trackers are empty lists.
    """
    # Defaults used when there is nothing to load. The trackers must exist on
    # every path, otherwise the return statement fails for a missing file.
    start_epoch = 0
    train_loss_tracker = []
    valid_loss_tracker = []

    if not os.path.isfile(filename):
        print("=> no checkpoint found at '{}'".format(filename))
        return model, optimizer, scheduler, start_epoch, losslogger, train_loss_tracker, valid_loss_tracker

    print("=> loading checkpoint '{}'".format(filename))
    checkpoint = torch.load(filename)

    start_epoch = checkpoint['epoch']

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # The 'scheduler' entry is optional: a checkpoint written by a run without a
    # learning-rate scheduler does not contain it.
    if scheduler is not None and 'scheduler' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler'])

    losslogger = checkpoint['valid_loss_min']
    print("=> loaded checkpoint '{}' (epoch {})"
              .format(filename, checkpoint['epoch']))

    # Loss history of the epochs trained so far
    train_loss_tracker = checkpoint['train_loss']
    valid_loss_tracker = checkpoint['valid_loss']

    if checkpoint['is_best']:
        print("Starting from the best model trained so far (based on validation results)")
    else:
        print("Not starting from the best model trained so far (based on validation results)")

    return model, optimizer, scheduler, start_epoch, losslogger, train_loss_tracker, valid_loss_tracker

In [9]:
# Restore model / optimizer / scheduler states and the loss history from the checkpoint.
# The 4th argument is only a fallback for the minimum validation loss (load_checkpoint
# overwrites it with the checkpoint's 'valid_loss_min' when a checkpoint is loaded),
# so it has to be a number - not the loss function object.
model_retr, optimizer, scheduler, start_epoch, losslogger, train_loss_tracker, valid_loss_tracker = load_checkpoint(
    model_transfer, optimizer_transfer, scheduler_transfer, np.inf)
model_retr = model_retr.to(device)

# now individually transfer the optimizer parts...
# (every tensor held in the restored optimizer state is moved to the training device)
for state in optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

=> loading checkpoint 'models/model_resumed_101cat.pt.tar'
=> loaded checkpoint 'models/model_resumed_101cat.pt.tar' (epoch 82)


> ### Training function implementation

In [10]:
import time
import datetime

def train_epoch(model,train_loader,optimizer,criterion,device):
    """
    Run one training pass over ``train_loader``.

    The model is switched to training mode and one optimizer step is taken per
    batch. Returns ``(model, train_loss)`` where ``train_loss`` is the sum of
    the per-batch losses weighted by batch size, divided by the number of
    samples in ``train_loader.dataset``.
    """
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        # move the batch to the target device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # reset the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass and loss
        batch_loss = criterion(model(inputs), labels)
        running_loss += batch_loss.item() * inputs.size(0)

        # backward pass and parameter update
        batch_loss.backward()
        optimizer.step()

    return model, running_loss / len(train_loader.dataset)
        
def valid_epoch(model, valid_loader, criterion, device, fivecrop):
    """
    Compute the loss of ``model`` over ``valid_loader`` without updating it.

    ``fivecrop`` selects how test-time-augmentation crops are combined:
    "mean" averages the model outputs over the crops, "max" takes their
    element-wise maximum, any other value feeds the batch to the model as is.
    Returns the sum of per-batch losses weighted by batch size, divided by the
    number of samples in ``valid_loader.dataset``.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # move to GPU

            if fivecrop == "mean" or fivecrop == "max":
                # 5crop batches carry an extra crop dimension: (batch, crops, c, h, w)
                n_samples, n_crops, channels, height, width = inputs.size()
                # fold the crops into the batch dimension for the forward pass,
                # then unfold them again to combine the outputs per sample
                crop_outputs = model(inputs.view(-1, channels, height, width))
                crop_outputs = crop_outputs.view(n_samples, n_crops, -1)
                if fivecrop == "mean":
                    predictions = crop_outputs.mean(1)
                else:
                    predictions = crop_outputs.max(1)[0]
            else:
                predictions = model(inputs)

            # accumulate the batch loss weighted by the number of samples in the batch
            batch_loss = criterion(predictions, labels)
            total_loss += batch_loss.item() * inputs.size(0)

    return total_loss / len(valid_loader.dataset)


def train(n_epochs, loaders, model, optimizer, criterion, device, path_model, fivecrop = None, lr_scheduler = None, valid_loss_min = np.inf, start_epoch=1, train_loss = None, valid_loss = None):
    """
    Train ``model`` for ``n_epochs`` epochs, checkpointing along the way.

    Parameters
    ----------
    n_epochs : int
        Number of epochs to run, numbered from ``start_epoch``.
    loaders : dict
        Data loaders; the "train" and "val" entries are used.
    model, optimizer, criterion, device :
        Passed through to ``train_epoch`` / ``valid_epoch``.
    path_model : str
        Where the best model's ``state_dict`` is saved. The resume checkpoint is
        written to ``path_model + ".tar"`` and the best checkpoint to
        ``path_model + "_best.tar"``.
    fivecrop : str or None
        Crop-combination mode forwarded to ``valid_epoch``.
    lr_scheduler : optional
        Scheduler stepped with the validation loss after every epoch; its state
        is stored in the checkpoints under the 'scheduler' key.
    valid_loss_min : float
        Best validation loss so far (pass the checkpoint value when resuming).
    start_epoch : int
        Number of the first epoch of this run.
    train_loss, valid_loss : list or None
        Loss histories to extend in place; a fresh list is created when None.

    Returns
    -------
    The model, with the weights stored at ``path_model`` loaded when that file exists.
    """
    # Fresh lists per call: mutable default arguments would be shared between calls.
    if train_loss is None:
        train_loss = []
    if valid_loss is None:
        valid_loss = []

    time_start = time.time()
    best_epoch = start_epoch
    resume_path = path_model + ".tar"
    best_path = path_model + "_best.tar"

    for epoch in range(start_epoch, start_epoch + n_epochs):

        time_start_epoch = time.time()

        # train current epoch
        model, train_loss_epoch = train_epoch(model,loaders["train"],optimizer,criterion,device)
        train_loss.append(train_loss_epoch)

        # validate current epoch
        valid_loss_epoch = valid_epoch(model,loaders["val"],criterion,device,fivecrop)

        # learning rate scheduler
        if lr_scheduler is not None:
            lr_scheduler.step(valid_loss_epoch)
        valid_loss.append(valid_loss_epoch)

        # save the weights if the validation loss is the lowest so far
        is_best = valid_loss_epoch <= valid_loss_min
        if is_best:
            torch.save(model.state_dict(), path_model)
            valid_loss_min = valid_loss_epoch
            best_epoch = epoch

        # print epoch stats
        currentDT = datetime.datetime.now()
        exact_time =  str(currentDT.hour) + ":" + str(currentDT.minute) + ":" + str(currentDT.second)
        print('Epoch {} done in {:.2f} seconds at {}. \tTraining Loss: {:.3f} \tValidation Loss: {:.3f}'.format( 
            epoch,             
            time.time() - time_start_epoch,
            exact_time,
            train_loss_epoch,
            valid_loss_epoch
            ))

        # full training status, used to resume training later
        model_status = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(), 
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'train_loss_min': min(train_loss),
                'valid_loss_min': valid_loss_min,
                'is_best': is_best}

        if lr_scheduler is not None:
            model_status['scheduler'] = lr_scheduler.state_dict()

        if is_best:
            print("Best validation results so far - saving best model in {}".format(best_path))
            torch.save(model_status, resume_path) #saving resume training model data
            torch.save(model_status, best_path) #saving best model data
        elif train_loss_epoch <= min(train_loss):
            # train_loss already contains this epoch, so this holds exactly when the
            # epoch reached the lowest training loss so far
            torch.save(model_status, resume_path) #saving resume training model data
        else:
            print("Last epoch was not saved in {} since both validation and training loss didn't improve.".format(resume_path))

    # print final statistics    
    print(f"{n_epochs} epochs trained in {(time.time() - time_start):.3f} seconds. ") 

    print("Best model obtained at epoch {} with minimum validation loss : {:.3f}".format(best_epoch, valid_loss_min)) 

    # Load best config. The file is missing when no epoch ever beat valid_loss_min and
    # no earlier run wrote it; in that case keep the current weights instead of
    # failing after the whole training has completed.
    if os.path.isfile(path_model):
        model.load_state_dict(torch.load(path_model))
    else:
        print("No best model found at '{}' - returning the model in its current state".format(path_model))

    return model

> ## Training

In [11]:
# Resume the training for a few more epochs
n_epochs = 13

# number of categories, only used to build the model file name
nc = 101

model_resumed = train(
    n_epochs=n_epochs,
    loaders=loaders_transfer,
    model=model_retr,
    optimizer=optimizer_transfer,
    criterion=criterion_transfer,
    device=device,
    path_model='models/model_res_{}cat.pt'.format(nc),
    fivecrop="mean",
    lr_scheduler=scheduler_transfer,
    # continue from the checkpoint: best validation loss, epoch counter and loss history
    valid_loss_min=losslogger,
    start_epoch=start_epoch + 1,
    train_loss=train_loss_tracker,
    valid_loss=valid_loss_tracker,
)

Epoch 83 done in 3232.67 seconds at 23:32:56. 	Training Loss: 1.806 	Validation Loss: 1.303
Epoch 84 done in 3286.95 seconds at 0:27:43. 	Training Loss: 1.812 	Validation Loss: 1.304
Epoch 85 done in 3321.82 seconds at 1:23:5. 	Training Loss: 1.807 	Validation Loss: 1.304
Epoch 86 done in 3337.44 seconds at 2:18:42. 	Training Loss: 1.791 	Validation Loss: 1.303
Epoch 87 done in 3360.27 seconds at 3:14:43. 	Training Loss: 1.803 	Validation Loss: 1.309
Epoch 88 done in 3285.09 seconds at 4:9:28. 	Training Loss: 1.798 	Validation Loss: 1.297
Epoch 89 done in 3255.10 seconds at 5:3:43. 	Training Loss: 1.802 	Validation Loss: 1.293
Epoch    90: reducing learning rate of group 0 to 7.5000e-05.
Epoch 90 done in 3205.93 seconds at 5:57:9. 	Training Loss: 1.804 	Validation Loss: 1.291
Epoch 91 done in 3121.26 seconds at 6:49:10. 	Training Loss: 1.779 	Validation Loss: 1.280
Epoch 92 done in 3237.66 seconds at 7:43:8. 	Training Loss: 1.777 	Validation Loss: 1.288
Epoch 93 done in 2990.92 seconds

# 