****## Split dataset into training and validation (Pytorch)

In this notebook I'm showing how to split validation set from training set that will have reproducible results.
I found out that not everyone knows how to do this

This will be all the library used in this notebook.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision import datasets



from torch.utils.data import Dataset

from torch.utils import data as D

import time
import os
import PIL.Image as Image
from IPython.display import display

import random

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)






cuda:0


This will be all the function used in this notebook.

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

class ImageFolderWithPaths(datasets.ImageFolder):
    """Custom dataset that includes image file paths. Extends
    torchvision.datasets.ImageFolder
    """

    # override the __getitem__ method. this is the method dataloader calls
    def __getitem__(self, index):
        # this is what ImageFolder normally returns 
        original_tuple = super(ImageFolderWithPaths, self).__getitem__(index)
        # the image file path
        path = self.imgs[index][0]
        # make a new tuple that includes original and the path
        tuple_with_path = (original_tuple + (path,))
        return tuple_with_path
    
    
'''
This part train resnet model with default 10 epoch.
'''



def train_model(model, criterion, optimizer, scheduler, n_epochs = 10):
    
    losses = []
    accuracies = []
    test_accuracies = []
    tmp_acc = 0.0
    # set the model to train mode initially
    model.train()
    for epoch in range(n_epochs):
        since = time.time()
        running_loss = 0.0
        running_correct = 0.0
        for i, data in enumerate(trainloader, 0):

            # get the inputs and assign them to cuda
            inputs, labels, _ = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            
            # forward + backward + optimize
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            # calculate the loss/acc later
            running_loss += loss.item()
            running_correct += (labels==predicted).sum().item()

        epoch_duration = time.time()-since
        epoch_loss = running_loss/len(trainloader)
        epoch_acc = 100/32*running_correct/len(trainloader)
        print("Epoch %s, duration: %d s, loss: %.4f, acc: %.4f" % (epoch+1, epoch_duration, epoch_loss, epoch_acc))
        
        losses.append(epoch_loss)
        accuracies.append(epoch_acc)
        
        # switch the model to eval mode to evaluate on test data
        model.eval()
        test_acc = evaluate_model(model)
        test_accuracies.append(test_acc)
        
        '''
        This will save the best model in training.
        '''
        if test_acc > tmp_acc:
            tmp_acc = test_acc
            torch.save(model, "../working/model.pth")
        
        # re-set the model to train mode after validating
        model.train()
        scheduler.step(test_acc)
        since = time.time()
        
    model.eval()
    get_predict(model)
    print('Finished Training')
    model.train()
    return model, losses, accuracies, test_accuracies

'''
This part evaluation the model every epoch with validation dataset.
'''
def evaluate_model(model):
    correct = 0.0
    total = 0.0
    
    with torch.no_grad():
        for i, data in enumerate(validloader, 0):
            images, labels, _ = data
            #images = images.to(device).half() # uncomment for half precision model
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model_ft(images)
            _, predicted = torch.max(outputs.data, 1)
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_acc = 100.0 * correct / total
    print('Accuracy of the network on the validation images: %.2f %%' % (
        test_acc))
    return test_acc

'''
This part will get the prediction result from the model.
The result include confidence result and also the best confidence result.
'''

def get_predict(model):
    correct = 0.0
    total = 0.0
    
    with torch.no_grad():
        for i, data in enumerate(testloader, 0):
            images, labels, fnames = data
            images = images.to(device)
            labels = labels.to(device)
            
            outputs = model_ft(images)
            _, predicted = torch.max(outputs.data, 1)
            
            tmp_predict = outputs.data
            predicted2 = torch.nn.functional.softmax(tmp_predict)
            
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_acc = 100.0 * correct / total
    print('Accuracy of the network on the testing images: %.2f %%' % (
        test_acc))


## Load the data and transform

First, lets create some transforms for our data and load the train/validation and test data with labels from the folders.

Here I use 400 x 400 images with random horizontal flip, random rotation and normalization.

I used 400 x 400 due to the limitation of computer resources. 

400 x 400 is the maximum resolution that I can run.

In [3]:
'''
Manual seed everything to get reproducible results
'''
SEED = 123
seed_everything(SEED)

'''
This is the dataset directory for my Kaggle kernel. 
Please comment this line and uncomment the following line if you run it on your workstation.
'''
dataset_dir = "../input/car_data/car_data/"


train_tfms = transforms.Compose([transforms.Resize((400, 400)),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.RandomRotation(15),
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
test_tfms = transforms.Compose([transforms.Resize((400, 400)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

dataset = ImageFolderWithPaths(root=dataset_dir+"train", transform = train_tfms)

'''
This is the part that will split validation set from training set.
This part I load image from folder and then split 90% for training and 10% for validation.
The label will be the sub folder of each image.
'''
train_len = int(0.9 * 8144)
valid_len = 8144 - train_len

#Pytorch had provided a function called random_split but this will only randomly split dataset without reproducible results
#There is no seed argument for this function so I had to manual set seed 

train_dataset, valid_dataset = D.random_split(dataset, lengths=[train_len, valid_len])

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size = 32, shuffle=True, num_workers = 2)

validloader = torch.utils.data.DataLoader(valid_dataset, batch_size = 32, shuffle=True, num_workers = 2)

test_dataset = ImageFolderWithPaths(root=dataset_dir+"test", transform = test_tfms)
testloader = torch.utils.data.DataLoader(test_dataset, batch_size = 32, shuffle=False, num_workers = 2)


## Model training function

Here I start to train my model with validation datasets.

In [4]:
'''
This will download pretrained torch model from internet.
'''
model_ft = models.resnet50(pretrained=True)


num_ftrs = model_ft.fc.in_features

# replace the last fc layer with an untrained one (requires grad by default)
model_ft.fc = nn.Linear(num_ftrs, 196)
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)

"""
probably not the best metric to track, but we are tracking the training accuracy and measuring whether
it increases by atleast 0.9 per epoch and if it hasn't increased by 0.9 reduce the lr by 0.1x.
However in this model it did not benefit me.
"""
lrscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, threshold = 0.9)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /tmp/.cache/torch/checkpoints/resnet50-19c8e357.pth
100%|██████████| 102502400/102502400 [00:00<00:00, 103294985.02it/s]


In [5]:
model_ft, training_losses, training_accs, test_accs = train_model(model_ft, criterion, optimizer, lrscheduler, n_epochs=1)

Epoch 1, duration: 153 s, loss: 4.2294, acc: 12.6766
Accuracy of the network on the validation images: 26.13 %




Accuracy of the network on the testing images: 27.35 %
Finished Training
