# Assignment 04: Gradio App and Car Image Viewpoint Prediction

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torchvision.models as models
import os
from PIL import Image
from tqdm import tqdm, trange
import numpy as np
import matplotlib.pyplot as plt
import copy
import time
import random
import csv
import json
import pandas as pd
from datetime import datetime

## 0. Setting global parameters
In the beginning of the training notebook, I set the global parameters that are needed for the creation of the train, validation and test datasets and the model process. Because I tested in different environments, I set different parameters for saving the model and metrics depending on the used environment.

In [2]:
# global variables set here. the paths depend on where the file is executed
SEED = 4242
torch.manual_seed(SEED)
BATCH_SIZE = 128
ENV = 'jupyterhub'
RETRAIN_MODELS = True
DATE = datetime.today().strftime('%Y-%m-%d').replace("-", "")
BALANCED = True
SAMPLE_SIZE = 50000
DATA_SPLIT = (0.7, 0.2, 0.1)
NUM_EPOCHS = 10

if ENV == 'jupyterhub':

    ROOT = "./../data"
    MDL_SAVE = "./../models/"

elif ENV == 'local':

    ROOT = "./../../data/"
    MDL_SAVE = "./models/"

elif ENV == 'colab':

    from google.colab import drive
    drive.mount('/content/gdrive')

    ROOT = "./confirmed_fronts"
    MDL_SAVE = "./gdrive/MyDrive/DS405B/assignments/assignment_03/models/"

## 1. Dataset class and other function definitions
I use a slightly different class definition than in the last assignment. Instead of making the dataset fetch *all* filepaths in a given root directory, I pass filepaths to each dataset instance from which the samples will be created. This is done to enable subsampling from the whole population to shorten the training process. The data set class only allows for a binary prediction: full frontal view or not full frontal view. This is done because the bodytype and modernity score models were trained on a dataset of *only* full frontal images of cars. 

In [3]:
class DMV_CAR(Dataset):
    """Class representing the DMV_CAR datasat that inherits from Dataset class"""

    def __init__(self, filepaths, label_map, transforms=None, sample=None):
        """
        Arguments:
            root_dir (string): Directory with all the images.
            label_mapping (function): Function that maps the image path to the label.
            transforms (callable, optional): Optional transform to be applied on a sample.
        """

        self.filepaths = filepaths
        self.transforms = transforms
        self.label_map = label_map

    # retuns the size of the dataset
    def __len__(self):
        return len(self.filepaths)

    # function to ensure that indexing can be used for the dataset
    def __getitem__(self, idx) -> tuple:

        image_path = self.filepaths[idx]
        deg = int(self.label_map[image_path.split('/')[-1]])
        label = 1 if deg == 0 else 0
        image = Image.open(image_path).convert("RGB")

        if self.transforms is not None:
            image = self.transforms(image)

        return image, label

Like in the last assignment, I use a function to create a dictionary mapping a column to another column. This is done because dictionaries are fairly fast and the labels should be accessed as quickly as possible.

In [4]:
def create_viewpoint_dict(csv_file):
    result_dict = {}

    with open(csv_file, 'r') as file:
        csv_reader = csv.reader(file)

        for row in csv_reader:
            if len(row) >= 3:  # Ensure the row has at least three columns
                key = row[2]
                value = row[3]
                result_dict[key] = value

    return result_dict

viewpoint_image_map = create_viewpoint_dict(os.path.join(ROOT, 'Image_table.csv'))

To adapt to the changed data set class definition, I create a function that samples filepaths from a given directory. Additional to the root directory, the data split, the label map it accepts to other parameters: the sample size and a boolean parameter indicating whether the sampled data should be balanced or not. This is done because full frontal (0°) car images are a minority in the whole dataset. By balancing during the subsampling process, I ensure that the model sees equally many positive as negative classes.

In [5]:
def create_filepaths(root_dir, data_split, label_map, sample_size, balanced):

    if sample_size is None:

        # create filepaths from root directory
        filepaths = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(root_dir) for f in filenames if '$$' in f]
        filepaths = [f for f in filepaths if f.split('/')[-1] in label_map.keys()] # remove unlabeled items

        # sample train, test and validation data
        train  = random.sample(filepaths, round(len(filepaths)*data_split[0]))
        val  = random.sample(filepaths, round(len(filepaths)*data_split[1]))
        test  = random.sample(filepaths, round(len(filepaths)*data_split[2]))
        return train, val, test
    
    else:

        # create filepaths from root directory
        filepaths = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(root_dir) for f in filenames if '$$' in f]
        filepaths = [f for f in filepaths if f.split('/')[-1] in label_map.keys()] # remove unlabeled items

        if balanced:

            pos = [x for x in filepaths if label_map[x.split('/')[-1]] == "0"] 
            neg = [x for x in filepaths if label_map[x.split('/')[-1]] != "0"]

            train = ([pos.pop(random.randrange(len(pos))) for _ in range(round(data_split[0]*0.5*sample_size))] + 
                     [neg.pop(random.randrange(len(neg))) for _ in range(round(data_split[0]*0.5*sample_size))])
            
            val = ([pos.pop(random.randrange(len(pos))) for _ in range(round(data_split[1]*0.5*sample_size))] + 
                     [neg.pop(random.randrange(len(neg))) for _ in range(round(data_split[1]*0.5*sample_size))])
            
            test = ([pos.pop(random.randrange(len(pos))) for _ in range(round(data_split[2]*0.5*sample_size))] + 
                     [neg.pop(random.randrange(len(neg))) for _ in range(round(data_split[2]*0.5*sample_size))])

            return train, val, test

        else:
            
            train = [filepaths.pop(random.randrange(len(filepaths))) for _ in range(round(data_split[0]*sample_size))]
            val = [filepaths.pop(random.randrange(len(filepaths))) for _ in range(round(data_split[1]*sample_size))]
            test = [filepaths.pop(random.randrange(len(filepaths))) for _ in range(round(data_split[2]*sample_size))]
            
            return train, val, test

## 2. Dataset creation
I use the ImageNet image mean and standard deviation for the transformation of the training, validation and test sets. The training set also is exposed to some random perturbations. I create the filepaths for the three data sets using the custom function. Before proceeding with the instantation of the DMV_CAR objects, I make sure that there is no data leakage between the individual sets.

In [7]:
# build the transforms using characteristics of pretrained dataset (ImageNet)
pretrained_size = 224
imagenet_mean, imagenet_std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

# training transforms
train_transforms = transforms.Compose([
                           transforms.Resize(pretrained_size),
                           transforms.RandomRotation(5),
                           transforms.RandomHorizontalFlip(0.5),
                           transforms.RandomCrop(pretrained_size, padding=10),
                           transforms.ToTensor(),
                           transforms.Normalize(mean=imagenet_mean,
                                                std=imagenet_std)
                       ])

# test transforms
test_transforms = transforms.Compose([
                           transforms.Resize(pretrained_size),
                           transforms.ToTensor(),
                           transforms.Normalize(mean=imagenet_mean,
                                                std=imagenet_std)
                       ])

In [8]:
train_paths, val_paths, test_paths = create_filepaths(ROOT, DATA_SPLIT, viewpoint_image_map, SAMPLE_SIZE, BALANCED)

# make sure there is no data leakage in the creation of the data sets
assert (len(list(set(val_paths).intersection(test_paths)))) == (len(list(set(train_paths).intersection(test_paths)))) == (len(list(set(train_paths).intersection(val_paths)))) == 0

train_dataset = DMV_CAR(filepaths = train_paths,
                   label_map = viewpoint_image_map,
                   transforms = train_transforms
                   )

val_dataset = DMV_CAR(filepaths = val_paths,
                   label_map = viewpoint_image_map,
                   transforms = test_transforms
                   )

test_dataset = DMV_CAR(filepaths = test_paths,
                   label_map = viewpoint_image_map,
                   transforms = test_transforms
                   )

# create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Number of training examples:", len(train_paths))
print("Number of validation examples:", len(val_paths))
print("Number of testing examples:", len(test_paths))

Number of training examples: 35000
Number of validation examples: 10000
Number of testing examples: 5000


## 3. Definition of model and necessary helper functions
I use the same functions as in the last assignment and adapt them to the new data. The model used for transfer learning is again the ResNet18 model. I proceed by freezing all layers, then replace the last fully connected layer to match the binary classification task. I unfreeze the parameters of the layer4 and the fully connected layer. The model learns the viewpoint as an output of a vector with two entries, one for each class. For this reason, I used cross entropy loss as the criterion. One could have also achieved the classification task as a regression and used binary cross entropy.

In [9]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

def train(model, iterator, optimizer, criterion, device):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for (x, y) in tqdm(iterator, desc="Training", leave=False):

        x = x.to(device)
        y = y.to(device)
        y = y.squeeze(-1)

        optimizer.zero_grad()

        y_pred = model(x)

        loss = criterion(y_pred, y)

        acc = calculate_accuracy(y_pred, y)

        loss.backward()
        optimizer.step()

        # perform backward pass and update weights
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)

            #y = y.squeeze(-1)

            y_pred = model(x)
            loss = criterion(y_pred, y)

            acc = calculate_accuracy(y_pred, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
# get model architecture
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Freeze all the layers except the last one
for param in model.parameters():
    param.requires_grad = False
    
# replace last layer
model.fc = torch.nn.Linear(model.fc.in_features, 2)
    
for param in model.fc.parameters():
    param.requires_grad = True
for param in model.avgpool.parameters():
    param.requires_grad = True
for param in model.layer4.parameters():
    param.requires_grad = True

# check if cuda is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# move model to device
model = model.to(device)

# define loss function
criterion = torch.nn.CrossEntropyLoss()

is_bal = "bal" if BALANCED else "unbal"

# some filenames under which the models are saved / will be saved
filenames = {
    'model': f'viewpoints_{SAMPLE_SIZE}_{DATE}_{is_bal}_model.pt',
    'optim': f'viewpoints_{SAMPLE_SIZE}_{DATE}_{is_bal}_optim.pt',
    'metrics': f'viewpoints_{SAMPLE_SIZE}_{DATE}_{is_bal}.json'
}

In [11]:
# print free parameters of model
print(f'The model has {count_parameters(model):,} trainable parameters.\n')

# check that model architecture is correct
print("Model architecture: \n", model)

The model has 8,394,754 trainable parameters.

Model architecture: 
 ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affin

## 4. Model training
During testing with smaller sample sizes I got the feeling that the model is able to learn the (relatively easy) binary classification task quite fast and converges quickly. For this reason, I set the number of epochs to only 10 and specified a patience parameter for early stopping of 2. The Adam optimizer is used to adjust the weights. 

In [13]:
metrics = {
    "train_loss": [],
    "train_acc": [],
    "val_loss": [],
    "val_acc": [],
}

patience = 2
early_stopping_counter = 0

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

best_valid_loss = float('inf')

for epoch in trange(NUM_EPOCHS, desc="Epochs"):
    print("Current epoch: {}".format(epoch))

    start_time = time.monotonic()

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, val_loader, criterion, device)

    metrics['train_loss'].append(train_loss)
    metrics['train_acc'].append(train_acc)
    metrics['val_loss'].append(valid_loss)
    metrics['val_acc'].append(valid_acc)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MDL_SAVE + filenames['model'])
        torch.save(optimizer.state_dict(), MDL_SAVE + filenames['optim']) 

    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    # implement early stopping
    if (epoch > 0) & (valid_loss > metrics['val_loss'][epoch-1]):
        early_stopping_counter += 1
    else:
        early_stopping_counter
    
    if early_stopping_counter >= patience:
        print("Early stopping")
        break
        
# dump metrics to JSON
with open(MDL_SAVE + filenames['metrics'], 'w') as fp:
    json.dump(metrics, fp)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Current epoch: 0



Training:   0%|          | 0/274 [00:00<?, ?it/s][A
Training:   0%|          | 1/274 [00:00<03:18,  1.38it/s][A
Training:   1%|          | 2/274 [00:01<02:32,  1.78it/s][A
Training:   1%|          | 3/274 [00:01<02:17,  1.97it/s][A
Training:   1%|▏         | 4/274 [00:02<02:10,  2.06it/s][A
Training:   2%|▏         | 5/274 [00:02<02:06,  2.13it/s][A
Training:   2%|▏         | 6/274 [00:02<02:05,  2.14it/s][A
Training:   3%|▎         | 7/274 [00:03<02:02,  2.17it/s][A
Training:   3%|▎         | 8/274 [00:03<02:01,  2.19it/s][A
Training:   3%|▎         | 9/274 [00:04<02:00,  2.21it/s][A
Training:   4%|▎         | 10/274 [00:04<01:59,  2.21it/s][A
Training:   4%|▍         | 11/274 [00:05<01:58,  2.21it/s][A
Training:   4%|▍         | 12/274 [00:05<01:58,  2.21it/s][A
Training:   5%|▍         | 13/274 [00:06<01:58,  2.20it/s][A
Training:   5%|▌         | 14/274 [00:06<01:57,  2.21it/s][A
Training:   5%|▌         | 15/274 [00:07<01:56,  2.22it/s][A
Training:   6%|▌         

Epoch: 01 | Epoch Time: 2m 34s
	Train Loss: 0.134 | Train Acc: 95.20%
	 Val. Loss: 0.132 |  Val. Acc: 95.09%
Current epoch: 1



Training:   0%|          | 0/274 [00:00<?, ?it/s][A
Training:   0%|          | 1/274 [00:00<02:02,  2.22it/s][A
Training:   1%|          | 2/274 [00:01<02:20,  1.94it/s][A
Training:   1%|          | 3/274 [00:01<02:11,  2.06it/s][A
Training:   1%|▏         | 4/274 [00:01<02:06,  2.13it/s][A
Training:   2%|▏         | 5/274 [00:02<02:04,  2.17it/s][A
Training:   2%|▏         | 6/274 [00:02<02:02,  2.19it/s][A
Training:   3%|▎         | 7/274 [00:03<02:01,  2.20it/s][A
Training:   3%|▎         | 8/274 [00:03<02:00,  2.21it/s][A
Training:   3%|▎         | 9/274 [00:04<01:59,  2.22it/s][A
Training:   4%|▎         | 10/274 [00:04<01:57,  2.24it/s][A
Training:   4%|▍         | 11/274 [00:05<01:57,  2.23it/s][A
Training:   4%|▍         | 12/274 [00:05<01:57,  2.23it/s][A
Training:   5%|▍         | 13/274 [00:05<01:57,  2.22it/s][A
Training:   5%|▌         | 14/274 [00:06<02:03,  2.11it/s][A
Training:   5%|▌         | 15/274 [00:06<02:00,  2.15it/s][A
Training:   6%|▌         

Epoch: 02 | Epoch Time: 2m 30s
	Train Loss: 0.120 | Train Acc: 95.50%
	 Val. Loss: 0.134 |  Val. Acc: 95.17%
Current epoch: 2



Training:   0%|          | 0/274 [00:00<?, ?it/s][A
Training:   0%|          | 1/274 [00:00<02:00,  2.26it/s][A
Training:   1%|          | 2/274 [00:00<02:00,  2.27it/s][A
Training:   1%|          | 3/274 [00:01<01:59,  2.26it/s][A
Training:   1%|▏         | 4/274 [00:01<01:58,  2.27it/s][A
Training:   2%|▏         | 5/274 [00:02<01:58,  2.28it/s][A
Training:   2%|▏         | 6/274 [00:02<01:57,  2.28it/s][A
Training:   3%|▎         | 7/274 [00:03<01:57,  2.27it/s][A
Training:   3%|▎         | 8/274 [00:03<01:58,  2.25it/s][A
Training:   3%|▎         | 9/274 [00:03<01:56,  2.27it/s][A
Training:   4%|▎         | 10/274 [00:04<01:55,  2.28it/s][A
Training:   4%|▍         | 11/274 [00:04<01:54,  2.29it/s][A
Training:   4%|▍         | 12/274 [00:05<01:54,  2.29it/s][A
Training:   5%|▍         | 13/274 [00:05<01:53,  2.29it/s][A
Training:   5%|▌         | 14/274 [00:06<01:53,  2.30it/s][A
Training:   5%|▌         | 15/274 [00:06<01:52,  2.29it/s][A
Training:   6%|▌         

Epoch: 03 | Epoch Time: 2m 29s
	Train Loss: 0.108 | Train Acc: 95.93%
	 Val. Loss: 0.144 |  Val. Acc: 95.06%
Early stopping





## 4. Evaluation
Evaluation on the test set delivers a satisfactory test accuracy of 94.7%. This is slightly lower than the train and validation accuracies during the last epoch, but should be good enough to generalize to other images that will be uploaded to the app. 

In [14]:
def evaluate_model(model, iterator, device, loss):

    test_acc = 0
    test_loss = 0

    model.eval()

    for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

        x = x.to(device)
        y = y.to(device)

        y = y.squeeze(-1)

        y_pred = model(x)
        loss = criterion(y_pred, y)

        acc = calculate_accuracy(y_pred, y)

        test_acc += acc.item()
        test_loss += loss.item()

    test_acc /= len(test_loader)
    test_loss /= len(test_loader)

    return test_acc, test_loss

In [15]:
print("Test accuracy: ", evaluate_model(model, test_loader, device, criterion)[0])

                                                           

Test accuracy:  0.9474609375




## 6. Gradio App
The app is located under [this link](https://huggingface.co/spaces/felix-g-k/PDL_ASS_IV).