# CV-703, Assignment 1

## Libraries

In [26]:
import math
import os

import numpy as np
import pandas as pd
import PIL
from PIL import Image
import torch
import torch.optim as optim
import torchvision
from torchvision import transforms
import torchvision.transforms as T
from tqdm import tqdm

from datasets import CUBDataset, DOGDataset, FOODDataset
from models_to_finetune import deit_small_patch16_224



# from __future__ import print_function, division

# import numpy as np

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torch.optim import lr_scheduler

# import torchvision
# from torchvision import datasets, models, transforms
# import torchvision.transforms as T

# from PIL import Image

# import matplotlib.pyplot as plt

# import pandas as pd

# import scipy.io #for dogs dateset

# import time
# import os
# import copy

In [27]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


## Setup a dataset
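Run only the cells of the one dataset you need: every dataset section below overwrites `classes_number`, `train_loader` and `test_loader`, so the section executed last is the one used for training and testing.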

### CUB-200-2011 (Birds) Dataset

In [None]:
# Number of output classes for CUB-200-2011; later used as the size of the new classifier head.
classes_number = 200


In [28]:
# Root folder of the CUB-200-2011 dataset on the shared drive.
data_root = "/apps/local/shared/CV703/datasets/CUB/CUB_200_2011/"

# Channel-wise mean / std handed to Normalize.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to 224x224, convert to tensor, normalize.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
]
data_transform = transforms.Compose(preprocessing_steps)

train_dataset = CUBDataset(image_root_path=data_root, transform=data_transform, split="train")
test_dataset = CUBDataset(image_root_path=data_root, transform=data_transform, split="test")
print('Number of train samples:', len(train_dataset))
print('Number of test samples:', len(test_dataset))

# Batch both splits with shuffling; only the train loader drops a trailing partial batch.
loader_options = dict(batch_size=32, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_dataset, drop_last=True, **loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, drop_last=False, **loader_options)



Number of train samples: 5994
Number of test samples: 5794


In [29]:
# Number of batches produced by each loader.
tuple(map(len, (train_loader, test_loader)))

(187, 182)

In [30]:
# Peek at the first training batch only, to check the tensor shape and the labels.
for i, batch in enumerate(train_loader):
    inputs, labels = batch
    print(inputs.shape)
    print(labels)
    print('=' * 50)
    break

torch.Size([32, 3, 224, 224])
tensor([ 85,  49, 163,  26, 170, 104, 171,  13, 160,  60,  50, 166, 121,  46,
        144, 106, 102,   6, 107, 141,  28,   9,  77,  17,  58, 108, 187, 190,
         80,  64,  88, 144])


### Stanford Dogs Dataset

In [None]:
# Number of output classes for Stanford Dogs; later used as the size of the new classifier head.
classes_number = 120

In [31]:
# Root folder of the Stanford Dogs dataset on the shared drive.
data_root = "/apps/local/shared/CV703/datasets/dog/"

# Channel-wise mean / std handed to Normalize.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to 224x224, convert to tensor, normalize.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
]
data_transform = transforms.Compose(preprocessing_steps)

train_dataset = DOGDataset(image_root_path=data_root, transform=data_transform, split="train")
test_dataset = DOGDataset(image_root_path=data_root, transform=data_transform, split="test")
print('Number of train samples:', len(train_dataset))
print('Number of test samples:', len(test_dataset))

# Batch both splits with shuffling; only the train loader drops a trailing partial batch.
loader_options = dict(batch_size=32, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_dataset, drop_last=True, **loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, drop_last=False, **loader_options)

Number of train samples: 12000
Number of test samples: 8580


In [32]:
# Number of batches produced by each loader.
tuple(map(len, (train_loader, test_loader)))

(375, 269)

In [33]:
# Peek at the first test batch only, to check the tensor shape and the labels.
for i, batch in enumerate(test_loader):
    inputs, labels = batch
    print(inputs.shape)
    print(labels)
    print('=' * 50)
    break

torch.Size([32, 3, 224, 224])
tensor([ 96,  24,   3,  86,  13,  40,  34, 105, 108,  80,  46,  43,  27,  92,
        111,  45,  36,  80,  63, 105,  73,  90,  45,   9,  40,  32,  44,  31,
         23,  99,  32,  57])


### CUB-200-2011 + Stanford Dog (concatenated) Dataset

In [77]:
# Number of output classes for the concatenated dataset: 200 (CUB) + 120 (Stanford Dogs).
classes_number = 320

In [44]:
# CUB part of the concatenated dataset.
data_root_bird = "/apps/local/shared/CV703/datasets/CUB/CUB_200_2011/"

# Channel-wise mean / std handed to Normalize.
mean_bird = (0.485, 0.456, 0.406)
std_bird = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to 224x224, convert to tensor, normalize.
bird_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean_bird, std=std_bird),
]
data_transform = transforms.Compose(bird_preprocessing_steps)

train_dataset_cub = CUBDataset(image_root_path=data_root_bird, transform=data_transform, split="train")
test_dataset_cub = CUBDataset(image_root_path=data_root_bird, transform=data_transform, split="test")
print('Number of train samples:', len(train_dataset_cub))
print('Number of test samples:', len(test_dataset_cub))

Number of train samples: 5994
Number of test samples: 5794


In [45]:
# Dog part of the concatenated dataset.
mean_dog = (0.485, 0.456, 0.406)
std_dog = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to 224x224, convert to tensor, normalize.
data_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean_dog, std=std_dog),
])

data_root_dog = "/apps/local/shared/CV703/datasets/dog/"

# Dog labels must start after the 200 CUB labels; otherwise both datasets would
# share the same label values once concatenated, and the 320-way head could not
# tell the bird classes from the dog classes.
DOG_LABEL_OFFSET = 200


class LabelOffsetDataset(torch.utils.data.Dataset):
    """Wrap a dataset yielding (image, label) pairs and add a constant to every label.

    Args:
        base_dataset: Indexable dataset whose items are (image, label) pairs.
        label_offset: Value added to each label returned by ``base_dataset``.
    """

    def __init__(self, base_dataset, label_offset):
        self.base_dataset = base_dataset
        self.label_offset = label_offset

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, index):
        image, label = self.base_dataset[index]
        return image, label + self.label_offset


train_dataset_dog = LabelOffsetDataset(
    DOGDataset(image_root_path=f"{data_root_dog}", transform=data_transform, split="train"),
    DOG_LABEL_OFFSET,
)
test_dataset_dog = LabelOffsetDataset(
    DOGDataset(image_root_path=f"{data_root_dog}", transform=data_transform, split="test"),
    DOG_LABEL_OFFSET,
)
print('Number of train samples:', len(train_dataset_dog))
print('Number of test samples:', len(test_dataset_dog))

Number of train samples: 12000
Number of test samples: 8580


In [46]:
# Merge the CUB and Stanford Dogs datasets split by split, then batch each merged split.
concat_train_dataset = torch.utils.data.ConcatDataset([train_dataset_cub, train_dataset_dog])
concat_test_dataset = torch.utils.data.ConcatDataset([test_dataset_cub, test_dataset_dog])

# Both loaders share the same batching options.
concat_loader_options = dict(batch_size=32, shuffle=True, num_workers=1, pin_memory=True)
train_loader = torch.utils.data.DataLoader(concat_train_dataset, **concat_loader_options)
test_loader = torch.utils.data.DataLoader(concat_test_dataset, **concat_loader_options)

In [47]:
# Sizes of the two training datasets and length of the merged training loader.
tuple(map(len, (train_dataset_cub, train_dataset_dog, train_loader)))

(5994, 12000, 17994)

In [48]:
# Sizes of the two test datasets and length of the merged test loader.
tuple(map(len, (test_dataset_cub, test_dataset_dog, test_loader)))

(5794, 8580, 14374)

In [49]:
# Peek at the first batch of the merged training loader only.
for i, batch in enumerate(train_loader):
    inputs, targets = batch
    print('image :: ', inputs.shape)
    print(targets)
    break

image ::  torch.Size([1, 3, 224, 224])
tensor([35])


### FoodX-251 Dataset

In [81]:
# Number of output classes for FoodX-251; later used as the size of the new classifier head.
classes_number = 251

In [82]:
# Which copy of FoodX-251 to read: "shared" (shared course folder) or "local" (home directory).
ds_type = "shared"

# The two copies differ in their root folder and in where the image folders sit below it.
if ds_type == "local":
    data_dir = "/home/u20020067/Documents/Datasets/FoodX-251"
    image_dir_pattern = "{data_dir}/{split}/{split}_set/"
elif ds_type == "shared":
    data_dir = "/apps/local/shared/CV703/datasets/FoodX/food_dataset"
    image_dir_pattern = "{data_dir}/{split}_set/"
else:
    # Stop here with a clear message instead of failing later with a NameError on train_df.
    raise ValueError(f"Unknown ds_type {ds_type!r}: choose 'local' or 'shared'")


def load_food_annotations(data_dir, split, image_dir_pattern):
    """Read the annotation CSV of one split and attach the image path to every row.

    Args:
        data_dir: Root folder of the dataset; must contain ``annot/{split}_info.csv``.
        split: Split name used in file and folder names ('train' or 'val').
        image_dir_pattern: Format string with ``{data_dir}`` and ``{split}``
            placeholders giving the folder that holds the split's images.

    Returns:
        DataFrame with the columns 'image_name', 'label' and 'path'.
    """
    image_dir = image_dir_pattern.format(data_dir=data_dir, split=split)
    annotations = pd.read_csv(f'{data_dir}/annot/{split}_info.csv', names=['image_name', 'label'])
    annotations['path'] = annotations['image_name'].map(lambda name: os.path.join(image_dir, name))
    return annotations


train_df = load_food_annotations(data_dir, 'train', image_dir_pattern)
# The 'val' annotations serve as the test set in this notebook.
test_df = load_food_annotations(data_dir, 'val', image_dir_pattern)

#test_df['path'] = test_df['image_name'].map(lambda x: os.path.join(f'./{split}/{split}_set/', x)) # original


# train_dir = '/home/u20020067/Documents/Datasets/FoodX-251/train/train_set/'
# val_dir = '/home/u20020067/Documents/Datasets/FoodX-251/val/val_set/'

# train_df = pd.read_csv('/home/u20020067/Documents/Datasets/FoodX-251/annot/train_info.csv', names= ['img_name','label'])
# train_df['path'] = train_df['img_name'].map(lambda x: os.path.join(train_dir,x))
# val_df = pd.read_csv('/home/u20020067/Documents/Datasets/FoodX-251/annot/val_info.csv', names= ['img_name','label'])
# val_df['path'] = val_df['img_name'].map(lambda x: os.path.join(val_dir,x))

In [83]:
# Build the FoodX datasets from the annotation frames.
train_dataset = FOODDataset(train_df)
test_dataset = FOODDataset(test_df)
print('Number of train samples:', len(train_dataset))
print('Number of test samples:', len(test_dataset))

# Batch both splits with shuffling; only the train loader drops a trailing partial batch.
loader_options = dict(batch_size=32, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_dataset, drop_last=True, **loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, drop_last=False, **loader_options)

Number of train samples: 118475
Number of test samples: 11994


In [84]:
# Number of samples in each split.
tuple(map(len, (train_dataset, test_dataset)))

(118475, 11994)

In [85]:
# Number of batches produced by each loader.
tuple(map(len, (train_loader, test_loader)))

(3702, 375)

In [86]:
# Peek at the first training batch only, to check the tensor shape and the labels.
for i, batch in enumerate(train_loader):
    inputs, labels = batch
    print(inputs.shape)
    print(labels)
    print('=' * 50)

    break

torch.Size([32, 3, 224, 224])
tensor([143,  33, 205, 203, 119,  50, 242, 233,  38, 107,  59,  80, 105, 183,
         39, 109, 124,   9,  36,  94,   3, 240, 144, 161,  57,  33, 138, 136,
        134, 224, 227, 217])


### FoodX (old)

In [63]:
# # Set train and test set
# mean = (0.485, 0.456, 0.406)
# std = (0.229, 0.224, 0.225)
# data_transform = transforms.Compose([
#         transforms.Resize((224, 224)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean, std=std)
#     ])
# data_root = "./dog/"
# train_dataset = DOGDataset(image_root_path=f"{data_root}", transform=data_transform, split="train")
# test_dataset = DOGDataset(image_root_path=f"{data_root}", transform=data_transform, split="test")
# print('Number of train samples:', len(train_dataset))
# print('Number of test samples:', len(test_dataset))
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, drop_last=True, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, drop_last=False)



#data_root = "/home/u20020067/Documents/Datasets/CUB_200_2011/CUB_200_2011/"

# mean = (0.485, 0.456, 0.406)
# std = (0.229, 0.224, 0.225)


# # write data transform here as per the requirement
# data_transform = transforms.Compose([
#         transforms.Resize((224, 224)),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean, std=std)
#     ])

# train_dataset = CUBDataset(image_root_path=f"{data_root}", transform=data_transform, split="train")
# test_dataset = CUBDataset(image_root_path=f"{data_root}", transform=data_transform, split="test")


# # load in into the torch dataloader to get variable batch size, shuffle 
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, drop_last=True, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, drop_last=False, shuffle=True)





# data_dir = "/apps/local/shared/CV703/datasets/FoodX/food_dataset/"

# split = 'train'
# train_df = pd.read_csv(f'{data_dir}/annot/{split}_info.csv', names= ['image_name','label'])
# train_df['path'] = train_df['image_name'].map(lambda x: os.path.join(f'./{split}/{split}_set/', x))


# split = 'val'
# val_df = pd.read_csv(f'{data_dir}/annot/{split}_info.csv', names= ['image_name','label'])
# val_df['path'] = val_df['image_name'].map(lambda x: os.path.join(f'./{split}/{split}_set/', x))


# train_dataset = FOODDataset(train_df)
# val_dataset = FOODDataset(val_df)

# # load in into the torch dataloader to get variable batch size, shuffle 
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, drop_last=True, shuffle=True)
# val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, drop_last=False, shuffle=True)



In [64]:
# #data_dir = "/home/u20020067/Documents/Datasets/FoodX/food_dataset"
# data_dir = "/home/u20020067/Documents/Datasets/FoodX-251"

# split = 'train'
# train_df = pd.read_csv(f'{data_dir}/annot/{split}_info.csv', names= ['image_name','label'])
# train_df['path'] = train_df['image_name'].map(lambda x: os.path.join(f'./{split}/{split}_set/', x))


# split = 'val'
# val_df = pd.read_csv(f'{data_dir}/annot/{split}_info.csv', names= ['image_name','label'])
# val_df['path'] = val_df['image_name'].map(lambda x: os.path.join(f'./{split}/{split}_set/', x))


# # train_dir = '/home/u20020067/Documents/Datasets/FoodX-251/train/train_set/'
# # val_dir = '/home/u20020067/Documents/Datasets/FoodX-251/val/val_set/'

# # train_df = pd.read_csv('/home/u20020067/Documents/Datasets/FoodX-251/annot/train_info.csv', names= ['img_name','label'])
# # train_df['path'] = train_df['img_name'].map(lambda x: os.path.join(train_dir,x))
# # val_df = pd.read_csv('/home/u20020067/Documents/Datasets/FoodX-251/annot/val_info.csv', names= ['img_name','label'])
# # val_df['path'] = val_df['img_name'].map(lambda x: os.path.join(val_dir,x))



In [65]:
# train_dataset = FOODDataset(train_df)
# val_dataset = FOODDataset(val_df)


# # load in into the torch dataloader to get variable batch size, shuffle 
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, drop_last=True, shuffle=True)
# val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, drop_last=False, shuffle=True)

# len(train_dataset), len(val_dataset)

## Prepare ViT for transfer learning

In [73]:
# We will use only the last class token (produced by the last block) for transfer learning.
# The model is placed on `device` rather than via a hard-coded .cuda(), so this cell
# also runs on machines without a GPU.
model = deit_small_patch16_224(pretrained=True, use_top_n_heads=4, use_patch_outputs=False).to(device)

# Freeze the backbone: no existing parameter receives gradients.
for param in model.parameters():
    param.requires_grad = False

# Replace the classifier with a fresh linear layer sized for the selected dataset.
model.head = torch.nn.Linear(in_features=model.head.in_features, out_features=classes_number)

_IncompatibleKeys(missing_keys=['head.weight', 'head.bias'], unexpected_keys=[])


In [74]:
# Initialise the new head with the model's own weight-init routine.
new_head = model.head
new_head.apply(model._init_weights)

# The head is the only part of the network that is trained.
for head_param in new_head.parameters():
    head_param.requires_grad = True

# Move the whole model, including the new head, to the selected device.
model = model.to(device)

In [75]:
# Classification loss.
criterion = torch.nn.CrossEntropyLoss()

# Adam over all model parameters, with the hyper-parameters spelled out.
adam_options = dict(lr=0.01, betas=(0.5, 0.999))
optimizer = optim.Adam(model.parameters(), **adam_options)

## Training

In [76]:
# Number of passes over the training loader.
epochs = 1
print('Training....')
for epoch in range(epochs):
    with tqdm(train_loader) as p_bar:
        for samples, targets in p_bar:
            samples = samples.to(device)
            targets = targets.to(device)

            outputs = model(samples, fine_tune=True)
            loss = criterion(outputs, targets)

            # Abort on NaN/inf loss. Raising stops the cell with a clear traceback;
            # the previous sys.exit(1) relied on a module that is never imported.
            loss_value = loss.item()
            if not math.isfinite(loss_value):
                raise RuntimeError("Loss is {}, stopping training".format(loss_value))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

Training....


100%|██████████| 3702/3702 [08:42<00:00,  7.08it/s]


## Testing

In [88]:
print('Testing....')
acc = 0    # number of correctly classified samples
total = 0  # number of samples actually evaluated

# Evaluate in inference mode without building autograd graphs, then restore
# the mode the model was in so that a later training run is unaffected.
was_training = model.training
model.eval()
with torch.no_grad(), tqdm(test_loader) as p_bar:
    for samples, targets in p_bar:
        samples = samples.to(device)
        targets = targets.to(device)

        outputs = model(samples, fine_tune=True)
        acc += torch.sum(outputs.argmax(dim=-1) == targets).item()
        total += targets.size(0)
model.train(was_training)

# Divide by the samples seen through test_loader: `test_dataset` may belong to a
# different dataset section than the loader (e.g. after the concatenated cells).
print('Accuracy:{0:.3%}'.format(acc / total))



Testing....


100%|██████████| 375/375 [00:52<00:00,  7.09it/s]

Accuracy:37.077%





## To Do
* Change number of training epochs
* Change the number of class tokens, e.g., use_top_n_heads=4, etc.