## Data Preprocessing Pipeline

Data is hierarchically organized as follows: 'root/make_id/model_id/released_year/image_name.jpg'. Root is the 'image' folder of the CompCars dataset.

In [4]:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Subset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.model_selection import train_test_split

# custom dataset for hierarchical class structure from dataset.py
from dataset import CompCarsImageFolder

# set root to the image folder of CompCars dataset
root = 'data/image'  # TODO: ADAPT TO YOUR FOLDER STRUCTURE

root = '../cars_data/data/image'

# TODO: Adapt transforms to our data set
# TODO: maybe use v2 transforms: https://pytorch.org/vision/stable/transforms.html
data_transforms = {
        'train': transforms.Compose([
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) # TODO: find normalization for CompCars dataset
        ]),
        'val': transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) # TODO: find normalization for CompCars dataset
        ])
}

In [5]:
class WrapperDataset:
    def __init__(self, dataset, transform=None, target_transform=None):
        self.dataset = dataset
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        image, label = self.dataset[index]
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return image, label

    def __len__(self):
        return len(self.dataset)

def train_val_dataset(dataset, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets

### Save total CompCars dataset in a DataFolder class

In [10]:
# hierarchy=0 -> manufacturer classification; hierarchy=1 -> model classification
total_set = CompCarsImageFolder(root, hierarchy=1)  # Adjust hierarchy as needed
print(total_set.classes)
print(len(total_set.classes))

['1/1101', '1/1102', '1/1103', '1/1104', '1/1105', '1/1106', '1/1107', '1/1108', '1/1109', '1/1110', '1/1112', '1/1113', '10/1309', '100/209', '100/210', '100/211', '100/212', '100/213', '100/214', '100/215', '100/216', '100/217', '100/218', '100/219', '100/221', '100/222', '100/223', '100/226', '100/227', '100/228', '100/229', '100/230', '100/231', '100/232', '100/233', '100/234', '100/235', '100/236', '100/239', '100/240', '100/241', '100/242', '100/243', '100/247', '100/248', '100/249', '100/250', '100/251', '100/252', '101/816', '101/817', '101/818', '101/819', '101/820', '101/821', '102/253', '102/254', '102/255', '102/256', '102/257', '102/258', '102/259', '102/260', '102/261', '102/262', '102/263', '102/265', '102/266', '102/267', '102/268', '102/269', '102/271', '102/272', '102/273', '102/280', '102/281', '102/282', '102/283', '102/284', '102/285', '102/286', '102/287', '102/288', '102/289', '102/290', '102/291', '102/292', '102/293', '103/1842', '103/1843', '103/1844', '103/18

### Split in training and validation data

In [11]:

datasets = train_val_dataset(total_set)

wrapped_datasets = {
    'train': WrapperDataset(datasets['train'], transform=data_transforms['train']),
    'val': WrapperDataset(datasets['val'], transform=data_transforms['val'])
}

dataloaders = {
    'train': DataLoader(wrapped_datasets['train'], batch_size=32, shuffle=True, num_workers=4),
    'val': DataLoader(wrapped_datasets['val'], batch_size=32, shuffle=True, num_workers=4)
}

print(f"Total dataset size: {len(total_set)}")
print(f"Training dataset size: {len(datasets['train'])}")
print(f"Validation dataset size: {len(datasets['val'])}")

x, y = next(iter(dataloaders['train']))
print(f"Batch of images shape: {x.shape}")
print(f"Batch of labels shape: {y.shape}")


Total dataset size: 136726
Training dataset size: 102544
Validation dataset size: 34182
Batch of images shape: torch.Size([32, 3, 224, 224])
Batch of labels shape: torch.Size([32])
