# Library Imports

In [None]:
# General imports
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import ast
import cv2
import pandas as pd
# PyTorch imports
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms

# Prep Dataset

I made a custom dataset because the data wasn't given in a way that lent itself to a standard PyTorch dataset. I wanted to make it as easy as possible to turn the string labels into integers so my model could work with them, and I needed a way to store all of the images in a custom manner. The custom collate function was necessary because I split shape and color. I made this decision when I decided to have two fine-tuning heads in my model (I'll explain how I arrived at that decision later). The custom test set was necessary because there are no labels for the test set, so I needed a way to efficiently run the test images without dealing with the labels that my original CustomDataset expects.

If I'd had test data that was labeled and I was actually trying to optimize based on the test set accuracy, I may have performed data augmentations. However, I didn't feel this was necessary for two main reasons. First, the dataset is relatively small, and accidentally over-augmenting the dataset could've really negatively impacted the model's performance, so it wasn't a risk I wanted to take. And second, I viewed some of the images and it seemed like the shapes were already fairly spread out and in a variety of orientations.

I originally started with only labels for each shape and each color, but soon realized that it would make the model's architecture much simpler if I included a "background" class. I called this class "NONE", and filled any extra labels with that class assignment, so my model would learn to predict nothing, if no shape existed.

In [3]:
# Index -> name lookup tables; a name's position in the list is its class index.
# square = 0, circle = 1, triangle = 2, NONE = 3
# red = 0, green = 1, blue = 2, NONE = 3
# 'NONE' (index 3) is the padding class for label slots that hold no shape.
LABELS = ['square', 'circle', 'triangle', 'NONE']
COLORS = ['red', 'green', 'blue', 'NONE']

def customCollate(batch):
    """Merge a list of (image, label_dict) samples into one batch.

    Args:
        batch: Sequence of samples. Element 0 of each sample is an image
            tensor; element 1 is a dict holding a "shape" tensor and a
            "color" tensor.

    Returns:
        A tuple ``(images, label_dict)``. ``images`` is every sample image
        stacked along a new leading dimension, and ``label_dict`` maps
        "shape" and "color" to the label tensors stacked the same way.
    """
    images = torch.stack([sample[0] for sample in batch])

    # Stack each label field separately because the model has one head per field.
    targets = [sample[1] for sample in batch]
    label_dict = {
        field: torch.stack([target[field] for target in targets])
        for field in ("shape", "color")
    }
    return images, label_dict


class CustomDataset(Dataset):
    """Labeled image dataset yielding an image plus padded shape/color class indices.

    Image files are listed from ``input_dir`` and ordered by the integer found
    between the first "_" and the following "." of each file name. Labels come
    from ``label_csv``: the first row is skipped as a header and column 1 of
    every remaining row is parsed with ``ast.literal_eval`` into a sequence of
    ``(shape, color)`` string pairs. Image ``i`` is paired with label row ``i``,
    so the CSV rows must follow the same numeric order as the sorted files.

    Args:
        input_dir: Directory containing the image files.
        label_csv: Path of the CSV file holding the labels.
        transform: Callable applied to the RGB image array.
        max_shapes: Number of label slots returned per image.
    """

    # Class index that pads label slots holding no shape (the "NONE" class).
    NONE_INDEX = 3

    def __init__(self, input_dir, label_csv, transform=transforms.ToTensor(), max_shapes=10):
        self.max_shapes = max_shapes
        self.transform = transform
        self.label_to_num = {"square": 0, "circle": 1, "triangle": 2}
        self.color_to_num = {"red": 0, "green": 1, "blue": 2}

        # Sort numerically so that position i lines up with label row i.
        files = sorted(os.listdir(input_dir), key=lambda x: int(x.split("_")[1].split(".")[0]))
        self.img_filenames = [os.path.join(input_dir, img) for img in files]

        # newline='' is how the csv module expects its input files to be opened.
        with open(label_csv, 'r', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            self.img_labels = [ast.literal_eval(row[1]) for row in reader]

    def __len__(self):
        """Return the number of image files found in the input directory."""
        return len(self.img_filenames)

    def __getitem__(self, idx):
        """Load one image and build its fixed-length label tensors.

        Returns:
            ``(image, label_dict)`` where ``image`` is the transformed RGB
            image and ``label_dict`` maps "shape" and "color" to int64 tensors
            of length ``max_shapes``; unused slots hold ``NONE_INDEX``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
            KeyError: If a label names an unknown shape or color.
        """
        img_filename = self.img_filenames[idx]
        # Only the first max_shapes entries fit into the fixed-size label tensors.
        pairs = self.img_labels[idx][:self.max_shapes]

        image = cv2.imread(img_filename)
        if image is None:
            # cv2.imread signals failure by returning None; report the path
            # here instead of failing later inside cvtColor.
            raise FileNotFoundError(f"Could not read image file: {img_filename}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image)

        shape_list = [self.label_to_num[shape] for shape, _ in pairs]
        color_list = [self.color_to_num[col] for _, col in pairs]
        padding = [self.NONE_INDEX] * (self.max_shapes - len(pairs))

        # Request int64 explicitly: cross-entropy needs long class indices, and
        # NumPy's default integer type is platform-dependent (may be 32-bit).
        label_dict = {
            "shape": torch.tensor(shape_list + padding, dtype=torch.long),
            "color": torch.tensor(color_list + padding, dtype=torch.long),
        }

        return image, label_dict

class CustomTestSet(Dataset):
    """Unlabeled image dataset yielding an image and its submission path.

    Image files are listed from ``input_dir`` and ordered by the integer found
    between the first "_" and the following "." of each file name.

    Args:
        input_dir: Directory containing the test images.
        transform: Callable applied to the RGB image array.
        max_shapes: Stored on the instance; not used by this class itself.
    """

    def __init__(self, input_dir, transform=transforms.ToTensor(), max_shapes=10):
        self.image_dir = input_dir
        self.max_shapes = max_shapes
        self.transform = transform

        # Sort numerically so the images are visited in index order.
        files = sorted(os.listdir(input_dir), key=lambda x: int(x.split("_")[1].split(".")[0]))
        self.img_filenames = [os.path.join(input_dir, img) for img in files]

    def __len__(self):
        """Return the number of image files found in the input directory."""
        return len(self.img_filenames)

    def __getitem__(self, idx):
        """Load one image and derive the path string written to the output CSV.

        Returns:
            ``(image, relative_path)`` where ``relative_path`` is the file's
            path relative to ``input_dir``, using forward slashes and prefixed
            with "test_dataset/".

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        img_filename = self.img_filenames[idx]

        image = cv2.imread(img_filename)
        if image is None:
            # cv2.imread signals failure by returning None; report the path
            # here instead of failing later inside cvtColor.
            raise FileNotFoundError(f"Could not read image file: {img_filename}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image)

        relative_path = os.path.relpath(img_filename, start=self.image_dir)
        # Normalize Windows separators so the emitted path always uses "/".
        relative_path = relative_path.replace("\\", "/")
        relative_path = f"test_dataset/{relative_path}"

        return image, relative_path


In [4]:
# Build the paths with os.path.join so they resolve on any OS; on Windows this
# produces the same backslash-separated strings as before.
train_dataset = CustomDataset(
    os.path.join("dataset_v3", "train_dataset"),
    os.path.join("dataset_v3", "train.csv"),
)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=customCollate, pin_memory=True)

# Custom Loss

I knew from the beginning that I wanted to use Cross-Entropy Loss, because I was doing multiclass classification. However, I eventually decided to perform the loss calculations separately. This decision was made when I changed my model to have two separate classification heads for shape and color. Originally, I had the total loss as a pure sum of the two losses, but since color and shape were so closely related to each other, I decided to just average the two. If I'd found that my results were significantly skewed towards shape, I would have considered weighting each loss and then performing the sum. However, I deemed that unnecessary, since my by-hand inspection of the outputs didn't show any such skew.

In [5]:
def customLoss(pred, labels):
    """Average the cross-entropy losses of the shape head and the color head.

    Args:
        pred: Dict with "shape" and "color" logit tensors whose last dimension
            indexes the classes.
        labels: Dict with "shape" and "color" tensors of integer class indices,
            one index per logit row.

    Returns:
        Scalar tensor equal to ``(shape_loss + color_loss) / 2``.
    """
    criterion = nn.CrossEntropyLoss()

    def head_loss(logits, targets):
        # Read the class count from the logits instead of hard-coding 4, and
        # cast targets to int64 because cross-entropy rejects other int types.
        flat_logits = logits.reshape(-1, logits.shape[-1])
        return criterion(flat_logits, targets.reshape(-1).long())

    shape_loss = head_loss(pred["shape"], labels["shape"])
    color_loss = head_loss(pred["color"], labels["color"])

    return (shape_loss + color_loss) / 2.0


# Model Architecture

I originally started with a model that had a ResNet50 backbone and only one classifier head on top. I attempted to train this a few times but my loss wasn't decreasing consistently. I found that the root cause was the lack of nonlinearity in my classifier, because it couldn't learn nonlinear features. So, I decided to add a second head (one for shape and one for color), and include nonlinearity. I found that having two heads, while not technically necessary (I could've just expanded the dimensions of one head to handle color and shape), made performing the forward pass and calculating the loss a lot easier.

As specified in the Kaggle instructions, this model (and the CustomDataset) can be trained and tested with a customizable number of shapes that can be predicted. I looked briefly through the train and test data, and decided on 10 as a reasonable maximum. In order to predict more/less, the model would need to be retrained with that number passed as an argument.

After deciding on a two-headed approach, and adapting my CustomDataset and customLoss function to handle that, I saw a better trend in my loss over ~10 epochs. But, it was still starting at a very high value, and not decreasing consistently (it would plateau around 0.3, after starting at 0.4). I realized that my CustomDataset was actually reading in the images in the wrong order. The call to os.listdir() was returning them in string order, not in numerical order, like the labels were. So my bad trends in loss and overall wrong output were because I was training the model on incorrect labels. I fixed that, and while I was at it, added Kaiming initialization to my weights. In general, I've learned in my ML classes that randomly initializing weights leads to better performance, and I had used Kaiming initialization before, so I went with that.

I did consider adding Dropout or a Batchnorm layer, but after Kaiming initialization and correcting the way I was handling the input images and labels, I had a much better trend in my loss, so I didn't think it was necessary.

Originally, when my loss was plateauing around 0.3, I decided to add a learning rate scheduler to my optimizer, in case I was getting stuck at a local minimum. I like to use Cosine Annealing because I've encountered issues in the past where a scheduler that steps every epoch will reduce the learning rate too significantly. I was plateauing early, so I wanted the steep drop at the beginning, and then for the learning rate to change only slightly once my loss started to get low. I think this was an advantageous addition to my training loop, but overall I know that fixing my image/labels issue is what made the most significant difference in model performance.

In [7]:
def kaimingInitWeights(seq_layer):
    """Re-initialize a linear layer in place; leave every other module alone.

    Meant to be passed to ``nn.Module.apply``. For an ``nn.Linear`` the weight
    is drawn with Kaiming-normal initialization (ReLU nonlinearity) and the
    bias, when the layer has one, is set to zero.

    Args:
        seq_layer: Module visited by ``apply``.
    """
    # Activations and containers have nothing to initialize here.
    if not isinstance(seq_layer, nn.Linear):
        return

    nn.init.kaiming_normal_(seq_layer.weight, nonlinearity='relu')

    bias = seq_layer.bias
    if bias is None:
        return
    nn.init.constant_(bias, 0)


class ShapeColorModel(nn.Module):
    """ResNet-50 feature extractor feeding two MLP heads, one for shape and one for color.

    Each head emits ``max_shapes * 4`` logits, which ``forward`` views as
    ``max_shapes`` slots of 4 class scores.

    Args:
        max_shapes: Number of prediction slots per image.
    """

    def __init__(self, max_shapes=10):
        super().__init__()
        self.max_shapes = max_shapes

        # ResNet-50 with torchvision's default weights; the final fc layer is
        # replaced by an identity so the backbone outputs (batch, 2048) features.
        resnet = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
        resnet.fc = nn.Identity()
        self.backbone = resnet
        self.backbone_dim = 2048

        # Build both heads first, then initialize them, in shape -> color order.
        logits_per_head = self.max_shapes * 4
        self.shape_head = self._make_head(self.backbone_dim, logits_per_head)
        self.color_head = self._make_head(self.backbone_dim, logits_per_head)

        for head in (self.shape_head, self.color_head):
            head.apply(kaimingInitWeights)

    @staticmethod
    def _make_head(in_features, out_features):
        """Return a 3-layer ReLU MLP mapping in_features -> 1024 -> 512 -> out_features."""
        return nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, out_features),
        )

    def forward(self, x):
        """Compute per-slot shape and color logits for a batch of images.

        Returns:
            Dict with keys "shape" and "color", each a tensor viewed as
            (batch_size, max_shapes, 4).
        """
        features = self.backbone(x)

        slot_view = (-1, self.max_shapes, 4)
        shape_logits = self.shape_head(features).view(*slot_view)
        color_logits = self.color_head(features).view(*slot_view)

        return {"shape": shape_logits, "color": color_logits}


# Training Loop

I decided to train for only 50 epochs for two main reasons. First, the dataset is small, so training for too long would likely lead to overfitting, and ruin the ability of the model to generalize to data it hasn't seen before. And second, I noticed my loss was getting very low after ~30-40 epochs. I was initially worried that it had already overfit to the training data, but upon inspection of the outputs, I found that it was actually able to generalize fairly well, so I decided to keep training at 50 epochs.

I also had access to a powerful GPU, so I made sure to run everything on CUDA, which saved me a lot of time. Moving every image and label from the CPU to GPU took a significant amount of time, but the overall difference it made when performing the backward pass was well worth it.

In [8]:
NUM_EPOCHS = 50

# Pick the device once, before it is needed, and fall back to the CPU when CUDA
# is unavailable instead of crashing in an unconditional model.cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ShapeColorModel()
model.to(device)

# Sanity check that the head parameters landed on the expected device.
print(next(model.shape_head.parameters()).device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The scheduler is stepped once per epoch, so T_max spans the whole run.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=0.00001)

model.train()

for epoch in range(NUM_EPOCHS):
    epoch_loss = []
    for batch, labels in train_dataloader:
        batch = batch.to(device, non_blocking=True)
        labels = {k: v.to(device, non_blocking=True) for k, v in labels.items()}

        optimizer.zero_grad()
        pred = model(batch)
        loss = customLoss(pred, labels)
        loss.backward()
        optimizer.step()

        # Keep only the detached value so the list does not hold on to
        # autograd history for every batch of the epoch.
        epoch_loss.append(loss.detach())

    scheduler.step()
    print(f"Avg Epoch Loss: {sum(epoch_loss) / len(epoch_loss)}")


cuda:0
Avg Epoch Loss: 0.17995864152908325
Avg Epoch Loss: 0.08428937196731567
Avg Epoch Loss: 0.06089802086353302
Avg Epoch Loss: 0.06278069317340851
Avg Epoch Loss: 0.05843469500541687
Avg Epoch Loss: 0.05334040895104408
Avg Epoch Loss: 0.045295123010873795
Avg Epoch Loss: 0.040356073528528214
Avg Epoch Loss: 0.0399656742811203
Avg Epoch Loss: 0.04457782581448555
Avg Epoch Loss: 0.04277442768216133
Avg Epoch Loss: 0.03840585798025131
Avg Epoch Loss: 0.03518952801823616
Avg Epoch Loss: 0.042878519743680954
Avg Epoch Loss: 0.04186081141233444
Avg Epoch Loss: 0.025221776217222214
Avg Epoch Loss: 0.016185007989406586
Avg Epoch Loss: 0.010437561199069023
Avg Epoch Loss: 0.009701056405901909
Avg Epoch Loss: 0.007365822326391935
Avg Epoch Loss: 0.004836840555071831
Avg Epoch Loss: 0.010315708816051483
Avg Epoch Loss: 0.007665099110454321
Avg Epoch Loss: 0.007265796419233084
Avg Epoch Loss: 0.0039217788726091385
Avg Epoch Loss: 0.0007361776079051197
Avg Epoch Loss: 0.0018080672016367316
Avg 

# Inference

For inference, I argmaxed on the dimension with the class assignments. If the model predicted background for either the shape or the color of a slot, I ignored that prediction. If the model predicted the same shape-color pair twice, I also ignored the repeated prediction (as was specified in the Kaggle instructions).

In [None]:
# os.path.join keeps the path valid on any OS (same string as before on Windows).
test_dataset = CustomTestSet(os.path.join("dataset_v3", "test_dataset"))
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)

model.eval()
# Keep the model on the same device the input batches are sent to.
model.to(device)

# Index of the background class in each lookup table; such slots are dropped.
NONE_SHAPE = LABELS.index('NONE')
NONE_COLOR = COLORS.index('NONE')

predictions = []

# Gradients are not needed for inference; skip building the autograd graph.
with torch.no_grad():
    for batch, paths in test_dataloader:
        batch = batch.to(device, non_blocking=True)

        pred = model(batch)
        # argmax over the class dimension -> arrays of (batch_size, max_shapes)
        shape_preds = pred["shape"].argmax(dim=-1).cpu().numpy()
        color_preds = pred["color"].argmax(dim=-1).cpu().numpy()

        # Walk the samples of the batch so any batch_size works, not only 1.
        for path, shapes, colors in zip(paths, shape_preds, color_preds):
            label_list = []
            for shape, color in zip(shapes, colors):
                if shape == NONE_SHAPE or color == NONE_COLOR:
                    continue
                pair = (LABELS[shape], COLORS[color])
                # Report each (shape, color) pair at most once per image.
                if pair in label_list:
                    continue
                label_list.append(pair)
            predictions.append((path, str(label_list)))

with open("test_predictions_final.csv", "w", newline='') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["img_path", "label"])
    for filename, label_string in predictions:
        writer.writerow([filename, label_string])

# Save Model

In [11]:
# Persist only the weights (state_dict), taken after model.cpu() is called.
# NOTE(review): model.cpu() presumably moves `model` itself to the CPU, so it
# would need to be sent back to the GPU before further CUDA use; please confirm.
torch.save(model.cpu().state_dict(), 'model_weights.pth')