<h1><center> Group Assignment: Bird Classifier  </center></h1>

In [285]:
# Import PyTorch and supporting libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms

from PIL import Image

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [362]:
# Use GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [364]:
# Step 1: Load in data sets and set variables

# Load the CSV file
# train data
train_csv_path = "train_images.csv"
train_label_df = pd.read_csv(train_csv_path)

# test data
test_csv_path = "test_images_path.csv"  # Update this path
test_label_df = pd.read_csv(test_csv_path)



batch_size = 32
learning_rate = 0.001
num_epochs = 20

In [366]:
# Step 2: Create a Custom Dataset

class ImageLabelDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (str): Path to the CSV file containing image paths and labels.
            transform (callable, optional): Transformations to apply to the images.
        """
        self.data = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        # Return the total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        # Get the image path and label for the given index
        img_path = "./images/"+self.data.iloc[idx, 0]  # Image file path
        label = self.data.iloc[idx, 1]    # Corresponding label

        # Load the image
        image = Image.open(img_path).convert("RGB")  # Ensure 3 channels (RGB)

        # Apply transformations (if any)
        if self.transform:
            image = self.transform(image)
        
        # Convert label to tensor
        label = torch.tensor(label, dtype=torch.long)

        return image, label


In [368]:
# Step 3: Define Transformations

# Define transformations (resize, convert to tensor, normalize)
transform = transforms.Compose([
    transforms.Resize( (128, 128)),  # Resize to 128x128
    transforms.ToTensor(),          # Convert to tensor
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize values
])

In [370]:
# Step 5: Instantiate the Dataset and DataLoader

# Instantiate the dataset
train_dataset = ImageLabelDataset(csv_file=train_csv_path, transform=transform)
test_dataset = ImageLabelDataset(csv_file=test_csv_path, transform=transform)

# Create a DataLoader for batch processing
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False) # set batch size to 1 so that we check all 4000 images

Run the below code to see the difference if you.

In [373]:
#print(f"train dataset size: {len(train_dataset)}, train data size after data loader:  {len(train_dataloader)}")
#print(f"test dataset size: {len(test_dataset)}, test data size after data loader:  {len(test_dataloader)}") 

In [375]:
# Step 6: Verify the Dataset

# Iterate through the train DataLoader
for images, labels in train_dataloader:
    print(f"Batch of images: {images.shape}")  # Should be (batch_size, channels, height, width)
    print(f"Batch of labels: {labels}")       # Should match the batch_size
    break

# Iterate through the test DataLoader
for images, labels in test_dataloader:
    print(f"\nBatch of images: {images.shape}")  # Should be (batch_size, channels, height, width)
    print(f"Batch of labels: {labels}")       # Should match the batch_size
    break

Batch of images: torch.Size([32, 3, 128, 128])
Batch of labels: tensor([ 95, 103,  91,   6, 133, 102, 185,  25,  10, 106, 165,   5,   6,   6,
         84,  61, 120,  46,  42,  32, 103,  71, 195,  99,  58,  26, 159, 200,
         37,  32, 115,  91])

Batch of images: torch.Size([1, 3, 128, 128])
Batch of labels: tensor([1])


In [377]:
# Step 6: Create CNN model
## COMPLICATED MODEL! ##

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        # Define layers
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)  # Input channels: 3, Output channels: 16
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1) # Input: 16, Output: 32
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)                  # Downsample by 2x
        #self.fc1 = nn.Linear(32 * 32 * 32, 128)                              # Fully connected layer
        #self.fc2 = nn.Linear(128, 200)  # Output layer (200 classes)
        self.fc1 = nn.Linear(32 * 32 * 32, 128)
        self.dropout = nn.Dropout(0.5)  # Drop 50% of nodes during training
        self.fc2 = nn.Linear(128, 200)

    
    def forward(self, x):
        
        #print(f"Input shape: {x.shape}")  # Debugging input shape
        x = self.pool(torch.relu(self.conv1(x)))  # Conv1 -> ReLU -> MaxPool
        #print(f"After Conv1: {x.shape}")
        x = self.pool(torch.relu(self.conv2(x)))  # Conv2 -> ReLU -> MaxPool
        #print(f"After Conv2: {x.shape}")
        x = x.view(x.size(0), -1)                # Flatten for fully connected layers
        #print(f"After Flatten: {x.shape}")
        #x = torch.relu(self.fc1(x))              # FC1 -> ReLU
        #print(f"After Fully Connected: {x.shape}")
        #x = self.fc2(x) # FC2 (logits output)

        x = torch.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout
        x = self.fc2(x)

        return x

# Instantiate the model and move it to the appropriate device
model = CNN().to(device)

In [379]:
# Step 6: Create CNN model
## SIMPLIFIED MODEL! ##

class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        # Define layers
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, stride=1, padding=1)  # Reduce filters
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1)  # Reduce filters
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # Downsample by 2x
        #self.dropout = nn.Dropout(0.5)  # Add dropout to prevent overfitting
        self.fc1 = nn.Linear(16 * 32 * 32, 64)  # Reduce size of FC layer
        self.fc_combined = nn.Linear(64, 200)  # Output layer (200 classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # Conv1 -> ReLU -> Pool
        x = self.pool(F.relu(self.conv2(x)))  # Conv2 -> ReLU -> Pool
        x = x.view(x.size(0), -1)  # Flatten for fully connected layers
        x = F.relu(self.fc1(x))  # FC1 -> ReLU
        #x = self.dropout(x)  # Apply dropout
        x = self.fc2(x)

        
        return x

Simplemodel = SimpleCNN().to(device)


In [381]:
# Step 7: Set loss and optimiser function

# Loss function: CrossEntropyLoss (good for classification tasks)
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam optimizer with a learning rate
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

In [383]:
# Step 8: Training loop

total = 0
correct = 0

for epoch in range(num_epochs):
    Simplemodel.train()  # Set model to training mode
    running_loss = 0.0

    for images, labels in train_dataloader:
        # Move data to device
        images, labels = images.to(device), labels.to(device)

        # Adjust labels to be in range [0, num_classes-1]
        adjusted_labels = labels - 1  # Assuming labels start from 1
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, adjusted_labels) # to combat the out of bounds issue I set labels - 1
        _, predicted = torch.max(outputs, 1)  # Get class with highest score

        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Accumulate loss for display
        running_loss += loss.item()

        total += labels.size(0)
        correct += (predicted + 1 == labels).sum().item()
        epoch_accuracy = 100 * correct / total
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_dataloader):.4f}, Accuracy: {epoch_accuracy:.2f}%")


Epoch [1/20], Loss: 5.2920, Accuracy: 0.43%
Epoch [2/20], Loss: 5.1906, Accuracy: 0.75%
Epoch [3/20], Loss: 5.0252, Accuracy: 1.45%
Epoch [4/20], Loss: 4.7496, Accuracy: 2.18%
Epoch [5/20], Loss: 4.3631, Accuracy: 3.50%
Epoch [6/20], Loss: 3.9049, Accuracy: 5.37%
Epoch [7/20], Loss: 3.4169, Accuracy: 7.76%
Epoch [8/20], Loss: 2.9618, Accuracy: 10.43%
Epoch [9/20], Loss: 2.4606, Accuracy: 13.57%
Epoch [10/20], Loss: 2.1032, Accuracy: 16.77%
Epoch [11/20], Loss: 1.7908, Accuracy: 19.99%
Epoch [12/20], Loss: 1.5680, Accuracy: 23.11%
Epoch [13/20], Loss: 1.3880, Accuracy: 26.03%
Epoch [14/20], Loss: 1.2241, Accuracy: 28.84%
Epoch [15/20], Loss: 1.1001, Accuracy: 31.50%
Epoch [16/20], Loss: 1.0176, Accuracy: 33.88%
Epoch [17/20], Loss: 0.9786, Accuracy: 36.10%
Epoch [18/20], Loss: 0.8827, Accuracy: 38.22%
Epoch [19/20], Loss: 0.8526, Accuracy: 40.16%
Epoch [20/20], Loss: 0.8144, Accuracy: 41.94%


In [389]:
#Step 9: Evaluation loop

model.eval()  # Set model to evaluation mode
predictions = []  # To store predictions
count = 0 # this keeps track of which image we are currently predicting for

with torch.no_grad():  # No need to compute gradients during evaluation
    for images, labels in test_dataloader: 
        
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)  # Get predicted class

        # Add 1 to align predicted labels with the expected range
        predicted += 1

        # get the current images' image path
        image_path = test_label_df["image_path"][count]

        # Extract the file name without the extension
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        
        # Store the image path and prediction
        predictions.append((image_name, predicted.item()))
        count += 1

# Save predictions to Excel
df = pd.DataFrame(predictions, columns=['id', 'label'])
df.to_csv("predictions.csv", index=False)


Below is a parameter log with their kaggle results.

The format will always be training batch size, learing rate and number of epochs, optimiser :-
1. batch size = 32, learning rate = 0.001, number of epochs = 15, optimizer = ADAM, Kaggle score = 0.00525
2. batch size = 32, learning rate = 0.001, number of epochs = 15, optimizer = ADAM, Kaggle score = 0.00500
3. batch size = 32, learning rate = 0.001, number of epochs = 10, optimizer = ADAM, Kaggle score = 0.00600
5. batch size = 32, learning rate = 0.001, number of epochs = 8, optimizer = ADAM, Kaggle score = 0.00525
6. batch size = 4, learning rate = 0.001, number of epochs = 20, optimizer = SDG, Kaggle score = 0.00600
7. batch size = 32, learning rate = 0.001, number of epochs = 20, optimizer = ADAM, Kaggle score = 0.00525
8. batch size = 32, learning rate = 0.001, number of epochs = 20, optimizer = ADAM(weighted), Kaggle score = 0.00750
9. (Simple)batch size = 32, learning rate = 0.001, number of epochs = 20, optimizer = ADAM, Kaggle score = 0.00425
10. Resnet - Transfer learning, Kaggle score = 0.0700
11. (Simple)batch size = 32, learning rate = 0.001, number of epochs = 50, optimizer = ADAM(weighted), Kaggle score = 0.00850
12. (Simple)batch size = 32, learning rate = 0.001, number of epochs = 20, optimizer = ADAM, Kaggle score = 0.00325