In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, datasets
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Colab-specific step: mount the user's Google Drive at /content/drive so the
# dataset archive stored under MyDrive can be read by the next cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip "/content/drive/MyDrive/cnn_dataset.zip" -d "/content/"

In [None]:
# Training hyperparameters shared by the cells below.
batch_size = 64      # samples per mini-batch for every DataLoader
lr = 0.001           # initial learning rate handed to the Adam optimizer
epochs = 20          # number of passes over the training split
num_classes = 3      # width of the classifier's output layer
seed = 42            # same value the dataset split uses as random_state

# Seed PyTorch's random number generators so that weight initialisation and
# DataLoader shuffling repeat across "Restart & Run All".
# NOTE: some GPU kernels can still be non-deterministic, so results on CUDA may
# match only approximately between runs.
torch.manual_seed(seed)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Root folder of the extracted dataset, read through torchvision's ImageFolder.
data_dir = '/content/cnn_dataset'

# Per-channel statistics used to normalise the three image channels.
channel_mean = [0.5, 0.5, 0.5]
channel_std = [0.5, 0.5, 0.5]

# Preprocessing applied to every image: convert to a tensor, then normalise.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=channel_mean, std=channel_std)
transform = transforms.Compose([to_tensor, normalize])

full_dataset = datasets.ImageFolder(root=data_dir, transform=transform)

In [None]:
from sklearn.model_selection import train_test_split

# Class label of every sample, in dataset order; used to stratify both splits.
sample_labels = [label for _, label in full_dataset.samples]
all_indices = range(len(full_dataset))

# Step 1: hold out 15% of all samples as the test set.
train_indices, test_indices = train_test_split(
    all_indices,
    test_size=0.15,
    random_state=42,
    stratify=sample_labels,
)

# Step 2: carve the validation set out of the remaining 85%.
# 0.1765 * 0.85 is roughly 0.15, i.e. about 15% of the full dataset.
remaining_labels = [sample_labels[idx] for idx in train_indices]
train_indices, val_indices = train_test_split(
    train_indices,
    test_size=0.1765,
    random_state=42,
    stratify=remaining_labels,
)

In [None]:
# Each split is a view onto the same underlying dataset, selected by index list.
train_dataset = Subset(full_dataset, train_indices)
val_dataset = Subset(full_dataset, val_indices)
test_dataset = Subset(full_dataset, test_indices)

# Settings common to all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
class ResNeXtBottleneck(nn.Module):
    """Bottleneck residual block whose 3x3 convolution is grouped.

    The main path is 1x1 conv -> grouped 3x3 conv -> 1x1 conv. Every conv is
    followed by batch norm, and the first two are also followed by ReLU. The
    shortcut (the input, optionally passed through ``downsample``) is added to
    the main path before the final ReLU.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: base channel count; the block outputs
            ``out_channels * expansion`` channels.
        stride: stride of the grouped 3x3 convolution.
        cardinality: number of groups in the 3x3 convolution.
        base_width: channels per group when ``out_channels`` is 64; scaled
            proportionally for other values of ``out_channels``.
        downsample: optional module applied to the input to build the shortcut;
            when ``None`` the input itself is used.
    """

    # Ratio between the block's output channels and ``out_channels``.
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, cardinality=32, base_width=4, downsample=None):
        super().__init__()
        # Width of the two inner convolutions: per-group width times group count.
        per_group = int(math.floor(out_channels * (base_width / 64.0)))
        width = per_group * cardinality
        expanded = out_channels * self.expansion

        # 1x1 reduction to the inner width.
        self.conv1 = nn.Conv2d(in_channels, width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        # Grouped 3x3 convolution; carries the block's stride.
        self.conv2 = nn.Conv2d(
            width, width,
            kernel_size=3, stride=stride, padding=1,
            groups=cardinality, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(width)
        # 1x1 expansion to the block's output width.
        self.conv3 = nn.Conv2d(width, expanded, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Use the raw input as shortcut unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNeXt(nn.Module):
    """ResNeXt-style classifier: a conv stem, four residual stages, a linear head.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(in_channels, out_channels, stride, cardinality,
            base_width, downsample)``.
        layers: sequence giving the number of blocks in each of the four stages.
        cardinality: number of groups forwarded to every block.
        base_width: base width forwarded to every block.
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, block, layers, cardinality=32, base_width=4, num_classes=num_classes):
        super().__init__()
        # Channel count fed into the next block; advanced by _make_layer.
        self.in_channels = 64

        # Stem: one stride-1 3x3 convolution. The max-pool slot is a no-op.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.Identity()

        # (base channels, stride of the first block) for layer1 .. layer4.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_no, (planes, first_stride) in enumerate(stage_specs, start=1):
            stage = self._make_layer(
                block, planes, layers[stage_no - 1],
                stride=first_stride, cardinality=cardinality, base_width=base_width,
            )
            setattr(self, f"layer{stage_no}", stage)

        # Head: global average pooling followed by the classifier.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride, cardinality, base_width):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block uses ``stride`` and (when needed) a projection
        shortcut; the remaining blocks run at stride 1 with an identity shortcut.
        Updates ``self.in_channels`` to the stage's output width.
        """
        expanded = out_channels * block.expansion

        # A 1x1 conv + BN projection is required whenever the first block changes
        # the spatial size or the channel count.
        needs_projection = stride != 1 or self.in_channels != expanded
        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, expanded, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded),
            )

        stage_blocks = [block(self.in_channels, out_channels, stride, cardinality, base_width, downsample)]
        self.in_channels = expanded
        stage_blocks.extend(
            block(self.in_channels, out_channels, stride=1, cardinality=cardinality, base_width=base_width)
            for _ in range(blocks - 1)
        )
        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Map a batch of 3-channel images to ``num_classes`` logits."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = self.avgpool(x)
        return self.fc(torch.flatten(pooled, 1))

In [None]:
# Build the network (two bottleneck blocks per stage) and move it to the device.
model_resnext = ResNeXt(
    ResNeXtBottleneck,
    layers=[2, 2, 2, 2],
    cardinality=32,
    base_width=4,
    num_classes=num_classes,
).to(device)

# Loss, optimizer and learning-rate schedule used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model_resnext.parameters(),
    lr=lr,
    weight_decay=1e-4,
)
# Multiply the learning rate by 0.1 when the monitored value stops decreasing
# (patience of 5 scheduler steps).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
)

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy_pct)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients are
    disabled.

    The returned loss is the per-sample mean: each batch loss is weighted by the
    batch size, so a shorter final batch does not skew the average. This matches
    how the test-set loss is computed in the evaluation cell.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(is_training):
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            batch_n = labels.size(0)
            # criterion returns the batch mean; re-weight it by the batch size.
            loss_sum += loss.item() * batch_n
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            seen += batch_n

    return loss_sum / seen, 100.0 * correct / seen


# Per-epoch history, consumed by the plotting cells.
train_loss_list, val_loss_list = [], []
train_acc_list, val_acc_list = [], []
best_val_acc = 0.0

for epoch in range(epochs):
    avg_train_loss, train_acc = _run_epoch(model_resnext, train_loader, criterion, device, optimizer)
    avg_val_loss, val_acc = _run_epoch(model_resnext, val_loader, criterion, device)

    train_loss_list.append(avg_train_loss)
    val_loss_list.append(avg_val_loss)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

    # The plateau scheduler monitors the validation loss.
    scheduler.step(avg_val_loss)

    print(f"Epoch {epoch+1}/{epochs}: "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.2f}% || "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Checkpoint whenever validation accuracy reaches a new best.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model_resnext.state_dict(), 'resnext_best.pth')

print(f"Training Complete. Best Validation Accuracy: {best_val_acc:.2f}%")

In [None]:
# Restore the checkpoint saved at the best validation accuracy, then switch to
# eval mode before scoring the held-out test split.
model_resnext.load_state_dict(torch.load('resnext_best.pth', map_location=device, weights_only=True))
model_resnext.eval()

test_loss = 0.0
correct_test = 0
total_test = 0
all_preds, all_labels = [], []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model_resnext(images)
        batch_loss = criterion(outputs, labels).item()
        predicted = outputs.max(1)[1]

        # criterion returns the batch mean, so weight it by the batch size to
        # obtain a per-sample average after the loop.
        test_loss += batch_loss * images.size(0)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

        # Keep predictions and ground truth on the CPU for later inspection.
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

avg_test_loss = test_loss / total_test
test_acc = 100.0 * correct_test / total_test

print(f"Test Loss: {avg_test_loss:.4f} | Test Accuracy: {test_acc:.2f}%")

| Model    | Test Accuracy (%) | Test Loss |
|----------|----------|----------|
| VGG     | 93.83  | 0.1848 |
| ResNet  | 93.73  | 0.1685  |
| ResNeXt  | 93.78  | 0.2067  |


In [None]:
# Per-epoch training and validation loss recorded by the training loop.
fig, ax = plt.subplots()
ax.plot(train_loss_list, label='Train Loss')
ax.plot(val_loss_list, label='Val Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('ResNeXt Training and Validation Loss')
ax.legend()
plt.show()

In [None]:
# Per-epoch training and validation accuracy recorded by the training loop.
fig, ax = plt.subplots()
ax.plot(train_acc_list, label='Train Accuracy')
ax.plot(val_acc_list, label='Val Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy (%)')
ax.set_title('ResNeXt Training and Validation Accuracy')
ax.legend()
plt.show()

**Discussion**

- Here, the accuracy of ResNeXt (93.78%) is slightly better than that of ResNet (93.73%), although VGG still has the highest accuracy (93.83%). In Part 1 we chose ResNet as our best model because of its efficiency and slightly lower loss, which outweighed VGG's minor advantage in accuracy. The design of ResNeXt, with grouped convolutions and higher cardinality, can capture more diverse feature representations, which leads to better accuracy.

- Here, the loss of ResNeXt is clearly higher than that of the other models, but if we tune the model better it should be possible to reach higher accuracy with an even lower loss.

- The use of grouped convolutions in ResNeXt allows multiple parallel feature transformations, so it is more powerful than the single-path convolutions of ResNet. Both ResNet and ResNeXt use residual connections, which help with gradient flow and convergence, but for this dataset and our configuration there is not much change in accuracy. ResNeXt introduces the concept of cardinality, which is the number of parallel paths within a convolutional layer. If we split a 3 × 3 convolution into multiple groups, the model can learn different features more easily. This should lead to higher accuracy, but in our case the difference is very slight; the model can be tuned further to get higher accuracy.


**Challenges**

- Sensitivity to hyperparameters: we had to try different learning rates, weight decay values, and numbers of groups (cardinality).

- The architecture is more complex, so training took longer than for the previous ResNet model: about 75 minutes on a T4 GPU.

- ResNeXt also requires careful computation of the intermediate channel widths from the base width and the cardinality.

**Results**

- ResNeXt has slightly higher accuracy than ResNet, but at the cost of a higher loss. So our previous ResNet model is the better choice, since its loss is lower. This model can be improved further, but we would need to tune it more and train it for more epochs to make a better comparison.

- The results mean that even though the innovative grouped convolutions and increased cardinality in ResNeXt can be helpful, these benefits may only become significant when the dataset complexity or the training task challenges the model's representational capacity. In our experiment, the improvement in feature diversity did not lead to a significant reduction in loss, so we cannot say that ResNeXt completely outperforms ResNet, but with further tuning we expect it to show better results.



**References:**

https://arxiv.org/abs/1611.05431 <br>
https://matplotlib.org/stable/plot_types/index.html <br>
https://pytorch.org/docs/stable/index.html <br>
https://scikit-learn.org/stable/ <br>
https://pandas.pydata.org/docs/user_guide/index.html#user-guide