In [7]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split

import tqdm

## Preparing the Dataset

In [None]:
IMG_SIZE = 224  # every image is resized to IMG_SIZE x IMG_SIZE


def _build_pipeline(augmentations=()):
    """Compose resize -> optional augmentations -> tensor -> normalisation.

    The resize/tensor/normalise steps are common to every split; only the
    augmentation steps in the middle differ between training and testing.
    """
    steps = [transforms.Resize((IMG_SIZE, IMG_SIZE))]
    steps.extend(augmentations)
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize([0.5], [0.5]))  # Normalize to [-1,1] range
    return transforms.Compose(steps)


# Training images additionally get a random horizontal flip and a random
# rotation of up to 20 degrees.
train_transforms = _build_pipeline(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(20),
    ]
)

# Test images are only resized and normalised (no randomness).
test_transforms = _build_pipeline()

In [1]:
# NOTE(review): both paths are machine-specific (one is still a placeholder).
# Point them at the directories containing the "train/" and "test/" class
# folders before running.
TRAIN_DATA_PATH = "/path/to/your/dataset"
TEST_DATA_PATH = "/Users/binit/PycharmProjects/FinalProject/Project_File/data"

BATCH_SIZE = 32
VAL_FRACTION = 0.2  # share of the training images held out for validation
SPLIT_SEED = 42     # keeps the train/validation partition identical across runs

# Load dataset
train_dataset = datasets.ImageFolder(root=TRAIN_DATA_PATH + "/train", transform=train_transforms)
test_dataset = datasets.ImageFolder(root=TEST_DATA_PATH + "/test", transform=test_transforms)

# Second view of the same training folder, but with the deterministic test
# pipeline: the validation subset is read through this view so its metrics are
# not perturbed by the random flip/rotation used for training.
val_source_dataset = datasets.ImageFolder(root=TRAIN_DATA_PATH + "/train", transform=test_transforms)

# Reproducible hold-out split of the training images.
num_val = int(len(train_dataset) * VAL_FRACTION)
num_train = len(train_dataset) - num_val
train_subset, val_index_subset = random_split(
    train_dataset,
    [num_train, num_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Re-use the held-out indices on the non-augmenting view.
val_dataset = torch.utils.data.Subset(val_source_dataset, val_index_subset.indices)

# Create DataLoader
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Print class names
print(f"Classes: {train_dataset.classes}")
NUM_CLASSES = len(train_dataset.classes)

## Model Building

In [2]:
class CBAM(nn.Module):
    """Channel gating followed by spatial gating of a feature map.

    The channel gate is computed from a globally average-pooled descriptor
    passed through a two-layer 1x1-conv bottleneck; the spatial gate is computed
    by a 7x7 conv over the per-pixel channel mean and channel max. The output
    has the same shape as the input, ``(B, C, H, W)``.

    Args:
        in_channels: number of channels ``C`` of the incoming feature map.
        reduction: bottleneck ratio of the channel gate. The hidden width is
            ``in_channels // reduction`` but never less than 1, so small channel
            counts do not produce a zero-width layer.

    Raises:
        ValueError: if ``reduction`` is smaller than 1.
    """

    def __init__(self, in_channels, reduction=8):
        super().__init__()
        if reduction < 1:
            raise ValueError(f"reduction must be >= 1, got {reduction}")
        hidden_channels = max(1, in_channels // reduction)

        # (B, C, H, W) -> (B, C, 1, 1) gate in (0, 1)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, hidden_channels, 1),
            nn.ReLU(),
            nn.Conv2d(hidden_channels, in_channels, 1),
            nn.Sigmoid(),
        )
        # (B, 2, H, W) -> (B, 1, H, W) gate in (0, 1); padding=3 keeps H and W
        self.spatial_attention = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` re-weighted per channel and then per spatial position."""
        # Channel attention: broadcast the (B, C, 1, 1) gate over H and W.
        x_channel = self.channel_attention(x) * x

        # Spatial attention: summarise channels at each pixel by mean and max.
        avg_out = torch.mean(x_channel, dim=1, keepdim=True)    # (B, 1, H, W)
        max_out, _ = torch.max(x_channel, dim=1, keepdim=True)  # (B, 1, H, W)
        x_spatial = torch.cat([avg_out, max_out], dim=1)        # (B, 2, H, W)
        spatial_attention = self.spatial_attention(x_spatial)   # (B, 1, H, W)

        # Broadcast the single-channel gate over all C channels.
        return x_channel * spatial_attention

In [3]:
class SelfAttention(nn.Module):
    """Residual dot-product self-attention over the spatial positions of a feature map.

    Input and output are both ``(B, C, H, W)``. Internally the ``H * W``
    positions are flattened to ``N`` and an ``(B, N, N)`` attention matrix is
    built, so memory grows quadratically with the spatial size of the input.

    Args:
        in_channels: number of channels ``C`` of the incoming feature map.
        reduction: ratio by which the query/key projections shrink the channel
            dimension. Their width is ``in_channels // reduction`` but never
            less than 1, so small channel counts do not produce a zero-width
            projection.

    Raises:
        ValueError: if ``reduction`` is smaller than 1.
    """

    def __init__(self, in_channels, reduction=8):
        super().__init__()
        if reduction < 1:
            raise ValueError(f"reduction must be >= 1, got {reduction}")
        proj_channels = max(1, in_channels // reduction)

        self.query = nn.Conv2d(in_channels, proj_channels, 1)  # (B, C', H, W)
        self.keys = nn.Conv2d(in_channels, proj_channels, 1)   # (B, C', H, W)
        self.value = nn.Conv2d(in_channels, in_channels, 1)    # (B, C, H, W)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return the attention-mixed values added to ``x`` (same shape as ``x``)."""
        batch, channels, height, width = x.size()
        num_positions = height * width

        query = self.query(x).view(batch, -1, num_positions)  # (B, C', N)
        key = self.keys(x).view(batch, -1, num_positions)     # (B, C', N)

        # attention[b, i, j] = softmax over j of <key_i, query_j>  -> (B, N, N)
        attention = self.softmax(torch.bmm(key.permute(0, 2, 1), query))

        value = self.value(x).view(batch, -1, num_positions)  # (B, C, N)
        # (B, C, N) @ (B, N, N) -> (B, C, N), then restore the spatial layout.
        out = torch.bmm(value, attention.permute(0, 2, 1)).view(batch, channels, height, width)

        # Residual connection.
        return out + x

In [4]:
class CNN_Attention(nn.Module):
    """Three-stage conv net with CBAM and self-attention before the classifier head.

    Data flow for an input of shape ``(B, 3, H, W)``:
    conv1/bn1/ReLU -> conv2/bn2/ReLU/pool -> conv3/bn3/ReLU/pool -> CBAM ->
    self-attention -> global average pool -> dropout -> linear layer.
    The 2x2 max-pool is applied twice, so the attention blocks see a
    ``(B, 128, H/4, W/4)`` feature map (floor division for odd sizes).

    Args:
        num_classes: size of the output logit vector.
    """

    def __init__(self, num_classes=7):
        super().__init__()

        # Convolutional stages; 3x3 kernels with padding=1 keep H and W unchanged.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        # Shared down-sampling layer (halves H and W each time it is applied).
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Attention blocks operating on the 128-channel feature map.
        self.cbam_block = CBAM(128)
        self.attention_block = SelfAttention(128)

        # Classifier head.
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)``."""
        stage1 = F.relu(self.bn1(self.conv1(x)))                  # (B, 32, H, W)
        stage2 = self.pool(F.relu(self.bn2(self.conv2(stage1))))  # (B, 64, H/2, W/2)
        stage3 = self.pool(F.relu(self.bn3(self.conv3(stage2))))  # (B, 128, H/4, W/4)

        attended = self.cbam_block(stage3)         # channel + spatial gating, shape kept
        attended = self.attention_block(attended)  # residual self-attention, shape kept

        pooled = self.global_avg_pool(attended)    # (B, 128, 1, 1)
        features = torch.flatten(pooled, 1)        # (B, 128)
        features = self.dropout(features)

        return self.fc(features)

In [5]:
# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the classifier head from the dataset (NUM_CLASSES) rather than a
# hard-coded 10, so labels can never fall outside the logit range.
model = CNN_Attention(num_classes=NUM_CLASSES).to(device)

# Loss function for single-label, multi-class classification
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Learning-rate scheduler (optional): multiply the LR by 0.1 every 10 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 119965


## Training

In [9]:
# NOTE(review): this cell repeats the model-setup cell earlier in the notebook.
# Running it re-creates the model, optimizer and scheduler, i.e. it discards any
# trained weights and restarts optimisation from scratch. Keep only one copy.

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Size the classifier head from the dataset (NUM_CLASSES) rather than a
# hard-coded 10, so labels can never fall outside the logit range.
model = CNN_Attention(num_classes=NUM_CLASSES).to(device)

# Loss function for single-label, multi-class classification
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Learning-rate scheduler (optional): multiply the LR by 0.1 every 10 epochs
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params}")

Total trainable parameters: 119965


In [8]:
EPOCHS = 30

for epoch in range(EPOCHS):
    ###### TRAINING PHASE ######
    model.train()  # enable dropout / batch-norm statistic updates
    running_loss = 0.0
    correct = 0
    total = 0

    # `tqdm` is imported as a module, so the progress-bar class is `tqdm.tqdm`
    # (calling the module itself raises TypeError).
    progress_bar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)

    for batches_seen, (images, labels) in enumerate(progress_bar, start=1):
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track training statistics
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        # Running mean over the batches processed so far; dividing by
        # len(train_loader) here would under-report the loss mid-epoch.
        progress_bar.set_postfix(loss=running_loss / batches_seen, acc=100 * correct / total)

    train_loss = running_loss / len(train_loader)
    train_acc = 100 * correct / total

    ###### VALIDATION PHASE ######
    model.eval()  # disable dropout, use running batch-norm statistics
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():  # no gradients needed for evaluation
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Track validation statistics
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= len(val_loader)
    val_acc = 100 * val_correct / val_total

    ###### PRINT EPOCH SUMMARY ######
    print(f"Epoch {epoch+1}/{EPOCHS} -> "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    scheduler.step()  # advance the learning-rate schedule once per epoch

NameError: name 'train_loader' is not defined

## Validation & Evaluation

In [None]:
def evaluate(model, dataloader, loss_fn=None, run_device=None, split_name="Validation"):
    """Compute mean batch loss and accuracy of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: network mapping an image batch to class logits.
        dataloader: sized iterable yielding ``(images, labels)`` batches.
        loss_fn: loss callable ``loss_fn(outputs, labels)``. Defaults to the
            notebook-level ``criterion``.
        run_device: device the batches are moved to. Defaults to the
            notebook-level ``device``.
        split_name: label used in the printed summary line.

    Returns:
        Accuracy in percent (0-100) over all samples seen.

    Raises:
        ValueError: if ``dataloader`` yields no samples.
    """
    # Fall back to the notebook globals only when nothing was passed explicitly.
    loss_fn = criterion if loss_fn is None else loss_fn
    run_device = device if run_device is None else run_device

    model.eval()  # Set to evaluation mode
    correct = 0
    total = 0
    loss_total = 0.0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(run_device), labels.to(run_device)
            outputs = model(images)

            # Compute loss
            loss_total += loss_fn(outputs, labels).item()

            # Compute accuracy: predicted class is the arg-max logit
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Avoid an opaque ZeroDivisionError for an empty loader.
        raise ValueError("evaluate() received a dataloader that yielded no samples")

    accuracy = 100 * correct / total
    print(f"{split_name} Loss: {loss_total/len(dataloader):.4f}, Accuracy: {accuracy:.2f}%")
    return accuracy

# Score the trained model on the held-out test loader; the returned accuracy
# (in percent) is shown as the cell output.
evaluate(model=model, dataloader=test_loader)

## Grad-CAM Visualization

In [None]:
def show_cam_on_image(img, mask, use_rgb=True):
    """Overlay a CAM heatmap on an image.

    Args:
        img: ``(H, W, 3)`` float image. Values are expected in [0, 1]; the
            channel order should match ``use_rgb``.
        mask: 2-D activation map with values in [0, 1]. It is resized to the
            image's height/width when the sizes differ.
        use_rgb: return the heatmap in RGB order (what matplotlib expects).
            Pass ``False`` to keep OpenCV's native BGR order.

    Returns:
        ``(H, W, 3)`` float32 overlay, scaled so that its maximum is 1.
    """
    img = np.float32(img)
    # NaNs would make the uint8 cast below undefined; treat them as "no activation".
    mask = np.nan_to_num(np.float32(mask), nan=0.0)

    if mask.shape[:2] != img.shape[:2]:
        # cv2.resize takes the target size as (width, height).
        mask = cv2.resize(mask, (img.shape[1], img.shape[0]))

    # Clip before the uint8 cast so out-of-range values do not wrap around.
    mask = np.clip(mask, 0.0, 1.0)

    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    if use_rgb:
        # applyColorMap returns BGR; convert so it matches an RGB image.
        heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    heatmap = np.float32(heatmap) / 255

    cam = heatmap + img
    peak = np.max(cam)
    if peak > 0:  # guard against dividing by zero for an all-black overlay
        cam = cam / peak
    return cam

def get_grad_cam(model, image, class_idx, target_layer=None):
    """Compute a Grad-CAM heatmap for one image and one class.

    The activations of ``target_layer`` are captured with a forward hook during
    a normal forward pass; the class score is then differentiated with respect
    to those activations. Channel weights are the spatial mean of that
    gradient, and the map is the ReLU of the weighted channel sum, upsampled to
    the input resolution and scaled to [0, 1].

    Args:
        model: network returning logits of shape ``(1, num_classes)`` for a
            single-image batch.
        image: ``(C, H, W)`` tensor, already preprocessed for the model.
        class_idx: index of the logit to explain.
        target_layer: module whose output is visualised. Defaults to
            ``model.conv3``.

    Returns:
        ``numpy.ndarray`` of shape ``(1, H, W)`` with values in [0, 1]
        (all zeros when the class has no positive evidence).

    Raises:
        RuntimeError: if ``target_layer`` is not executed during the forward pass.
    """
    model.eval()
    layer = model.conv3 if target_layer is None else target_layer

    # Run on whatever device the model lives on (falls back to the image's
    # device for a parameter-free model).
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else image.device

    captured = {}

    def _store_activation(_module, _inputs, output):
        captured["activations"] = output

    hook_handle = layer.register_forward_hook(_store_activation)
    try:
        with torch.enable_grad():
            # requires_grad on the input guarantees a graph even if the model's
            # parameters are frozen; detach() keeps the caller's tensor untouched.
            batch = image.detach().unsqueeze(0).to(run_device).requires_grad_(True)
            output = model(batch)

            if "activations" not in captured:
                raise RuntimeError("target_layer was not executed during the forward pass")
            activations = captured["activations"]  # (1, K, h, w)

            # Gradient of the class score w.r.t. the feature map. Using
            # autograd.grad leaves the parameters' .grad fields untouched.
            gradients = torch.autograd.grad(output[0, class_idx], activations)[0]
    finally:
        hook_handle.remove()

    # Compute Grad-CAM
    weights = gradients.mean(dim=(2, 3), keepdim=True)                # (1, K, 1, 1)
    cam = F.relu((weights * activations).sum(dim=1, keepdim=True))    # (1, 1, h, w)

    # Bring the map to the input resolution so it can be overlaid directly.
    cam = F.interpolate(cam, size=tuple(image.shape[-2:]), mode="bilinear", align_corners=False)
    cam = cam.squeeze(1).detach().cpu().numpy()                       # (1, H, W)

    peak = cam.max()
    if peak > 0:  # avoid 0/0 -> NaN when nothing is activated
        cam = cam / peak

    return cam

# Select a test image (arbitrary index; must be < len(test_dataset))
SAMPLE_INDEX = 10
sample_image, sample_label = test_dataset[SAMPLE_INDEX]
cam = get_grad_cam(model, sample_image, sample_label)

# The test pipeline normalised pixels with mean = std = 0.5, i.e. to [-1, 1].
# Undo that so the overlay is built from a displayable [0, 1] image.
display_image = (sample_image * 0.5 + 0.5).clamp(0, 1).permute(1, 2, 0).numpy()

# Display (no cmap: imshow ignores it for RGB data)
plt.imshow(show_cam_on_image(display_image, cam[0]))
plt.title(f"Grad-CAM for class: {test_dataset.classes[sample_label]}")
plt.axis("off")
plt.show()