In [4]:
import torch
from torch.utils.data import DataLoader
from torch import nn
from collections import OrderedDict
from torchvision import transforms, datasets
from torch.utils.data import Subset

class BottleNeckBlock(nn.Module):
    def __init__(self, in_channels, out_channel, stride=1):
        super().__init__()
        self.layers = nn.Sequential(OrderedDict([
            ('conv1_1', nn.Conv2d(in_channels, out_channel // 4, kernel_size=(1, 1))),
            ('bn1_1', nn.BatchNorm2d(out_channel // 4)),
            ('relu1_1', nn.ReLU()),
            ('conv1_2',  nn.Conv2d(out_channel // 4, out_channel // 4, kernel_size=(3, 3), stride=stride, padding=1)),
            ('bn1_2', nn.BatchNorm2d(out_channel // 4)),
            ('relu1_2', nn.ReLU()),
            ('conv1_3', nn.Conv2d(out_channel // 4, out_channel, kernel_size=(1, 1))),
            ('bn1_3', nn.BatchNorm2d(out_channel)),

        ]))
        self.last_relu = nn.ReLU()
        if stride != 1 or in_channels != out_channel:
            self.shortcut = nn.Sequential(OrderedDict([
                ('sho_conv_1', nn.Conv2d(in_channels, out_channel, kernel_size=(1, 1), stride=stride)),
                ('sho_bn1', nn.BatchNorm2d(out_channel))
            ]))
        else:
            self.shortcut = nn.Identity()
    
    def forward(self, x):
        x = self.layers(x) + self.shortcut(x)
        x = self.last_relu(x)
        return x

class ResNet50(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(OrderedDict([
            ('conv1_1', nn.Conv2d(3, 64, kernel_size=(3, 3), stride=2)),
            ('bn1_1', nn.BatchNorm2d(64)),
            ('relu1_1', nn.ReLU()),
            ('block1', BottleNeckBlock(in_channels=64, out_channel=256)),
            ('block2', BottleNeckBlock(in_channels=256, out_channel=256)),
            ('block3', BottleNeckBlock(in_channels=256, out_channel=256)),
            ('block4', BottleNeckBlock(in_channels=256, out_channel=512, stride=2)),

            ('block5', BottleNeckBlock(in_channels=512, out_channel=512)),
            ('block6', BottleNeckBlock(in_channels=512, out_channel=512)),
            ('block7', BottleNeckBlock(in_channels=512, out_channel=512)),
            ('block8', BottleNeckBlock(in_channels=512, out_channel=1024, stride=2)),
            
            ('block9', BottleNeckBlock(in_channels=1024, out_channel=1024)),
            ('block10', BottleNeckBlock(in_channels=1024, out_channel=1024)),
            ('block11', BottleNeckBlock(in_channels=1024, out_channel=1024)),
            ('block12', BottleNeckBlock(in_channels=1024, out_channel=1024)),
            ('block13', BottleNeckBlock(in_channels=1024, out_channel=1024, stride=2)),

            ('block14', BottleNeckBlock(in_channels=1024, out_channel=2048)),
            ('block15', BottleNeckBlock(in_channels=2048, out_channel=2048)),
            ('block16', BottleNeckBlock(in_channels=2048, out_channel=2048)),

        ]))
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.4)
        self.linear = nn.Linear(2048, 100)
    
    def forward(self, x):
        x = self.layers(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [5]:
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('high')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    torch.cuda.empty_cache()

train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Mild to avoid over-distortion
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5071, 0.4865, 0.4409], std=[0.2673, 0.2564, 0.2761]),
    transforms.RandomErasing(p=0.7)  # Apply after normalization for consistency
])

test_transform = transforms.Compose([
    transforms.ToTensor(), # Moved ToTensor before Normalize (good practice)
    transforms.Normalize(mean=[0.5071, 0.4867, 0.4408], std=[0.2675, 0.2565, 0.2761])
])

# Load raw datasets
cifar_train_raw = datasets.CIFAR100(root="./data", train=True, download=True, transform=None)

train_size = int(0.9 * len(cifar_train_raw))  # 48,000

train_indices = list(range(0, train_size))
val_indices = list(range(train_size, len(cifar_train_raw)))

# Create datasets with appropriate transforms
cifar_train = Subset(
    datasets.CIFAR100(root="./data", train=True, transform=train_transform),
    train_indices
)
cifar_val = Subset(
    datasets.CIFAR100(root="./data", train=True, transform=test_transform),
    val_indices
)
# Use original test set (10,000 samples) - close to 10% of 60,000
cifar_test = datasets.CIFAR100(root="./data", train=False, transform=test_transform)

train_loader = DataLoader(
    cifar_train,  # Use directly
    batch_size=1024,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=6
)

val_loader = DataLoader(
    cifar_val,  # Use directly
    batch_size=1024,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=6
)

test_loader = DataLoader(
    cifar_test,
    batch_size=1024,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
    prefetch_factor=6
)

num_classes = 100

resnet = ResNet50().to(device)

num_epochs = 40
loss_function = nn.CrossEntropyLoss(label_smoothing=0.1)
base_lr = 4e-3

batch_scale = 1024 / 256  # 4x larger batches
scaled_lr = base_lr * batch_scale**0.5  # Square root scaling

optimizer = torch.optim.AdamW(
    resnet.parameters(),
    lr=3e-3,  # Keep this for now, let OneCycleLR handle it
    weight_decay=2e-3
)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=4e-3,                # Slightly lower peak LR for stability
    epochs=num_epochs,          # Keep 45 epochs
    steps_per_epoch=len(train_loader),
    pct_start=0.4,              # Increase warmup to 40% (18 epochs)
    anneal_strategy='cos',
    div_factor=12.0,            # Start LR = 5e-3 / 12 = 4.2e-4
    final_div_factor=400.0      # Final LR = 5e-3 / 400 = 1.25e-5
)

best_val_loss = float('inf')

for epoch in range(num_epochs):
    print(f'Starting Epoch {epoch+1}')
    resnet.train()

    current_loss = 0.0
    num_batches = 0

    for i, data in enumerate(train_loader):
        inputs, targets = data
        inputs, targets = inputs.to(device), targets.to(device)
            
        outputs = resnet(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

        current_loss += loss.item()
        num_batches += 1

        if i % 50 == 0:
            print(f'Batch {i}/{len(train_loader)}, Loss: {loss.item():.4f}')


    avg_train_loss = current_loss / num_batches
    print(f'Epoch {epoch+1} finished')
    print(f'Training - Loss: {avg_train_loss:.4f}')

    if (epoch + 1) % 2 == 0:
        resnet.eval()
        val_loss = 0.0
        val_batches = 0

        print(f'Epoch {epoch+1} finished')
        print(f'average training loss is {avg_train_loss:.4f}')

        with torch.no_grad():
            for val_data in val_loader:
                val_inputs, val_targets = val_data
                val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)  # Convert inputs to FP16

                val_outputs = resnet(val_inputs)
                val_batch_loss = loss_function(val_outputs, val_targets)

                val_loss += val_batch_loss.item()
                val_batches += 1


        avg_val_loss = val_loss / val_batches

        print(f'Epoch {epoch+1} finished')
        print(f'Training - Loss: {avg_train_loss:.4f}')
        print(f'Validation - Loss: {avg_val_loss:.4f}')

if torch.cuda.is_available():
    torch.cuda.empty_cache()


Using device: cuda
GPU Memory: 15.8 GB
Starting Epoch 1
Batch 0/44, Loss: 4.9940
Epoch 1 finished
Training - Loss: 4.7035
Starting Epoch 2
Batch 0/44, Loss: 4.5979
Epoch 2 finished
Training - Loss: 4.4422
Epoch 2 finished
average training loss is 4.4422
Epoch 2 finished
Training - Loss: 4.4422
Validation - Loss: 4.3492
Starting Epoch 3
Batch 0/44, Loss: 4.2590
Epoch 3 finished
Training - Loss: 4.1690
Starting Epoch 4
Batch 0/44, Loss: 4.1240
Epoch 4 finished
Training - Loss: 4.0634
Epoch 4 finished
average training loss is 4.0634
Epoch 4 finished
Training - Loss: 4.0634
Validation - Loss: 9.5068
Starting Epoch 5
Batch 0/44, Loss: 3.9288
Epoch 5 finished
Training - Loss: 3.9328
Starting Epoch 6
Batch 0/44, Loss: 3.8484
Epoch 6 finished
Training - Loss: 3.8032
Epoch 6 finished
average training loss is 3.8032
Epoch 6 finished
Training - Loss: 3.8032
Validation - Loss: 3.7637
Starting Epoch 7
Batch 0/44, Loss: 3.7167
Epoch 7 finished
Training - Loss: 3.6773
Starting Epoch 8
Batch 0/44, Los

In [6]:
def evaluate_test_set(model):
    model.eval()
    correct = 0
    total = 0
    
    print("Starting evaluation...")
    
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            
            # Use autocast for consistency if you trained with it
            outputs = model(images)
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy of the network on the test images: {accuracy:.2f}%')
    return accuracy

print("\n=== Running standard evaluation ===")
standard_accuracy = evaluate_test_set(resnet)   
print(f'Standard Test Accuracy: {standard_accuracy:.4f} ({standard_accuracy:.2f}%)')


=== Running standard evaluation ===
Starting evaluation...
Accuracy of the network on the test images: 63.50%
Standard Test Accuracy: 63.5000 (63.50%)
