# Setup and Imports

In [None]:
!nvidia-smi

Tue May  6 14:23:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   53C    P0             29W /   70W |     864MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!pip install ptflops tabulate



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import time
from ptflops import get_model_complexity_info

# Implementation

### Model Architecture

In [None]:
class SpatialReconstructionUnit(nn.Module):

    def __init__(self, channels, gn_groups=32, threshold=0.5):
        super().__init__()

        self.gn = nn.GroupNorm(num_groups=gn_groups, num_channels=channels)
        self.threshold = threshold

    def forward(self, x):
        N, C, H, W = x.shape
        x_norm = self.gn(x)
        gamma = self.gn.weight.detach()
        gamma = torch.clamp(gamma, min=1e-6)
        w_gamma = gamma / (gamma.sum() + 1e-6)
        mask = torch.sigmoid(w_gamma.view(1, C, 1, 1))
        mask1 = (mask > self.threshold).float()
        mask2 = (mask <= self.threshold).float()
        x1 = x * mask1
        x2 = x * mask2
        x1_recon = x1 + x2
        x2_recon = x2 + x1

        return torch.cat([x1_recon, x2_recon], dim=1)

In [None]:
class ChannelReconstructionUnit(nn.Module):

    def __init__(self, in_channels, out_channels, split_ratio=0.5, squeeze_ratio=2, gwc_groups=2):
        super().__init__()

        self.alpha = split_ratio
        C1 = int(self.alpha * in_channels)
        C2 = in_channels - C1
        C1s = C1 // squeeze_ratio
        C2s = C2 // squeeze_ratio
        self.squeeze1 = nn.Conv2d(C1, C1s, 1)
        self.squeeze2 = nn.Conv2d(C2, C2s, 1)
        self.gwc = nn.Conv2d(C1s, out_channels, 3, padding=1, groups=gwc_groups)
        self.pw1 = nn.Conv2d(C1s, out_channels, 1)
        self.pw2 = nn.Conv2d(C2s, out_channels, 1)

    def forward(self, x):
        C = x.size(1)
        C1 = int(self.alpha * C)
        x1, x2 = x[:, :C1], x[:, C1:]
        xs1, xs2 = self.squeeze1(x1), self.squeeze2(x2)
        y1 = self.gwc(xs1) + self.pw1(xs1)
        y2 = self.pw2(xs2)
        s1 = F.adaptive_avg_pool2d(y1, 1)
        s2 = F.adaptive_avg_pool2d(y2, 1)
        score = torch.cat([s1, s2], dim=1)
        attn = F.softmax(score, dim=1)
        beta1, beta2 = torch.split(attn, s1.size(1), dim=1)

        return beta1 * y1 + beta2 * y2

In [None]:
class SCConv(nn.Module):

    def __init__(self, in_channels, out_channels,
                 gn_groups=32, threshold=0.5,
                 split_ratio=0.5, squeeze_ratio=2, gwc_groups=2):
        super().__init__()

        self.sru = SpatialReconstructionUnit(in_channels, gn_groups, threshold)
        self.cru = ChannelReconstructionUnit(in_channels*2, out_channels,
                                             split_ratio, squeeze_ratio, gwc_groups)

    def forward(self, x):
        return self.cru(self.sru(x))

In [None]:
def conv1x1(in_planes, out_planes, stride=1): return nn.Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)

In [None]:
class BottleneckSC(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()

        self.conv1 = conv1x1(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.scconv = SCConv(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes*4)
        self.bn3 = nn.BatchNorm2d(planes*4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.scconv(out)))
        out = self.bn3(self.conv3(out))

        if self.downsample: identity = self.downsample(x)

        return self.relu(out + identity)

In [None]:
class ResNetSC(nn.Module):

    def __init__(self, layers, num_classes=100):
        super().__init__()

        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, 7, 2, 3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(3, 2, 1)
        self.layer1 = self._make_layer(64, layers[0])
        self.layer2 = self._make_layer(128, layers[1], stride=2)
        self.layer3 = self._make_layer(256, layers[2], stride=2)
        self.layer4 = self._make_layer(512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512*4, num_classes)

    def _make_layer(self, planes, blocks, stride=1):
        downsample = None

        if stride!=1 or self.inplanes!=planes*4:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes*4, stride),
                nn.BatchNorm2d(planes*4)
            )

        layers = [BottleneckSC(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes*4

        for _ in range(1, blocks):
            layers.append(BottleneckSC(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)
        x = self.layer1(x); x = self.layer2(x)
        x = self.layer3(x); x = self.layer4(x)
        x = self.avgpool(x); x = torch.flatten(x,1)

        return self.fc(x)

# Model Training

In [None]:
# torch.autograd.set_detect_anomaly(True)
def train_and_evaluate_10(use_scconv=False, epochs=60):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761))
    ])

    trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    testset = datasets.CIFAR10(root='./data', train=False, download=True,
                                transform=transforms.Compose([
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761))
                                ]))

    trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
    testloader = DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
    model = ResNetSC([3,4,6,3], num_classes=10).to(device) if use_scconv else models.resnet50(num_classes=10).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100,150], gamma=0.1)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

    for epoch in range(epochs):
        model.train(); running_loss=0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad(); outputs=model(inputs)
            loss = criterion(outputs, labels); loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step(); running_loss += loss.item()

        scheduler.step()

        print(f"> Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(trainloader):.4f}")

    model.eval(); correct=0; total=0

    with torch.no_grad():

        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs); _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item(); total += labels.size(0)

    return 100. * correct / total, model

In [None]:
# torch.autograd.set_detect_anomaly(True)
def train_and_evaluate_100(use_scconv=False, epochs=30):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761))
    ])

    trainset = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
    testset = datasets.CIFAR100(root='./data', train=False, download=True,
                                transform=transforms.Compose([
                                    transforms.ToTensor(),
                                    transforms.Normalize((0.5071,0.4867,0.4408),(0.2675,0.2565,0.2761))
                                ]))

    trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
    testloader = DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
    model = ResNetSC([3,4,6,3]).to(device) if use_scconv else models.resnet50(num_classes=100).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100,150], gamma=0.1)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

    for epoch in range(epochs):
        model.train(); running_loss=0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad(); outputs=model(inputs)
            loss = criterion(outputs, labels); loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step(); running_loss += loss.item()

        scheduler.step()

        print(f"> Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(trainloader):.4f}")

    model.eval(); correct=0; total=0

    with torch.no_grad():

        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs); _, preds = outputs.max(1)
            correct += preds.eq(labels).sum().item(); total += labels.size(0)

    return 100. * correct / total, model

# Driver Code

In [None]:
if __name__ == '__main__':

    from tabulate import tabulate

    print("Evaluating Baseline ResNet50...")
    start_base = time.time()
    base_acc, base_model = train_and_evaluate_10(use_scconv=False)
    end_base = time.time()

    print("\nEvaluating SCConv-ResNet50...")
    start_sc = time.time()
    scconv_acc, scconv_model = train_and_evaluate_10(use_scconv=True)
    end_sc = time.time()
    acc_gain = scconv_acc - base_acc
    time_diff = (end_sc - start_sc) - (end_base - start_base)
    time_ratio = (end_sc - start_sc) / (end_base - start_base)
    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]

    def get_stats(model):

        with torch.cuda.device(0):
            macs, params = get_model_complexity_info(model, (3, 32, 32), as_strings=False, print_per_layer_stat=False)

            return round(params / 1e6, 2), round(macs / 1e9, 2)

    base_params, base_flops = get_stats(base_model)
    sc_params, sc_flops = get_stats(scconv_model)
    rows = [
        ["ResNet-50 (Baseline)", f"{base_acc:.2f}", f"{(end_base-start_base)/60:.2f}", f"{base_params}", f"{base_flops}"],
        ["SCConv-ResNet-50", f"{scconv_acc:.2f}", f"{(end_sc-start_sc)/60:.2f}", f"{sc_params}", f"{sc_flops}"],
    ]

    print("\nSummary:\n")
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))
    print("\nComparative Analysis:")
    print(f"Accuracy Gain:      {acc_gain:+.2f}%")
    print(f"Time Increase:      {time_diff:+.2f} sec ({time_ratio:.2f}× slower)")

Evaluating Baseline ResNet50...
> Epoch 1/60, Loss: 2.2999
> Epoch 2/60, Loss: 2.0360
> Epoch 3/60, Loss: 1.8608
> Epoch 4/60, Loss: 1.6920
> Epoch 5/60, Loss: 1.5395
> Epoch 6/60, Loss: 1.4055
> Epoch 7/60, Loss: 1.2741
> Epoch 8/60, Loss: 1.1522
> Epoch 9/60, Loss: 1.0653
> Epoch 10/60, Loss: 0.9679
> Epoch 11/60, Loss: 0.8967
> Epoch 12/60, Loss: 0.8567
> Epoch 13/60, Loss: 0.8251
> Epoch 14/60, Loss: 0.7711
> Epoch 15/60, Loss: 0.7287
> Epoch 16/60, Loss: 0.7010
> Epoch 17/60, Loss: 0.6825
> Epoch 18/60, Loss: 0.6544
> Epoch 19/60, Loss: 0.6299
> Epoch 20/60, Loss: 0.6062
> Epoch 21/60, Loss: 0.5042
> Epoch 22/60, Loss: 0.4676
> Epoch 23/60, Loss: 0.4545
> Epoch 24/60, Loss: 0.4543
> Epoch 25/60, Loss: 0.4377
> Epoch 26/60, Loss: 0.4289
> Epoch 27/60, Loss: 0.4198
> Epoch 28/60, Loss: 0.4171
> Epoch 29/60, Loss: 0.4096
> Epoch 30/60, Loss: 0.4074
> Epoch 31/60, Loss: 0.3963
> Epoch 32/60, Loss: 0.3945
> Epoch 33/60, Loss: 0.3833
> Epoch 34/60, Loss: 0.3771
> Epoch 35/60, Loss: 0.37

In [None]:
if __name__ == '__main__':

    from tabulate import tabulate

    print("Evaluating Baseline ResNet50...")
    start_base = time.time()
    base_acc, base_model = train_and_evaluate_100(use_scconv=False)
    end_base = time.time()

    print("\nEvaluating SCConv-ResNet50...")
    start_sc = time.time()
    scconv_acc, scconv_model = train_and_evaluate_100(use_scconv=True)
    end_sc = time.time()
    acc_gain = scconv_acc - base_acc
    time_diff = (end_sc - start_sc) - (end_base - start_base)
    time_ratio = (end_sc - start_sc) / (end_base - start_base)
    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]

    def get_stats(model):

        with torch.cuda.device(0):
            macs, params = get_model_complexity_info(model, (3, 32, 32), as_strings=False, print_per_layer_stat=False)

            return round(params / 1e6, 2), round(macs / 1e9, 2)

    base_params, base_flops = get_stats(base_model)
    sc_params, sc_flops = get_stats(scconv_model)
    rows = [
        ["ResNet-50 (Baseline)", f"{base_acc:.2f}", f"{(end_base-start_base)/60:.2f}", f"{base_params}", f"{base_flops}"],
        ["SCConv-ResNet-50", f"{scconv_acc:.2f}", f"{(end_sc-start_sc)/60:.2f}", f"{sc_params}", f"{sc_flops}"],
    ]

    print("\nSummary:\n")
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))
    print("\nComparative Analysis:")
    print(f"Accuracy Gain:      {acc_gain:+.2f}%")
    print(f"Time Increase:      {time_diff:+.2f} sec ({time_ratio:.2f}× slower)")

Evaluating Baseline ResNet50...
> Epoch 1/30, Loss: 4.6735
> Epoch 2/30, Loss: 4.1758
> Epoch 3/30, Loss: 3.7707
> Epoch 4/30, Loss: 3.4680
> Epoch 5/30, Loss: 3.2294
> Epoch 6/30, Loss: 3.0319
> Epoch 7/30, Loss: 2.8699
> Epoch 8/30, Loss: 2.7406
> Epoch 9/30, Loss: 2.6268
> Epoch 10/30, Loss: 2.5031
> Epoch 11/30, Loss: 2.4083
> Epoch 12/30, Loss: 2.3169
> Epoch 13/30, Loss: 2.2137
> Epoch 14/30, Loss: 2.1361
> Epoch 15/30, Loss: 2.0440
> Epoch 16/30, Loss: 1.9689
> Epoch 17/30, Loss: 1.8842
> Epoch 18/30, Loss: 1.8095
> Epoch 19/30, Loss: 1.7514
> Epoch 20/30, Loss: 1.6743
> Epoch 21/30, Loss: 1.3091
> Epoch 22/30, Loss: 1.1632
> Epoch 23/30, Loss: 1.1070
> Epoch 24/30, Loss: 1.0480
> Epoch 25/30, Loss: 1.0080
> Epoch 26/30, Loss: 0.9674
> Epoch 27/30, Loss: 0.9331
> Epoch 28/30, Loss: 0.9021
> Epoch 29/30, Loss: 0.8742
> Epoch 30/30, Loss: 0.8463

Evaluating SCConv-ResNet50...
> Epoch 1/30, Loss: 4.8036
> Epoch 2/30, Loss: 4.1170
> Epoch 3/30, Loss: 3.8053
> Epoch 4/30, Loss: 3.614

# Contribution-0

### Setup and Imports

In [None]:
!nvidia-smi

Mon May 12 09:56:42 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    Off | 00000000:34:00.0 Off |                    0 |
| N/A   32C    P8              33W / 350W |      0MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install ptflops tabulate fvcore



In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from torch.amp import autocast, GradScaler
import torch
import torch._dynamo
import torch._dynamo as dynamo

import time

In [None]:
torch._dynamo.config.capture_scalar_outputs = True

# Enabling cuDNN autotuner for optimized kernels
torch.backends.cudnn.benchmark = True

# Enabling TF32 on Ampere GPUs for faster FP32 matmuls
if hasattr(torch.backends.cuda, 'matmul_precision'):
    torch.set_float32_matmul_precision('high')

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

### Implementation

##### Model Architecture

In [None]:
def lean_scconv_kernel(x, weight_dw, weight_pw):
    # depthwise + pointwise fused via torch.compile
    x = F.conv2d(x, weight_dw, padding=1, groups=weight_dw.shape[0])

    return F.conv2d(x, weight_pw)

class SCConv(nn.Module):

    def __init__(self, channels):

        super().__init__()

        self.weight_dw = nn.Parameter(torch.empty(channels, 1, 3, 3))
        self.weight_pw = nn.Parameter(torch.empty(channels, channels, 1, 1))
        nn.init.kaiming_normal_(self.weight_dw, mode='fan_out', nonlinearity='relu')
        nn.init.kaiming_normal_(self.weight_pw, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        # ensure channels-last
        x = x.contiguous(memory_format=torch.channels_last)
        out = lean_scconv_kernel(x, self.weight_dw, self.weight_pw)

        return out.contiguous(memory_format=torch.contiguous_format)

In [None]:
def conv1x1(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, 1, stride=stride, bias=False)

In [None]:
class BottleneckSC(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, use_scconv=True):

        super().__init__()

        self.conv1 = nn.Conv2d(inplanes, planes, 1, stride=stride, bias=False)
        self.bn1   = nn.BatchNorm2d(planes)
        self.relu  = nn.ReLU(inplace=True)

        # use SCConv only in later stages, based on channel size
        self.sc    = SCConv(planes) if use_scconv else nn.Conv2d(planes, planes, 3, padding=1, groups=planes, bias=False)
        self.bn2   = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3   = nn.BatchNorm2d(planes * self.expansion)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.sc(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample:
            identity = self.downsample(x)

        out += identity

        return self.relu(out)

In [None]:
class ResNetSC(nn.Module):

    def __init__(self, layers, num_classes=10, use_scconv=True):

        super().__init__()

        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, 7, 2, 3, bias=False)
        self.bn1   = nn.BatchNorm2d(64)
        self.relu  = nn.ReLU(inplace=True)
        self.maxp  = nn.MaxPool2d(3, 2, 1)

        # selectively disable SCConv in first block
        self.layer1 = self._make_layer(64, layers[0], use_scconv=False)
        self.layer2 = self._make_layer(128, layers[1], use_scconv=True, stride=2)
        self.layer3 = self._make_layer(256, layers[2], use_scconv=True, stride=2)
        self.layer4 = self._make_layer(512, layers[3], use_scconv=True, stride=2)
        self.avgp = nn.AdaptiveAvgPool2d(1)
        self.fc   = nn.Linear(512 * BottleneckSC.expansion, num_classes)

    def _make_layer(self, planes, blocks, use_scconv, stride=1):
        downsample = None

        if stride != 1 or self.inplanes != planes * BottleneckSC.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * BottleneckSC.expansion, 1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * BottleneckSC.expansion)
            )

        layers = []
        layers.append(BottleneckSC(self.inplanes, planes, stride, downsample, use_scconv))
        self.inplanes = planes * BottleneckSC.expansion

        for _ in range(1, blocks):
            layers.append(BottleneckSC(self.inplanes, planes, use_scconv=use_scconv))

        return nn.Sequential(*layers)

    def forward(self, x):
        # enforce channels-last at input
        x = x.to(memory_format=torch.channels_last)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxp(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgp(x)
        x = torch.flatten(x, 1)

        return self.fc(x)

### Model Training

In [None]:
def get_data_loaders_10():
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
    ])

    trainset = datasets.CIFAR10('./data', True, transform_train, download=True)
    testset  = datasets.CIFAR10('./data', False, transform_test, download=True)

    trainloader = DataLoader(
        trainset, 128, shuffle=True,
        num_workers=8, pin_memory=True,
        persistent_workers=True
    )

    testloader  = DataLoader(
        testset,  100, shuffle=False,
        num_workers=8, pin_memory=True,
        persistent_workers=True
    )

    return trainloader, testloader

def get_data_loaders_100():
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
    ])

    trainset = datasets.CIFAR100('./data', True, transform_train, download=True)
    testset  = datasets.CIFAR100('./data', False, transform_test, download=True)

    trainloader = DataLoader(
        trainset, 128, shuffle=True,
        num_workers=8, pin_memory=True,
        persistent_workers=True
    )

    testloader  = DataLoader(
        testset,  100, shuffle=False,
        num_workers=8, pin_memory=True,
        persistent_workers=True
    )

    return trainloader, testloader

In [None]:
def train_and_evaluate(dataset_cifar_100=False, use_scconv=False, epochs=60):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if dataset_cifar_100:
        trainloader, testloader = get_data_loaders_100()

        # initialize and compile model
        if use_scconv:
            model = ResNetSC([3,4,6,3], num_classes=100, use_scconv=True)
        else:
            model = models.resnet50(num_classes=100)

        epochs = 30
    else:
        trainloader, testloader = get_data_loaders_10()

        # initialize and compile model
        if use_scconv:
            model = ResNetSC([3,4,6,3], num_classes=10, use_scconv=True)
        else:
            model = models.resnet50(num_classes=10)

    model = model.to(device)
    # model = model.to(memory_format=torch.channels_last)

    model = torch.compile(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    # scheduler = optim.lr_scheduler.OneCycleLR(
    #     optimizer, max_lr=0.1,
    #     steps_per_epoch=len(trainloader), epochs=epochs, gamma=0.1
    # )
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        loss_accum = 0.0

        for imgs, lbls in trainloader:
            imgs = imgs.to(device=device, memory_format=torch.channels_last, non_blocking=True)
            lbls = lbls.to(device=device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)

            with autocast(device_type='cuda'):
                outputs = model(imgs)
                loss = criterion(outputs, lbls)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            loss_accum += loss.item()

        print(f"> Epoch {epoch+1}/{epochs}, Loss: {loss_accum/len(trainloader):.4f}")

    # evaluation
    model.eval()
    correct = total = 0

    with torch.no_grad():

        for imgs, lbls in testloader:
            imgs = imgs.to(device, non_blocking=True)
            lbls = lbls.to(device, non_blocking=True)

            with autocast(device_type='cuda'):
                preds = model(imgs).argmax(dim=1)

            correct += (preds == lbls).sum().item()
            total += lbls.size(0)

    return 100. * correct / total, model

### Driver Code

In [None]:
if __name__ == '__main__':

    from tabulate import tabulate

    print("Evaluating Baseline ResNet50...")
    start_base = time.time()
    base_acc, base_model = train_and_evaluate(dataset_cifar_100=False, use_scconv=False)
    end_base = time.time()

    print("\nEvaluating SCConv-ResNet50...")
    start_sc = time.time()
    scconv_acc, scconv_model = train_and_evaluate(dataset_cifar_100=False, use_scconv=True)
    end_sc = time.time()
    acc_gain = scconv_acc - base_acc
    time_diff = (end_sc - start_sc) - (end_base - start_base)
    time_ratio = (end_sc - start_sc) / (end_base - start_base)
    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]

    print()

    from fvcore.nn import FlopCountAnalysis, parameter_count

    # def get_stats_fvcore(model, input_size=(1, 3, 32, 32)):
    #     model = model.eval().cpu()
    #     inputs = torch.randn(*input_size)
    #     flops = FlopCountAnalysis(model, inputs)
    #     params = parameter_count(model)

    #     return round(params[""], 2), round(flops.total() / 1e9, 2)

    # base_params, base_flops = get_stats_fvcore(base_model)
    # sc_params, sc_flops = get_stats_fvcore(scconv_model)

    def get_stats_fvcore_from_scratch(use_scconv=False, input_size=(1, 3, 32, 32)):

        if use_scconv:
            model = ResNetSC([3, 4, 6, 3], num_classes=10)
        else:
            model = models.resnet50(num_classes=10)

        model = model.eval().cpu()
        inputs = torch.randn(*input_size)
        flops = FlopCountAnalysis(model, inputs)
        params = parameter_count(model)

        return round(params[""], 2), round(flops.total() / 1e9, 2)

    base_params, base_flops = get_stats_fvcore_from_scratch(use_scconv=False)
    sc_params, sc_flops = get_stats_fvcore_from_scratch(use_scconv=True)

    rows = [
        ["ResNet-50 (Baseline)", f"{base_acc:.2f}", f"{(end_base-start_base)/60:.2f}", f"{base_params}", f"{base_flops}"],
        ["SCConv-ResNet-50", f"{scconv_acc:.2f}", f"{(end_sc-start_sc)/60:.2f}", f"{sc_params}", f"{sc_flops}"],
    ]

    print("\nSummary:\n")
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))
    print("\nComparative Analysis:")
    print(f"Accuracy Gain:      {acc_gain:+.2f}%")
    print(f"Time Increase:      {time_diff:+.2f} sec ({time_ratio:.2f}× slower)")

Evaluating Baseline ResNet50...




> Epoch 1/60, Loss: 2.5179
> Epoch 2/60, Loss: 2.3780
> Epoch 3/60, Loss: 2.3698
> Epoch 4/60, Loss: 2.3769
> Epoch 5/60, Loss: 2.3660
> Epoch 6/60, Loss: 2.3681
> Epoch 7/60, Loss: 2.3751
> Epoch 8/60, Loss: 2.3744
> Epoch 9/60, Loss: 2.3699
> Epoch 10/60, Loss: 2.3800
> Epoch 11/60, Loss: 2.3746
> Epoch 12/60, Loss: 2.3791
> Epoch 13/60, Loss: 2.3681
> Epoch 14/60, Loss: 2.3737
> Epoch 15/60, Loss: 2.3782
> Epoch 16/60, Loss: 2.3748
> Epoch 17/60, Loss: 2.3785
> Epoch 18/60, Loss: 2.3804
> Epoch 19/60, Loss: 2.3698
> Epoch 20/60, Loss: 2.3785
> Epoch 21/60, Loss: 2.3760
> Epoch 22/60, Loss: 2.3765
> Epoch 23/60, Loss: 2.3758
> Epoch 24/60, Loss: 2.3799
> Epoch 25/60, Loss: 2.3680
> Epoch 26/60, Loss: 2.3740
> Epoch 27/60, Loss: 2.3659
> Epoch 28/60, Loss: 2.3800
> Epoch 29/60, Loss: 2.3735
> Epoch 30/60, Loss: 2.3779
> Epoch 31/60, Loss: 2.3793
> Epoch 32/60, Loss: 2.3803
> Epoch 33/60, Loss: 2.3765
> Epoch 34/60, Loss: 2.3777
> Epoch 35/60, Loss: 2.3705
> Epoch 36/60, Loss: 2.3721
>



> Epoch 1/60, Loss: 2.4098
> Epoch 2/60, Loss: 2.3188
> Epoch 3/60, Loss: 2.3120
> Epoch 4/60, Loss: 2.3077
> Epoch 5/60, Loss: 2.3124
> Epoch 6/60, Loss: 2.2995
> Epoch 7/60, Loss: 2.2961
> Epoch 8/60, Loss: 2.3126
> Epoch 9/60, Loss: 2.3217
> Epoch 10/60, Loss: 2.3117
> Epoch 11/60, Loss: 2.3110
> Epoch 12/60, Loss: 2.2995
> Epoch 13/60, Loss: 2.3014
> Epoch 14/60, Loss: 2.3135
> Epoch 15/60, Loss: 2.2900
> Epoch 16/60, Loss: 2.3030
> Epoch 17/60, Loss: 2.3023
> Epoch 18/60, Loss: 2.3220
> Epoch 19/60, Loss: 2.3051
> Epoch 20/60, Loss: 2.3066
> Epoch 21/60, Loss: 2.3188
> Epoch 22/60, Loss: 2.3160
> Epoch 23/60, Loss: 2.3124
> Epoch 24/60, Loss: 2.3198
> Epoch 25/60, Loss: 2.3084
> Epoch 26/60, Loss: 2.3094
> Epoch 27/60, Loss: 2.3060
> Epoch 28/60, Loss: 2.3095
> Epoch 29/60, Loss: 2.3054
> Epoch 30/60, Loss: 2.3062
> Epoch 31/60, Loss: 2.3114
> Epoch 32/60, Loss: 2.3061
> Epoch 33/60, Loss: 2.2920
> Epoch 34/60, Loss: 2.3140
> Epoch 35/60, Loss: 2.2983
> Epoch 36/60, Loss: 2.3078
>

Unsupported operator aten::max_pool2d encountered 1 time(s)
Unsupported operator aten::add_ encountered 16 time(s)
Unsupported operator aten::max_pool2d encountered 1 time(s)
Unsupported operator aten::add_ encountered 16 time(s)



Summary:

╒══════════════════════╤════════════════╤═══════════════════════╤══════════════╤═════════════╕
│ Model                │   Accuracy (%) │   Training Time (min) │   Params (M) │   FLOPs (G) │
╞══════════════════════╪════════════════╪═══════════════════════╪══════════════╪═════════════╡
│ ResNet-50 (Baseline) │          17.52 │                  9.01 │     23528522 │        0.08 │
├──────────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ SCConv-ResNet-50     │          20.84 │                  8.85 │     13490442 │        0.05 │
╘══════════════════════╧════════════════╧═══════════════════════╧══════════════╧═════════════╛

Comparative Analysis:
Accuracy Gain:      +3.32%
Time Increase:      -9.57 sec (0.98× slower)


In [None]:
if __name__ == '__main__':

    from tabulate import tabulate

    print("Evaluating Baseline ResNet50...")
    start_base = time.time()
    base_acc, base_model = train_and_evaluate(dataset_cifar_100=True, use_scconv=False)
    end_base = time.time()

    print("\nEvaluating SCConv-ResNet50...")
    start_sc = time.time()
    scconv_acc, scconv_model = train_and_evaluate(dataset_cifar_100=True, use_scconv=True)
    end_sc = time.time()
    acc_gain = scconv_acc - base_acc
    time_diff = (end_sc - start_sc) - (end_base - start_base)
    time_ratio = (end_sc - start_sc) / (end_base - start_base)
    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]

    print()

    from fvcore.nn import FlopCountAnalysis, parameter_count

    # def get_stats_fvcore(model, input_size=(1, 3, 32, 32)):
    #     model = model.eval().cpu()
    #     inputs = torch.randn(*input_size)
    #     flops = FlopCountAnalysis(model, inputs)
    #     params = parameter_count(model)

    #     return round(params[""], 2), round(flops.total() / 1e9, 2)

    # base_params, base_flops = get_stats_fvcore(base_model)
    # sc_params, sc_flops = get_stats_fvcore(scconv_model)

    def get_stats_fvcore_from_scratch(use_scconv=False, input_size=(1, 3, 32, 32)):

        if use_scconv:
            model = ResNetSC([3, 4, 6, 3], num_classes=100)
        else:
            model = models.resnet50(num_classes=10)

        model = model.eval().cpu()
        inputs = torch.randn(*input_size)
        flops = FlopCountAnalysis(model, inputs)
        params = parameter_count(model)

        return round(params[""], 2), round(flops.total() / 1e9, 2)

    base_params, base_flops = get_stats_fvcore_from_scratch(use_scconv=False)
    sc_params, sc_flops = get_stats_fvcore_from_scratch(use_scconv=True)

    rows = [
        ["ResNet-50 (Baseline)", f"{base_acc:.2f}", f"{(end_base-start_base)/60:.2f}", f"{base_params}", f"{base_flops}"],
        ["SCConv-ResNet-50", f"{scconv_acc:.2f}", f"{(end_sc-start_sc)/60:.2f}", f"{sc_params}", f"{sc_flops}"],
    ]

    print("\nSummary:\n")
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))
    print("\nComparative Analysis:")
    print(f"Accuracy Gain:      {acc_gain:+.2f}%")
    print(f"Time Increase:      {time_diff:+.2f} sec ({time_ratio:.2f}× slower)")

Evaluating Baseline ResNet50...




> Epoch 1/30, Loss: 4.7570
> Epoch 2/30, Loss: 4.7140
> Epoch 3/30, Loss: 4.7128
> Epoch 4/30, Loss: 4.7124
> Epoch 5/30, Loss: 4.7181
> Epoch 6/30, Loss: 4.7098
> Epoch 7/30, Loss: 4.7175
> Epoch 8/30, Loss: 4.7104
> Epoch 9/30, Loss: 4.7101
> Epoch 10/30, Loss: 4.7127
> Epoch 11/30, Loss: 4.7144
> Epoch 12/30, Loss: 4.7143
> Epoch 13/30, Loss: 4.7170
> Epoch 14/30, Loss: 4.7132
> Epoch 15/30, Loss: 4.7120
> Epoch 16/30, Loss: 4.7132
> Epoch 17/30, Loss: 4.7151
> Epoch 18/30, Loss: 4.7103
> Epoch 19/30, Loss: 4.7101
> Epoch 20/30, Loss: 4.7194
> Epoch 21/30, Loss: 4.7091
> Epoch 22/30, Loss: 4.7086
> Epoch 23/30, Loss: 4.7186
> Epoch 24/30, Loss: 4.7122
> Epoch 25/30, Loss: 4.7119
> Epoch 26/30, Loss: 4.7142
> Epoch 27/30, Loss: 4.7119
> Epoch 28/30, Loss: 4.7152
> Epoch 29/30, Loss: 4.7141
> Epoch 30/30, Loss: 4.7131

Evaluating SCConv-ResNet50...
> Epoch 1/30, Loss: 4.8543
> Epoch 2/30, Loss: 4.8073
> Epoch 3/30, Loss: 4.8005
> Epoch 4/30, Loss: 4.8073
> Epoch 5/30, Loss: 4.8044
> E

Unsupported operator aten::max_pool2d encountered 1 time(s)
Unsupported operator aten::add_ encountered 16 time(s)
Unsupported operator aten::max_pool2d encountered 1 time(s)
Unsupported operator aten::add_ encountered 16 time(s)



Summary:

╒══════════════════════╤════════════════╤═══════════════════════╤══════════════╤═════════════╕
│ Model                │   Accuracy (%) │   Training Time (min) │   Params (M) │   FLOPs (G) │
╞══════════════════════╪════════════════╪═══════════════════════╪══════════════╪═════════════╡
│ ResNet-50 (Baseline) │           2.26 │                  4.27 │     23528522 │        0.08 │
├──────────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ SCConv-ResNet-50     │           2.95 │                  4.19 │     13674852 │        0.05 │
╘══════════════════════╧════════════════╧═══════════════════════╧══════════════╧═════════════╛

Comparative Analysis:
Accuracy Gain:      +0.69%
Time Increase:      -4.95 sec (0.98× slower)


# Contribution-1

**Note:** This contribution is tied to the official CovNeXt V2 repository. Therefore, if you need to work with this, you first need to clone that repo. and then work the models, by embedding the SSConv plug and module.

### Setup and Imports

In [1]:
!nvidia-smi

Mon May 12 13:58:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip3 install timm fvcore

Collecting fvcore
  Downloading fvcore-0.1.5.post20221221.tar.gz (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m529.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath>=0.1.7->fvcore)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12

In [3]:
# !git clone https://github.com/facebookresearch/ConvNeXt-V2.git

# Note: Only once required

In [4]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# %cd ConvNeXt-V2
%cd /content/drive/MyDrive/ann_project/src/ConvNeXt-V2

/content/drive/MyDrive/ann_project/src/ConvNeXt-V2


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.amp import autocast, GradScaler
from torch.profiler import profile, ProfilerActivity
import torch, torch.nn as nn, torch.nn.functional as F

from fvcore.nn import FlopCountAnalysis, parameter_count
from tabulate import tabulate

from models.convnextv2 import ConvNeXtV2

import time



In [11]:
# Enabling cuDNN autotuner and TF32
torch.backends.cudnn.benchmark = True
if hasattr(torch.backends.cuda, 'matmul_precision'):
    torch.set_float32_matmul_precision('high')

In [12]:
# For, commenting-out unecessary code-blocks; however, the full commenting out of the classes are to be done manually, otherwise we get a syntax error.
utils_path = 'models/utils.py'

with open(utils_path, 'r') as f:
    lines = f.readlines()

with open(utils_path, 'w') as f:

    for line in lines:

        if 'Minkowski' in line:
            f.write(f"# {line}")  # comment out
        else:
            f.write(line)

### Implementation

##### For, testing

In [13]:
# For, testing
import importlib
import models.convnextv2
# import models.scconv
# importlib.reload(models.scconv)
importlib.reload(models.convnextv2)

<module 'models.convnextv2' from '/content/drive/MyDrive/ann_project/src/ConvNeXt-V2/models/convnextv2.py'>

In [14]:
# For, testing
from models.convnextv2 import ConvNeXtV2

model = ConvNeXtV2(depths=[1,1,1,1], dims=[32,64,128,256],
                   num_classes=10, use_scconv=True)
x = torch.randn(2, 3, 32, 32)
y = model(x)
print("Output:", y.shape)

Output: torch.Size([2, 10])


In [15]:
# For, testing
from models.convnextv2 import ConvNeXtV2

# Instantiate with and without SCConv
m_base = ConvNeXtV2(depths=[1,1,1,1], dims=[32,64,128,256],
                    num_classes=10, use_scconv=False)
m_sc   = ConvNeXtV2(depths=[1,1,1,1], dims=[32,64,128,256],
                    num_classes=10, use_scconv=True)

# Grab the very first Block in stage 0
blk_base = m_base.stages[0][0]
blk_sc   = m_sc.stages[0][0]

print("Baseline dwconv type:", type(blk_base.dwconv))
# Expect: <class 'torch.nn.modules.conv.Conv2d'>

print("SCConv dwconv type:", type(blk_sc.dwconv))
# Expect: <class 'models.scconv.SCConv'>

Baseline dwconv type: <class 'torch.nn.modules.conv.Conv2d'>
SCConv dwconv type: <class 'models.convnextv2.SCConv'>


In [16]:
# For, testing
def count_scconv(m):
    return sum(1 for module in m.modules() if module.__class__.__name__ == "SCConv")

print("SCConv count in baseline:", count_scconv(m_base))  # should be 0
print("SCConv count in patched model:", count_scconv(m_sc))  # should equal total blocks


SCConv count in baseline: 0
SCConv count in patched model: 4


In [17]:
# For, testing
import torchvision.transforms as T
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
import torch.optim as optim

# transforms
transform = T.Compose([T.RandomCrop(32,4), T.RandomHorizontalFlip(),
                       T.ToTensor(), T.Normalize((.5,)*3,(.5,)*3)])

trainset = CIFAR10('./data', True, transform, download=True)
trainloader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

# use SCConv model
model = ConvNeXtV2(depths=[2,2,2,2], dims=[64,128,256,512],
                   num_classes=10, use_scconv=True).cuda()
opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

# one training epoch
model.train()
for imgs, lbls in trainloader:
    imgs, lbls = imgs.cuda(), lbls.cuda()
    opt.zero_grad()
    logits = model(imgs)
    loss = criterion(logits, lbls)
    loss.backward()
    opt.step()
print("Done one epoch.")

Done one epoch.


### Model Training

In [22]:
def get_data_loaders(batch_size=256):
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5071, 0.4867, 0.4408),
                             (0.2675, 0.2565, 0.2761)),
    ])

    trainset = datasets.CIFAR10('./data', True, transform_train, download=True)
    testset  = datasets.CIFAR10('./data', False, transform_test, download=True)

    loader_args = dict(batch_size=batch_size,
                       num_workers=2,
                       pin_memory=True,
                       persistent_workers=True)

    trainloader = DataLoader(trainset, shuffle=True, **loader_args)
    testloader  = DataLoader(testset, shuffle=False, **loader_args)

    return trainloader, testloader

In [24]:
def train_and_evaluate(use_scconv=False, epochs=5):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    trainloader, testloader = get_data_loaders(256)

    model = ConvNeXtV2(
        in_chans=3,
        num_classes=10,
        depths=[2,2,6,2],
        dims=[64,128,256,512],
        drop_path_rate=0.1,
        use_scconv=use_scconv
    ).to(device).to(memory_format=torch.channels_last)
    model = torch.compile(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=0.1,
        steps_per_epoch=len(trainloader), epochs=epochs
    )
    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs = inputs.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)

            with autocast(device_type='cuda'):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            scaler.step(optimizer)
            scaler.update()
            running_loss += loss.item()

        scheduler.step()

        print(f"> Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(trainloader):.4f}")

    # Profiling example on a single batch
    # inputs, labels = next(iter(testloader))
    # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
    #     model(inputs.to(device).to(memory_format=torch.channels_last))
    # print(prof.key_averages().table(sort_by="cuda_time_total"))

    model.eval()
    correct = total = 0

    with torch.no_grad():

        for inputs, labels in testloader:
            inputs = inputs.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            labels = labels.to(device, non_blocking=True)

            with autocast(device_type='cuda'):
                preds = model(inputs).argmax(dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return 100. * correct / total, model

In [28]:
def train_and_evaluate_model(model, epochs=5):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device).to(memory_format=torch.channels_last)
    model = torch.compile(model)

    trainloader, testloader = get_data_loaders(256)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1,
                          momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=0.1,
        steps_per_epoch=len(trainloader), epochs=epochs
    )
    scaler = GradScaler()

    for epoch in range(epochs):
        model.train()
        for inputs, labels in trainloader:
            inputs = inputs.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            with autocast(device_type='cuda'):
                loss = criterion(model(inputs), labels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            scaler.step(optimizer)
            scaler.update()
        scheduler.step()

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device).to(memory_format=torch.channels_last)
            labels = labels.to(device)
            with autocast(device_type='cuda'):
                preds = model(inputs).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return 100. * correct / total, model

### Driver Code

In [None]:
if __name__ == '__main__':
    from tabulate import tabulate

    print("Evaluating ConvNeXtV2 (Baseline)...")
    start_base = time.time()
    base_acc, base_model = train_and_evaluate(use_scconv=False)
    end_base = time.time()

    print("\nEvaluating ConvNeXtV2 + SCConv...")
    start_sc = time.time()
    sc_acc, sc_model = train_and_evaluate(use_scconv=True)
    end_sc = time.time()

    print()

    # Training times
    base_time = (end_base - start_base)
    sc_time   = (end_sc   - start_sc)
    base_time_min = base_time / 60
    sc_time_min   = sc_time / 60

    # Accuracy gain and timing stats
    acc_gain  = sc_acc - base_acc
    time_diff = sc_time - base_time
    time_ratio = sc_time / base_time if base_time != 0 else float('inf')

    from fvcore.nn import FlopCountAnalysis, parameter_count
    import torch

    def stats(model):
        model.eval()
        if hasattr(model, "_orig_mod"):  # for torch.compile() models
            model = model._orig_mod
        dummy = torch.randn(1, 3, 32, 32).to(next(model.parameters()).device)
        with torch.no_grad():
            flops = FlopCountAnalysis(model.cpu(), dummy.cpu())
            params = parameter_count(model.cpu())
        return round(params[""], 2), round(flops.total() / 1e9, 2)

    base_params, base_flops = stats(base_model)
    sc_params, sc_flops     = stats(sc_model)

    rows = [
        ["ConvNeXtV2 (Baseline)", f"{base_acc:.2f}", f"{base_time_min:.2f}", base_params, base_flops],
        ["ConvNeXtV2 + SCConv",   f"{sc_acc:.2f}",   f"{sc_time_min:.2f}",   sc_params,   sc_flops],
    ]

    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]

    print("\nSummary:\n")
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))

    print("\nComparative Analysis:")
    print(f"Accuracy Gain:      {acc_gain:+.2f}%")
    print(f"Time Increase:      {time_diff:+.2f} sec ({time_ratio:.2f}× slower)")


Evaluating ConvNeXtV2 (Baseline)...


> Epoch 1/5, Loss: 1.8602
> Epoch 2/5, Loss: 1.6779
> Epoch 3/5, Loss: 1.6130
> Epoch 4/5, Loss: 1.5772
> Epoch 5/5, Loss: 1.5450

Evaluating ConvNeXtV2 + SCConv...
> Epoch 1/5, Loss: 1.8789
> Epoch 2/5, Loss: 1.7020
> Epoch 3/5, Loss: 1.6344
> Epoch 4/5, Loss: 1.5916
> Epoch 5/5, Loss: 1.5447


Unsupported operator aten::mean encountered 21 time(s)
Unsupported operator aten::sub encountered 8 time(s)
Unsupported operator aten::pow encountered 4 time(s)
Unsupported operator aten::add encountered 56 time(s)
Unsupported operator aten::sqrt encountered 4 time(s)
Unsupported operator aten::div encountered 16 time(s)
Unsupported operator aten::mul encountered 28 time(s)
Unsupported operator aten::gelu encountered 12 time(s)
Unsupported operator aten::linalg_vector_norm encountered 12 time(s)
The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.
stages.0.1.drop_path, stages.1.0.drop_path, stages.1.1.drop_path, stages.2.0.drop_path, stages.2.1.drop_path, stages.2.2.drop_path, stages.2.3.drop_path, stages.2.4.drop_path, s




Unsupported operator aten::mean encountered 21 time(s)
Unsupported operator aten::sub encountered 8 time(s)
Unsupported operator aten::pow encountered 4 time(s)
Unsupported operator aten::add encountered 56 time(s)
Unsupported operator aten::sqrt encountered 4 time(s)
Unsupported operator aten::div encountered 16 time(s)
Unsupported operator aten::mul encountered 28 time(s)
Unsupported operator aten::gelu encountered 12 time(s)
Unsupported operator aten::linalg_vector_norm encountered 12 time(s)
The following submodules of the model were never called during the trace of the graph. They may be unused, or they were accessed by direct calls to .forward() or via other python methods. In the latter case they will have zeros for statistics, though their statistics will still contribute to their parent calling module.
stages.0.1.drop_path, stages.1.0.drop_path, stages.1.1.drop_path, stages.2.0.drop_path, stages.2.1.drop_path, stages.2.2.drop_path, stages.2.3.drop_path, stages.2.4.drop_path, s


Summary:

╒═══════════════════════╤════════════════╤═══════════════════════╤══════════════╤═════════════╕
│ Model                 │   Accuracy (%) │   Training Time (min) │   Params (M) │   FLOPs (G) │
╞═══════════════════════╪════════════════╪═══════════════════════╪══════════════╪═════════════╡
│ ConvNeXtV2 (Baseline) │          47.46 │                  1.16 │      8554506 │        0.03 │
├───────────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ ConvNeXtV2 + SCConv   │          47.2  │                  0.91 │      5543690 │        0.02 │
╘═══════════════════════╧════════════════╧═══════════════════════╧══════════════╧═════════════╛

Comparative Analysis:
Accuracy Gain:      -0.26%
Time Increase:      -14.66 sec (0.79× slower)


In [29]:
if __name__ == '__main__':
    from tabulate import tabulate

    from models.convnextv2 import convnextv2_atto, convnextv2_femto, convnext_pico, convnextv2_nano, convnextv2_tiny, convnextv2_base, convnextv2_large

    model_variants = {
        "atto": convnextv2_atto,
        "femto":convnextv2_femto,
        "pico": convnext_pico,
        "nano": convnextv2_nano,
        "tiny": convnextv2_tiny,
        "base": convnextv2_base,
        "large":convnextv2_large
    }

    results = []

    def stats(model):
        model.eval()
        if hasattr(model, "_orig_mod"):  # for torch.compile() models
            model = model._orig_mod
        dummy = torch.randn(1, 3, 32, 32).to(next(model.parameters()).device)
        with torch.no_grad():
            flops = FlopCountAnalysis(model.cpu(), dummy.cpu())
            params = parameter_count(model.cpu())
        return round(params[""], 2), round(flops.total() / 1e9, 2)

    for name, model_fn in model_variants.items():
        print(f"\nEvaluating ConvNeXtV2-{name} (Baseline)...")
        start_base = time.time()
        base_model = model_fn(in_chans=3, num_classes=10, use_scconv=False)
        base_acc, _ = train_and_evaluate_model(base_model)
        end_base = time.time()

        print(f"\nEvaluating ConvNeXtV2-{name} + SCConv...")
        start_sc = time.time()
        sc_model = model_fn(in_chans=3, num_classes=10, use_scconv=True)
        sc_acc, _ = train_and_evaluate_model(sc_model)
        end_sc = time.time()

        base_time = end_base - start_base
        sc_time = end_sc - start_sc

        base_params, base_flops = stats(base_model)
        sc_params, sc_flops     = stats(sc_model)

        results.append([
            f"{name} (Baseline)", f"{base_acc:.2f}", f"{base_time/60:.2f}", base_params, base_flops
        ])
        results.append([
            f"{name} + SCConv", f"{sc_acc:.2f}", f"{sc_time/60:.2f}", sc_params, sc_flops
        ])

    headers = ["Model", "Accuracy (%)", "Training Time (min)", "Params (M)", "FLOPs (G)"]
    print("\nSummary:\n")
    print(tabulate(results, headers=headers, tablefmt="fancy_grid"))



Evaluating ConvNeXtV2-atto (Baseline)...

Evaluating ConvNeXtV2-atto + SCConv...





Evaluating ConvNeXtV2-femto (Baseline)...

Evaluating ConvNeXtV2-femto + SCConv...


W0512 15:16:01.330000 1003 torch/_dynamo/convert_frame.py:906] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0512 15:16:01.330000 1003 torch/_dynamo/convert_frame.py:906] [0/8]    function: 'forward' (/content/drive/MyDrive/ann_project/src/ConvNeXt-V2/models/convnextv2.py:547)
W0512 15:16:01.330000 1003 torch/_dynamo/convert_frame.py:906] [0/8]    last reason: 0/0: GLOBAL_STATE changed: grad_mode 
W0512 15:16:01.330000 1003 torch/_dynamo/convert_frame.py:906] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0512 15:16:01.330000 1003 torch/_dynamo/convert_frame.py:906] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.



Evaluating ConvNeXtV2-pico (Baseline)...

Evaluating ConvNeXtV2-pico + SCConv...





Evaluating ConvNeXtV2-nano (Baseline)...

Evaluating ConvNeXtV2-nano + SCConv...





Evaluating ConvNeXtV2-tiny (Baseline)...

Evaluating ConvNeXtV2-tiny + SCConv...





Evaluating ConvNeXtV2-base (Baseline)...

Evaluating ConvNeXtV2-base + SCConv...





Evaluating ConvNeXtV2-large (Baseline)...

Evaluating ConvNeXtV2-large + SCConv...





Summary:

╒══════════════════╤════════════════╤═══════════════════════╤══════════════╤═════════════╕
│ Model            │   Accuracy (%) │   Training Time (min) │   Params (M) │   FLOPs (G) │
╞══════════════════╪════════════════╪═══════════════════════╪══════════════╪═════════════╡
│ atto (Baseline)  │          47.58 │                  2.05 │      3388170 │        0.01 │
├──────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ atto + SCConv    │          45.65 │                  2.16 │      2180330 │        0.01 │
├──────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ femto (Baseline) │          47.13 │                  2.61 │      4849162 │        0.02 │
├──────────────────┼────────────────┼───────────────────────┼──────────────┼─────────────┤
│ femto + SCConv   │          46.9  │                  3.37 │      3130186 │        0.01 │
├──────────────────┼────────────────┼───────────────────────┼──────────────┼───