## HW2 Probelm3

In [1]:
import time
import torch
import torchvision
import torchvision.transforms as T
from torch import nn, optim
from torch.nn import CrossEntropyLoss
from torchvision.models import resnet18

In [2]:
#ResNet from https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py

'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion *
                               planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])

In [3]:
!mkdir -p models


In [4]:
#config
gpu_counts = [1]
batch_sizes = [16, 32, 128, 512]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
transform = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
train_ds = torchvision.datasets.CIFAR10('./data', train=True, download=True, transform=transform)

results = {}

In [6]:
for gpus in gpu_counts:
    print(f"\n=== Running on {gpus} GPU(s) ===")
    for bs in batch_sizes:
        try:
            train_loader = torch.utils.data.DataLoader(
                train_ds, batch_size=bs, shuffle=True, num_workers=2
            )
            net = ResNet18().to(device)
            if gpus > 1:
                net = nn.DataParallel(net)  # will leverage available GPUs
            optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            criterion = nn.CrossEntropyLoss()

            # Warm-up epoch
            net.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()

            # Timed epoch
            torch.cuda.synchronize()
            start = time.perf_counter()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()
            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start

            results[(gpus, bs)] = elapsed
            print(f"Batch={bs:<4} → {elapsed:6.2f} sec")
        except RuntimeError as e:
            results[(gpus, bs)] = None
            print(f"Batch={bs:<4} → OOM or Error: {str(e).splitlines()[0]}")

# Display summary table
print("\nSummary Results:")
print(f"{'GPUs':>4} | {'Batch':>5} | {'Time (s)':>8}")
print("-" * 26)
for (gpus, bs), t in results.items():
    time_str = f"{t:.2f}" if t is not None else "OOM"
    print(f"{gpus:>4} | {bs:>5} | {time_str:>8}")



=== Running on 1 GPU(s) ===
Batch=16   →  34.83 sec
Batch=32   →  20.10 sec
Batch=128  →  17.70 sec
Batch=512  →  19.91 sec

Summary Results:
GPUs | Batch | Time (s)
--------------------------
   1 |    16 |    34.83
   1 |    32 |    20.10
   1 |   128 |    17.70
   1 |   512 |    19.91


## 2 GPU

In [2]:
import torch
import torchvision
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim
import time
from models.resnet import ResNet18

In [3]:
#ResNet from https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py

'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion *
                               planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])

In [4]:
# Configuration
gpu_counts = [2]
batch_sizes = [16, 32, 128, 512]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare dataset once
transform = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
train_ds = torchvision.datasets.CIFAR10('./data', train=True, download=True, transform=transform)

results = {}

In [5]:
for gpus in gpu_counts:
    print(f"\n=== Running on {gpus} GPU(s) ===")
    for bs in batch_sizes:
        try:
            train_loader = torch.utils.data.DataLoader(
                train_ds, batch_size=bs, shuffle=True, num_workers=2
            )
            net = ResNet18().to(device)
            if gpus > 1:
                net = nn.DataParallel(net)  # will leverage available GPUs
            optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            criterion = nn.CrossEntropyLoss()

            # Warm-up epoch
            net.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()

            # Timed epoch
            torch.cuda.synchronize()
            start = time.perf_counter()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()
            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start

            results[(gpus, bs)] = elapsed
            print(f"Batch={bs:<4} → {elapsed:6.2f} sec")
        except RuntimeError as e:
            results[(gpus, bs)] = None
            print(f"Batch={bs:<4} → OOM or Error: {str(e).splitlines()[0]}")


=== Running on 2 GPU(s) ===
Batch=16   → 171.60 sec
Batch=32   →  87.07 sec
Batch=128  →  22.66 sec
Batch=512  →  10.12 sec


In [6]:
# Display summary table
print("\nSummary Results:")
print(f"{'GPUs':>4} | {'Batch':>5} | {'Time (s)':>8}")
print("-" * 26)
for (gpus, bs), t in results.items():
    time_str = f"{t:.2f}" if t is not None else "OOM"
    print(f"{gpus:>4} | {bs:>5} | {time_str:>8}")


Summary Results:
GPUs | Batch | Time (s)
--------------------------
   2 |    16 |   171.60
   2 |    32 |    87.07
   2 |   128 |    22.66
   2 |   512 |    10.12


## 4GPU

In [7]:
import torch
import torchvision
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim
import time

In [8]:
#ResNet from https://github.com/kuangliu/pytorch-cifar/blob/master/models/resnet.py

'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion *
                               planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])

# Configuration
gpu_counts = [4]
batch_sizes = [16, 32, 128, 512]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare dataset once
transform = T.Compose([
    T.RandomCrop(32, padding=4),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
train_ds = torchvision.datasets.CIFAR10('./data', train=True, download=True, transform=transform)

results = {}

for gpus in gpu_counts:
    print(f"\n=== Running on {gpus} GPU(s) ===")
    for bs in batch_sizes:
        try:
            train_loader = torch.utils.data.DataLoader(
                train_ds, batch_size=bs, shuffle=True, num_workers=2
            )
            net = ResNet18().to(device)
            if gpus > 1:
                net = nn.DataParallel(net)  # will leverage available GPUs
            optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            criterion = nn.CrossEntropyLoss()

            # Warm-up epoch
            net.train()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()

            # Timed epoch
            torch.cuda.synchronize()
            start = time.perf_counter()
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                criterion(net(xb), yb).backward()
                optimizer.step()
            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start

            results[(gpus, bs)] = elapsed
            print(f"Batch={bs:<4} → {elapsed:6.2f} sec")
        except RuntimeError as e:
            results[(gpus, bs)] = None
            print(f"Batch={bs:<4} → OOM or Error: {str(e).splitlines()[0]}")

# Display summary table
print("\nSummary Results:")
print(f"{'GPUs':>4} | {'Batch':>5} | {'Time (s)':>8}")
print("-" * 26)
for (gpus, bs), t in results.items():
    time_str = f"{t:.2f}" if t is not None else "OOM"
    print(f"{gpus:>4} | {bs:>5} | {time_str:>8}")


=== Running on 4 GPU(s) ===
Batch=16   → 172.49 sec
Batch=32   →  91.34 sec
Batch=128  →  22.02 sec
Batch=512  →  10.13 sec

Summary Results:
GPUs | Batch | Time (s)
--------------------------
   4 |    16 |   172.49
   4 |    32 |    91.34
   4 |   128 |    22.02
   4 |   512 |    10.13


## Summary


Table1:
| Batch | T₁ (s) | T₂ (s) | S₂ = T₁/T₂ | T₄ (s) | S₄ = T₁/T₄ |
| ----- | ------ | ------ | ---------- | ------ | ---------- |
| 16    | 34.83  | 171.60 | 0.20       | 172.49 | 0.20       |
| 32    | 20.10  | 87.07  | 0.23       | 91.34  | 0.22       |
| 128   | 17.70  | 22.66  | 0.78       | 22.02  | 0.80       |
| 512   | 19.91  | 10.12  | 1.97       | 10.13  | 1.97       |


table2:
|       | Batch-size 16 per GPU | Batch-size 32 per GPU | Batch-size 128 per GPU | Batch-size 512 per GPU |         |      |         |             |
| ----- | --------------------- | --------------------- | ---------------------- | ---------------------- | ------- | ---- | ------- | ----------- |
|       | Compute               | Comm                  | Compute                | Comm                   | Compute | Comm | Compute | Comm        |
| 2-GPU | 34.83                 | 136.77                | 20.10                  | 66.97                  | 17.70   | 4.96 | 19.91   | 0 (clipped) |
| 4-GPU | 34.83                 | 137.66                | 20.10                  | 71.24                  | 17.70   | 4.32 | 19.91   | 0 (clipped) |




table 3:
Bandwidth Utilization (GB/s)= 
Communication time (s) /
Data size to communicate (GB)
​


In [10]:
# Count total model parameters (float32, 4 bytes each)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

num_params = count_parameters(ResNet18())
data_per_sync_gb = num_params * 4 / 1e9  # GB per sync
print(f"ResNet-18 parameters: {num_params} ({data_per_sync_gb:.4f} GB per sync)")


ResNet-18 parameters: 11173962 (0.0447 GB per sync)


In [11]:
import math

# Number of training samples in CIFAR-10
num_train_samples = len(train_ds)

print("\n=== Bandwidth Utilization Summary ===")
print(f"{'GPUs':>4} | {'Batch':>5} | {'Comm(s)':>8} | {'BW(GB/s)':>9}")
print("-" * 36)

for bs in batch_sizes:
    t1 = results.get((1, bs))
    num_batches = math.ceil(num_train_samples / bs)
    total_data_gb = data_per_sync_gb * num_batches

    for gpus in gpu_counts:
        if gpus == 1:
            continue  # no communication for single GPU
        tN = results.get((gpus, bs))
        if t1 is None or tN is None:
            comm_time = bw = None
        else:
            comm_time = max(0, tN - t1)  # max to avoid negative
            bw = total_data_gb / comm_time if comm_time > 0 else None
        comm_str = f"{comm_time:.2f}" if comm_time is not None else "N/A"
        bw_str = f"{bw:.3f}" if bw is not None else "N/A"
        print(f"{gpus:>4} | {bs:>5} | {comm_str:>8} | {bw_str:>9}")



=== Bandwidth Utilization Summary ===
GPUs | Batch |  Comm(s) |  BW(GB/s)
------------------------------------
   4 |    16 |      N/A |       N/A
   4 |    32 |      N/A |       N/A
   4 |   128 |      N/A |       N/A
   4 |   512 |      N/A |       N/A
