Question 1

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import time

from torchvision.models import resnet18

In [None]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Number of GPUs =", torch.cuda.device_count())

# Training-time augmentation (random crop with padding, random horizontal
# flip), then tensor conversion and normalisation with fixed mean/std values.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010),
        ),
    ]
)

# CIFAR-10 training split, downloaded into ./data on first use.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)

Number of GPUs = 4


In [None]:
def train(trainloader, net, criterion, optimizer, device=None):
    """Run one full pass over ``trainloader``, updating ``net`` in place.

    Args:
        trainloader: Iterable yielding ``(inputs, targets)`` batches.
        net: Model to optimise; it is switched to training mode first.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch is moved to before the forward pass.
            When omitted, the device of ``net``'s first parameter is used
            (CPU if the model has no parameters), so batches always land
            where the model lives instead of depending on a module-level
            ``device`` variable.

    Returns:
        None. The model and optimizer state are modified in place.
    """
    if device is None:
        first_param = next(net.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cpu")

    net.train()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()

In [None]:
def measure_training_time(batch_size):
    """Time one training epoch of a fresh ResNet-18 at the given batch size.

    A new DataLoader, model, loss and SGD optimizer are built, one untimed
    warm-up epoch is run, and a second epoch is then timed.

    Args:
        batch_size: Mini-batch size requested from the DataLoader.

    Returns:
        Elapsed seconds for the timed epoch, or ``None`` if the run failed
        with an out-of-memory error.

    Raises:
        RuntimeError: Any runtime error that is not an out-of-memory error.
    """
    # Only synchronise when actually running on CUDA; the module-level
    # ``device`` may be the CPU fallback, where torch.cuda.synchronize() fails.
    on_cuda = device.type == "cuda"

    try:
        trainloader = torch.utils.data.DataLoader(
            trainset, batch_size=batch_size, shuffle=True, num_workers=2
        )
        net = resnet18(num_classes=10).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

        # Warm-up epoch (not timed).
        train(trainloader, net, criterion, optimizer)

        # CUDA work is queued asynchronously, so wait for pending kernels
        # before starting and before stopping the clock.
        if on_cuda:
            torch.cuda.synchronize()
        # perf_counter is monotonic, unlike time.time(), so the interval
        # cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        train(trainloader, net, criterion, optimizer)
        if on_cuda:
            torch.cuda.synchronize()
        training_time = time.perf_counter() - start_time
    except RuntimeError as e:
        if 'out of memory' not in str(e).lower():
            raise
        print(f"Out of memory for batch size: {batch_size}")
        return None

    print(f"Batch Size: {batch_size}, Training Time: {training_time:.2f} seconds")
    return training_time

In [None]:
batch_size = 32
times = []

# A DataLoader never yields more samples per batch than the dataset holds, so
# any request above len(trainset) repeats the same single-batch epoch. Without
# this bound the sweep never reaches an out-of-memory error and never stops.
max_batch_size = len(trainset)

while True:
    # Clamp so the final measurement is the whole dataset as one batch.
    effective_batch_size = min(batch_size, max_batch_size)
    training_time = measure_training_time(effective_batch_size)
    if training_time is None:
        # Out of memory: larger batches cannot fit either.
        break
    times.append((effective_batch_size, training_time))
    if effective_batch_size == max_batch_size:
        # Largest meaningful batch size has been measured.
        break
    batch_size *= 4

Batch Size: 32, Training Time: 15.51 seconds
Batch Size: 128, Training Time: 5.99 seconds
Batch Size: 512, Training Time: 5.82 seconds
Batch Size: 2048, Training Time: 5.81 seconds
Batch Size: 8192, Training Time: 6.19 seconds
Batch Size: 32768, Training Time: 9.00 seconds
Batch Size: 131072, Training Time: 13.14 seconds
Batch Size: 524288, Training Time: 13.02 seconds
Batch Size: 2097152, Training Time: 12.90 seconds
Batch Size: 8388608, Training Time: 12.97 seconds
Batch Size: 33554432, Training Time: 13.03 seconds
Batch Size: 134217728, Training Time: 12.90 seconds
Batch Size: 536870912, Training Time: 13.16 seconds
Batch Size: 2147483648, Training Time: 13.01 seconds
Batch Size: 8589934592, Training Time: 13.21 seconds
Batch Size: 34359738368, Training Time: 13.09 seconds
Batch Size: 137438953472, Training Time: 13.16 seconds
Batch Size: 549755813888, Training Time: 13.19 seconds
Batch Size: 2199023255552, Training Time: 13.20 seconds
Batch Size: 8796093022208, Training Time: 13.17

KeyboardInterrupt: 

To evaluate how training time varies with batch size on a single GPU, I measured the time it takes to complete one timed epoch for progressively increasing batch sizes. I started with a batch size of 32 and increased it by a factor of 4 in each iteration (32, 128, 512, etc.) until the GPU could no longer handle the memory requirements. Each run included two epochs: the first to warm up the cache, and the second for timing. The timing reflects the full training process for one epoch, including batch loading through the DataLoader, data movement to the GPU, gradient computation, and weight updates (but excluding the one-time dataset download and DataLoader construction, which happen before the timer starts).

In my experiment, the code was executed on a machine with 4 GPUs, but I explicitly configured it to use only a single GPU (cuda:0) to adhere to the instructions. Memory was never exceeded, even for extremely large requested batch sizes: once the requested batch size is larger than the training set, the DataLoader simply returns the whole dataset as a single batch, so the effective batch size (and with it the memory footprint and the epoch time) stops changing, which is why the timings from 131072 onward are essentially identical. Since the loop therefore showed no sign of terminating at those sizes, I manually interrupted it once the timing values stabilized and were no longer providing additional insight. Training time decreased initially (from batch sizes 32 to 512), then plateaued around sizes like 2048 and 8192, and eventually began increasing again at much larger sizes. This suggests that very small batch sizes underutilize the GPU, while extremely large ones hit diminishing returns and may slow things down. The optimal performance in my case appeared to occur around batch sizes 512 to 2048, where training was fast without incurring additional memory or compute overhead.

Question 2

In [None]:
from torch.nn import DataParallel

In [None]:
def train_one_epoch(trainloader, net, criterion, optimizer, device=None):
    """Train ``net`` for a single epoch over ``trainloader``.

    Args:
        trainloader: Iterable yielding ``(inputs, targets)`` batches.
        net: Model to train (plain module or a ``DataParallel`` wrapper);
            it is put into training mode before the first batch.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.
            Defaults to the device of ``net``'s first parameter (CPU if the
            model has no parameters), which removes the dependency on a
            module-level ``device`` variable.

    Returns:
        None. The model and optimizer state are modified in place.
    """
    if device is None:
        first_param = next(net.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cpu")

    net.train()
    for inputs, targets in trainloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()

In [None]:
def measure_training_time(batch_size_per_gpu, num_gpus, num_workers=2):
    """Time one training epoch of ResNet-18 using ``num_gpus`` GPUs.

    The global batch size is ``batch_size_per_gpu * num_gpus``. With more
    than one GPU the model is wrapped in ``DataParallel`` over devices
    ``0 .. num_gpus - 1``. One untimed warm-up epoch precedes the timed one.

    Args:
        batch_size_per_gpu: Number of samples each GPU should process per step.
        num_gpus: Number of GPUs to use (1 means no ``DataParallel`` wrapper).
        num_workers: DataLoader worker processes. Defaults to 2, the value
            used by the single-GPU benchmark, so the two experiments load
            data the same way and their timings are comparable.

    Returns:
        Elapsed seconds for the timed epoch.

    Raises:
        ValueError: If ``num_gpus`` is less than 1, or more than one GPU is
            requested but fewer CUDA devices are available.
    """
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be at least 1, got {num_gpus}")
    if num_gpus > 1 and num_gpus > torch.cuda.device_count():
        raise ValueError(
            f"Requested {num_gpus} GPUs but only "
            f"{torch.cuda.device_count()} CUDA device(s) are available"
        )

    total_batch_size = batch_size_per_gpu * num_gpus
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=total_batch_size,
                                              shuffle=True, num_workers=num_workers)

    # Use the module-level ``device`` read-only instead of rebinding it with
    # ``global``: the training loop reads the same variable, and no other
    # code is surprised by this function changing it.
    net = resnet18(num_classes=10).to(device)

    if num_gpus > 1:
        net = DataParallel(net, device_ids=list(range(num_gpus)))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

    # Synchronise only on CUDA; on the CPU fallback the call would fail.
    on_cuda = device.type == "cuda"

    # Warm-up epoch (not timed).
    train_one_epoch(trainloader, net, criterion, optimizer)

    # CUDA work is queued asynchronously, so wait for pending kernels before
    # starting and before stopping the (monotonic) clock.
    if on_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    train_one_epoch(trainloader, net, criterion, optimizer)
    if on_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    return end_time - start_time

In [None]:
batch_sizes = [32, 128, 512]
gpu_counts = [1, 2, 4]

# results[per-GPU batch size][GPU count] -> seconds for one timed epoch
results = {}
for bs in batch_sizes:
    # Alias the inner dict so each measurement is stored with one lookup.
    per_gpu_times = results[bs] = {}
    for gpus in gpu_counts:
        time_taken = measure_training_time(bs, gpus)
        per_gpu_times[gpus] = time_taken
        print(f"Batch Size per GPU: {bs}, GPUs: {gpus}, Time: {time_taken:.2f} seconds")

Batch Size per GPU: 32, GPUs: 1, Time: 25.86 seconds
Batch Size per GPU: 32, GPUs: 2, Time: 36.93 seconds
Batch Size per GPU: 32, GPUs: 4, Time: 27.25 seconds
Batch Size per GPU: 128, GPUs: 1, Time: 14.73 seconds
Batch Size per GPU: 128, GPUs: 2, Time: 17.59 seconds
Batch Size per GPU: 128, GPUs: 4, Time: 14.82 seconds
Batch Size per GPU: 512, GPUs: 1, Time: 11.43 seconds
Batch Size per GPU: 512, GPUs: 2, Time: 12.41 seconds
Batch Size per GPU: 512, GPUs: 4, Time: 11.78 seconds


Below is a table that records the training time and speedup for each per-GPU batch size on 1, 2, and 4 GPUs:

<table>
  <thead>
    <tr>
      <th rowspan="2"> </th>
      <th colspan="2">Batch-size 32 per GPU</th>
      <th colspan="2">Batch-size 128 per GPU</th>
      <th colspan="2">Batch-size 512 per GPU</th>
    </tr>
    <tr>
      <th>Time (sec)</th>
      <th>Speedup</th>
      <th>Time (sec)</th>
      <th>Speedup</th>
      <th>Time (sec)</th>
      <th>Speedup</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1-GPU</td>
      <td>25.86</td>
      <td>1.00</td>
      <td>14.73</td>
      <td>1.00</td>
      <td>11.43</td>
      <td>1.00</td>
    </tr>
    <tr>
      <td>2-GPU</td>
      <td>36.93</td>
      <td>1.40</td>
      <td>17.59</td>
      <td>1.67</td>
      <td>12.41</td>
      <td>1.84</td>
    </tr>
    <tr>
      <td>4-GPU</td>
      <td>27.25</td>
      <td>3.80</td>
      <td>14.82</td>
      <td>3.98</td>
      <td>11.78</td>
      <td>3.88</td>
    </tr>
  </tbody>
</table>


Speedup calculation:

For 2 GPUs: Speedup = (2 * Time for 1 GPU) / (Time for 2 GPUs)
For 4 GPUs: Speedup = (4 * Time for 1 GPU) / (Time for 4 GPUs)

This experiment is an example of weak scaling, since we kept the batch size per GPU constant (e.g., 32, 128, etc.) while increasing the number of GPUs. As a result, the total batch size grew with the number of GPUs, and each GPU processed the same amount of data. This setup allows us to observe how well the system handles increased workloads when more GPUs are added.

If we had used strong scaling instead, where the total batch size stays constant regardless of the number of GPUs, each GPU would have processed less data as more were added. In that case, we would expect to see even faster training times due to the reduced workload per GPU. However, strong scaling typically runs into diminishing returns, especially when communication overhead between GPUs starts to outweigh the benefits of parallelism.

### Question 3

<table>
  <thead>
    <tr>
      <th rowspan="2"> </th>
      <th colspan="2">Batch-size 32 per GPU</th>
      <th colspan="2">Batch-size 128 per GPU</th>
      <th colspan="2">Batch-size 512 per GPU</th>
    </tr>
    <tr>
      <th>Compute (sec)</th>
      <th>Comm (sec)</th>
      <th>Compute (sec)</th>
      <th>Comm (sec)</th>
      <th>Compute (sec)</th>
      <th>Comm (sec)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>2-GPU</td>
      <td>12.93</td>
      <td>24.00</td>
      <td>7.37</td>
      <td>10.22</td>
      <td>5.72</td>
      <td>6.69</td>
    </tr>
    <tr>
      <td>4-GPU</td>
      <td>6.47</td>
      <td>20.78</td>
      <td>3.68</td>
      <td>11.14</td>
      <td>2.86</td>
      <td>8.92</td>
    </tr>
  </tbody>
</table>


Since we have training times for the 1-GPU, 2-GPU, and 4-GPU configurations, we can estimate the compute and communication times for the multi-GPU setups as follows: \\

* We approximate the compute time for the multi-GPU runs by dividing the 1-GPU training time by the number of GPUs used (i.e., by 2 for 2-GPU and by 4 for 4-GPU setups), assuming ideal scaling with no communication overhead. This is based on the idea that a single GPU does not incur any inter-device communication.

* The communication time is then estimated as the difference between the total training time of the multi-GPU setup and its corresponding estimated compute time (i.e., 1-GPU time ÷ number of GPUs).


<br>

**2-GPU Calculations:**

* **Batch size 32:** Compute time = 1-GPU time for batch size 32 divided by 2 = 25.86 / 2 = 12.93 sec. Communication time would be 36.93 (total time for 2-GPU) - 12.93 = 24.00 sec. \\


* **Batch size 128:** Compute time is 7.37 seconds. Communication time would be 17.59 - 7.37 = 10.22 sec. \\

* **Batch size 512:** Compute time is 5.72 seconds. Communication time would be 12.41 - 5.72 = 6.69 sec. \\

**4-GPU Calculations:**

* **Batch size 32:** Compute time = 1-GPU time for batch size 32 divided by 4 = 25.86 / 4 = 6.47 sec. Communication time would be 27.25 (total time for 4-GPU) - 6.47 = 20.78 sec. \\

* **Batch size 128:** Compute time is 3.68 sec. Communication time would be 14.82 - 3.68 = 11.14 sec. \\

* **Batch size 512:** Compute time is 2.86 sec. Communication time would be 11.78 - 2.86 = 8.92 sec. \\

### Question 4

The formula for the allreduce communication cost is: $2(N-1)(\frac{K}{N})$,

where K is the number of model parameters, and N is the number of GPUs.

<br>

The formula for bandwidth utilization is: Allreduce cost / Communication time

<br>

First, we calculate the Allreduce cost for when the number of GPUs is equal to 2 and 4: \\

We know that the number of parameters in ResNet18 is K = 11,689,512 \\

* For 2 GPUs: Allreduce cost = $2(2-1)\frac{11,689,512}{2} = 11,689,512$

* For 4 GPUs: Allreduce cost = $2(4-1)\frac{11,689,512}{4} = 17,534,268$ \\

Since we want our final results to be in units of GB/s, we convert the above values to GB (using 4 bytes per parameter): \\

* For 2 GPUs: Allreduce cost = $\frac{11,689,512 * 4 bytes}{2^{30}} = 0.044 GB$

* For 4 GPUs: Allreduce cost = $\frac{17,534,268 * 4 bytes}{2^{30}} = 0.065 GB$

*1 GB = $2^{30}$ bytes*

<br>

To get the bandwidth utilization, we divide the above allreduce costs by the corresponding communication time from Question 3 for each batch size: \\



<table>
<thead>
<tr>
<th></th>
<th>Batch-size-per-GPU 32<br><small>Bandwidth Utilization (GB/s)</small></th>
<th>Batch-size-per-GPU 128<br><small>Bandwidth Utilization (GB/s)</small></th>
<th>Batch-size-per-GPU 512<br><small>Bandwidth Utilization (GB/s)</small></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>2-GPU</strong></td>
<td>0.0018</td>
<td>0.0043</td>
<td>0.0066</td>
</tr>
<tr>
<td><strong>4-GPU</strong></td>
<td>0.0031</td>
<td>0.0058</td>
<td>0.0073</td>
</tr>
</tbody>
</table>
