In [None]:
import torch
import time

import torch.nn as nn
import torch.optim as optim

print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())  

2.5.1+cu124
12.4
90100


# Hardware Info

In [4]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())
print(torch.cuda.empty_cache())

True
1
NVIDIA GeForce RTX 3050 Laptop GPU
cuda
0
0
None


# Change Variable's Device

In [5]:
x = torch.rand(3,3)
print(x)
print(x.device)

tensor([[0.9805, 0.9989, 0.7576],
        [0.2057, 0.5865, 0.5063],
        [0.3485, 0.1318, 0.9407]])
cpu


In [6]:
x = x.to(device)
print(x.device)

cuda:0


# Speed Of CPU and GPU

In [3]:
x_cpu = torch.rand(10000,10000)
x_gpu =  x_cpu.to('cuda:0')

In [4]:
start = time.time()
y_cpu = x_cpu @ x_cpu
end = time.time()
print(f"CPU Time: {end - start:.4f} seconds")

CPU Time: 4.7554 seconds


In [None]:
torch.cuda.synchronize()
start = time.time()
y_gpu = x_gpu @ x_gpu
end = time.time()
print(f"GPU Time: {end - start:.4f} seconds")

GPU Time: 0.1288 seconds


# Activate Benchmark of cuDNN (Accelerating training)

In [6]:
torch.backends.cudnn.benchmark = True

# Turn On/Off Mixed Precision Training (Save VRAM and Accelerate)

| Component | Purpose |
|:---|:---|
| `autocast` | Automatically selects float16/float32 to accelerate training and reduce VRAM usage. |
| `GradScaler` | Scales the loss to prevent numerical underflow when using float16 during training. |

**NOTE**: See more in [AI/Utils/Mixed-Precision-Training](AI/Utils/Mixed-Precision-Training)

In [19]:
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 5)
)
model = model.cuda() # -> Move model to GPU

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

input = torch.randn(32,784).cuda()
target = torch.randint(0, 5, (32,)).cuda()

In [20]:
# Without Mixed Precision Training

start_time = time.time()

for epoch in range(5):
    optimizer.zero_grad()

    output = model(input) # Forward pass
    loss = loss_fn(output, target)

    loss.backward() # Backward pass
    optimizer.step()

    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

end_time = time.time()
print(f"Total time:  {end_time - start_time:.4f} seconds")

Epoch 1: Loss = 1.5728
Epoch 2: Loss = 1.0307
Epoch 3: Loss = 0.6581
Epoch 4: Loss = 0.4125
Epoch 5: Loss = 0.2567
Total time:  0.0367 seconds


In [21]:
# With Mixed Precision Training

start_time = time.time()
scaler = torch.amp.GradScaler()

for epoch in range(5):
    optimizer.zero_grad()

    with torch.amp.autocast(device_type="cuda"):
        output = model(input) # Forward pass
        loss = loss_fn(output, target)

    scaler.scale(loss).backward() # Backward pass
    scaler.step(optimizer)
    scaler.update()

    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

end_time = time.time()
print(f"Total time:  {end_time - start_time:.4f} seconds")

Epoch 1: Loss = 0.1596
Epoch 2: Loss = 0.0995
Epoch 3: Loss = 0.0627
Epoch 4: Loss = 0.0401
Epoch 5: Loss = 0.0262
Total time:  0.0079 seconds
