---
# **LAB 9 - NN in CUDA Pytorch**
---

# ▶️ CUDA setup

In [None]:
# Print the version of the CUDA compiler (nvcc) found on this machine
!nvcc --version

In [None]:
# Query the NVIDIA driver for the GPUs visible to this runtime
!nvidia-smi

In [None]:
!pip install numba-cuda==0.4.0

In [None]:
from numba import config
# Switch on numba's CUDA_ENABLE_PYNVJITLINK option.
# NOTE(review): presumably this has to be set before the first CUDA kernel is compiled — confirm.
config.CUDA_ENABLE_PYNVJITLINK = 1

# 🐍 Pytorch basic

In [None]:
import torch
# The last expression of a cell is echoed: shows the installed PyTorch version
torch.__version__

In [None]:
# 2x3 tensor of random samples (values change on every run: no seed is set)
x = torch.rand(2, 3)
print(x)
# No dtype was requested, so this shows PyTorch's default floating-point dtype
print(x.dtype)

In [None]:
# 5x3 tensor of ones stored as 64-bit floats (torch.double)
x = torch.ones((5, 3), dtype=torch.double)
print(x)


In [None]:
# Vector: a 1-D tensor built from a Python list of two integers
vector = torch.tensor([7, 7])
vector

In [None]:
# Matrix: a 2-D tensor built from a nested list (2 rows, 2 columns)
matrix = torch.tensor([[7, 8],
                      [9, 10]])
matrix

In [None]:
# Lay out the integers 0..9 row by row in a 2x5 grid, then show the shape
x = torch.arange(10).view(2, 5)
x.shape

torch.Size([2, 5])

In [None]:
# Three 2-D points held in a 3x2 float tensor
points = torch.tensor([[4.0, 1.0], [5.0, 3.0], [2.0, 1.0]])
# Show the storage object that backs the tensor.
# NOTE(review): recent PyTorch releases are reported to emit a deprecation
# warning for Tensor.storage() — confirm against the installed version.
points.storage()

In [None]:
# Random 2x2 matrix rescaled from [0, 1) to [-1, 1)
r = (torch.rand(2, 2) - 0.5) * 2 # values between -1 and 1
print('A random matrix, r:')
print(r)

# Common mathematical operations are supported:
print('\nAbsolute value of r:')
print(torch.abs(r))

# ...as are trigonometric functions (asin is defined on [-1, 1], which r respects):
print('\nInverse sine of r:')
print(torch.asin(r))

# ...and linear algebra operations like determinant and singular value decomposition
print('\nDeterminant of r:')
print(torch.det(r))
print('\nSingular value decomposition of r:')
# torch.svd is deprecated; torch.linalg.svd is its replacement.
# It returns (U, S, Vh): the last factor is already transposed.
print(torch.linalg.svd(r))

# ...and statistical and aggregate operations:
print('\nAverage and standard deviation of r:')
print(torch.std_mean(r))  # returned as the pair (std, mean)
print('\nMaximum value of r:')
print(torch.max(r))


In [None]:
# Broadcasting demo: size-1 dimensions are stretched to match the other operand
a = torch.ones((3, 1))
b = torch.ones((1, 3))
c = torch.ones((2, 1, 1))
print(f"shapes: a: {a.shape}, b: {b.shape}, c: {c.shape}")
d = torch.add(a, b)  # (3, 1) + (1, 3) -> (3, 3)
print("d = a + b:", d.shape)
e = torch.mul(c, d)  # (2, 1, 1) * (3, 3) -> (2, 3, 3)
print("e = c * d:", e.shape)

# 🐍 Pytorch on GPU

In [None]:
import torch
# Check CUDA availability (True when PyTorch can use a CUDA device)
print(torch.cuda.is_available())
# Get the index of the currently selected CUDA device
print(torch.cuda.current_device())
# Get the name of CUDA device 0 (the index is hard-coded, not "the current device")
print(torch.cuda.get_device_name(0))

In [None]:
# Move a tensor to GPU: build it on the CPU first, then copy it with .to('cuda')
x = torch.tensor([1, 2, 3])
x_gpu = x.to('cuda')  # x itself is left untouched; x_gpu is the GPU copy
print(x_gpu)

In [None]:
import torch
# Create the tensor directly on the GPU instead of copying it from the CPU afterwards
x = torch.tensor([[4.0, 1.0], [5.0, 3.0], [2.0, 1.0]], device='cuda')
print(x)

In [None]:
cuda = torch.device('cuda')     # Default CUDA device
cuda0 = torch.device('cuda:0')
print(cuda0)  # device(type='cuda', index=0)
cuda2 = torch.device('cuda:1')  # second GPU (indices are 0-based); this only creates a device handle
print(cuda2)  # device(type='cuda', index=1)

x = torch.tensor([1., 2.], device=cuda0)
print(x.device)  # device(type='cuda', index=0)
# x.device is device(type='cuda', index=0)
y = torch.tensor([1., 2.]).cuda()
print(y.device)  # the current CUDA device: index 0 unless another one was selected
# y.device is the current CUDA device

with torch.cuda.device(0):
	# allocates a tensor on GPU 0 (the device selected by this context manager)
	a = torch.tensor([1., 2.], device=cuda)

	# transfers a tensor from CPU to GPU 0
	b = torch.tensor([1., 2.]).cuda()
	# a.device and b.device are device(type='cuda', index=0)

	# You can also use ``Tensor.to`` to transfer a tensor:
	b2 = torch.tensor([1., 2.]).to(device=cuda)
	# b.device and b2.device are device(type='cuda', index=0)

	c = a + b
	# c.device is device(type='cuda', index=0)

	z = x + y
	# z.device is device(type='cuda', index=0), assuming y was placed on GPU 0 above


In [None]:
# Let's do some matrix multiplication on the CPU first
# (no device is given, so both operands live in host memory; each is 10240 x 10240 float64)
a_full = torch.randn(10240, 10240, dtype=torch.double)
b_full = torch.randn(10240, 10240, dtype=torch.double)

ab_full = a_full @ b_full # runs on the CPU, so GPU (GA100) timings do not apply to this line

* TF32 (TensorFloat-32) is a precision mode that allows for faster matrix multiplications on NVIDIA Ampere and later GPUs.
* It keeps a 10-bit mantissa (19 bits in total: 1 sign, 8 exponent, 10 mantissa) instead of the 23-bit mantissa of FP32, which can lead to faster computations with a small loss in precision.
* This is particularly useful for deep learning workloads where the precision loss is often acceptable.
* The following code demonstrates the use of TF32 in PyTorch for matrix multiplication.
* Ensure that TF32 is enabled for matrix multiplication

In [None]:
# Let's do some matrix multiplication on the GPU
# The FP64 product is the reference against which both FP32 variants are compared.
a_full = torch.randn(10240, 10240, dtype=torch.double, device='cuda')
b_full = torch.randn(10240, 10240, dtype=torch.double, device='cuda')
ab_full = a_full @ b_full
mean = ab_full.abs().mean()  # 80.7277

# Single-precision copies of the same operands
a = a_full.float()
b = b_full.float()

# CUDA events time the work on the GPU stream rather than on the host clock
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
# Do matmul at TF32 mode.
torch.backends.cuda.matmul.allow_tf32 = True
start_event.record()
ab_tf32 = a @ b  # takes 40.432 ms on GA100
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f'TF32 matmul took {elapsed_time_ms:.3f} ms')
error = (ab_tf32 - ab_full).abs().max()  # 0.1747
relative_error = error / mean  # 0.0022
print(f'Max error: {error:.4f}, Relative error: {relative_error:.6f}')

# Do matmul with TF32 disabled.
torch.backends.cuda.matmul.allow_tf32 = False
start_event.record()
ab_fp32 = a @ b  # takes 280.110 ms on GA100
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
# This run has TF32 disabled, so it is reported as FP32 (the label used to say TF32)
print(f'FP32 matmul took {elapsed_time_ms:.3f} ms')
error = (ab_fp32 - ab_full).abs().max()  # 0.0031
relative_error = error / mean  # 0.000039
print(f'Max error: {error:.4f}, Relative error: {relative_error:.6f}')



# 🐍 Linear model

In [None]:
import torch

# Paired measurements: t_c in Celsius, t_u the same temperatures on the unknown scale
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0])
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4])

# parameters of the model, as 0-dimensional (scalar) tensors
w = torch.ones(())
b = torch.zeros(())

# Linear model that predicts Celsius from the readings on the unknown scale
def model(x, w, b):
    """Return the affine prediction ``w * x + b``.

    Args:
        x: input value(s).
        w: weight (slope).
        b: bias (intercept).
    """
    scaled = w * x
    return scaled + b

# Loss function: measures the difference between predictions and targets
def loss_fn(y1, y2):
    """Return the mean of ``0.5 * (y1 - y2) ** 2`` (half the mean squared error)."""
    residual = y1 - y2
    return (0.5 * residual**2).mean()

# Let's compute the predictions and the loss
# (w and b are still the scalar tensors created above)
t_p = model(t_u, w, b)
loss = loss_fn(t_p, t_c)

# initial values
# Rebind w and b as 1-element tensors (shape [1]) instead of 0-dimensional ones
w = torch.ones(1)
b = torch.zeros(1)
print(f"w: {w}, b: {b}")
print(f"shapes: w: {w.shape}, b: {b.shape}")
# Predictions with the untrained parameters: w = 1, b = 0 returns the inputs unchanged
t_p = model(t_u, w, b)
print(t_p)

In [None]:
import math

# Half-width of the interval used by the central finite-difference estimates below
delta = 0.1

# Estimate d(loss)/dw as (loss(w + delta) - loss(w - delta)) / (2 * delta)
loss_rate_of_change_w = \
    (loss_fn(model(t_u, w + delta, b), t_c) -
     loss_fn(model(t_u, w - delta, b), t_c)) / (2.0 * delta)
learning_rate = 1e-2
# Move w against the estimated slope, scaled by the learning rate
w = w - learning_rate * loss_rate_of_change_w

# Same estimate for b; note that it is evaluated with the already-updated w
loss_rate_of_change_b = \
    (loss_fn(model(t_u, w, b + delta), t_c) -
     loss_fn(model(t_u, w, b - delta), t_c)) / (2.0 * delta)
b = b - learning_rate * loss_rate_of_change_b

def dloss_fn(t_p, t_c):
    """Derivative of ``loss_fn`` with respect to each prediction.

    ``loss_fn`` is ``mean(0.5 * (t_p - t_c) ** 2)``: the factor 2 from the
    power rule cancels the 0.5, leaving ``(t_p - t_c) / N`` per element.
    The previous ``2 * (t_p - t_c) / N`` was the derivative of the unscaled
    mean squared error and therefore twice too large for this loss.

    Args:
        t_p: predictions; ``t_p.size(0)`` is used as the sample count N.
        t_c: targets, broadcastable against ``t_p``.

    Returns:
        Tensor with one partial derivative per prediction.
    """
    n_samples = t_p.size(0)  # the mean in loss_fn divides by the number of samples
    return (t_p - t_c) / n_samples

def dmodel_dw(t_u, w, b):
    """Derivative of the linear model ``w * t_u + b`` with respect to ``w``: the input itself."""
    return t_u
def dmodel_db(t_u, w, b):
    """Derivative of the linear model ``w * t_u + b`` with respect to ``b``: the constant 1."""
    return 1.0

def grad_fn(t_u, t_c, t_p, w, b):
    """Chain rule: gradient of the loss with respect to ``(w, b)``.

    Multiplies the per-sample loss derivative by the model derivatives and
    sums over the samples.

    Returns:
        1-D tensor ``[dloss/dw, dloss/db]``.
    """
    upstream = dloss_fn(t_p, t_c)
    grad_w = (upstream * dmodel_dw(t_u, w, b)).sum()
    grad_b = (upstream * dmodel_db(t_u, w, b)).sum()
    # stack the two scalar sums so the result lines up with params = [w, b]
    return torch.stack([grad_w, grad_b])

# Training loop: plain gradient descent using the hand-written gradient
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Run ``n_epochs`` gradient-descent steps on ``params = [w, b]``.

    Args:
        n_epochs: number of update steps.
        learning_rate: step size applied to the gradient.
        params: tensor holding ``[w, b]``; it is rebound, not modified in place.
        t_u: model inputs.
        t_c: targets.

    Returns:
        The parameters after the last update (the input unchanged when
        ``n_epochs`` is below 1).
    """
    # Reporting interval is about sqrt(n_epochs). It depends only on n_epochs,
    # so it is computed once; the guard keeps sqrt away from values that never
    # enter the loop.
    report_every = round(math.sqrt(n_epochs)) if n_epochs > 0 else 1

    for epoch in range(1, n_epochs + 1):
        w, b = params

        t_p = model(t_u, w, b)               # forward pass
        loss = loss_fn(t_p, t_c)
        grad = grad_fn(t_u, t_c, t_p, w, b)  # analytic gradient

        params = params - learning_rate * grad
        if epoch % report_every == 0:
            # loss was evaluated before this epoch's update
            print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params


# Let's train the model using the training loop
# Run 1: raw inputs, learning rate 1e-2.
# NOTE(review): on the un-normalized t_u this step size is likely too large — check whether the printed loss blows up.
lr = 1e-2
params = training_loop(	n_epochs = 100,
	learning_rate = lr,
	params = torch.tensor([1.0, 0.0]),
	t_u = t_u, t_c = t_c)

print('\nFinal parameters:', params, '\n')

# Run 2: same raw inputs, but a 100x smaller learning rate
lr = 1e-4
params = training_loop(	n_epochs = 100,
	learning_rate = lr,
	params = torch.tensor([1.0, 0.0]),
	t_u = t_u, t_c = t_c)
print('\nFinal parameters:', params, '\n')


# Run 3: rescale the inputs by 0.1 and go back to the larger learning rate
t_un = 0.1 * t_u
params = training_loop(
	n_epochs = 100,
	learning_rate = 1e-2,
	params = torch.tensor([1.0, 0.0]),
	t_u = t_un, # normalized inputs instead of the raw t_u
	t_c = t_c)
print('\nFinal parameters:', params, '\n')

# Run 4: same setup as run 3, trained for 5000 epochs; later cells use these params
params = training_loop(
	n_epochs = 5000,
	learning_rate = 1e-2,
	params = torch.tensor([1.0, 0.0]),
	t_u = t_un, t_c = t_c)
print('\nFinal parameters:', params, '\n')


In [None]:
params
%matplotlib inline
from matplotlib import pyplot as plt

t_p = model(t_un, *params)  # <1>

fig = plt.figure(dpi=200)
plt.xlabel("Temperature (°Fahrenheit)")
plt.ylabel("Temperature (°Celsius)")
plt.plot(t_u.numpy(), t_p.detach().numpy()) # <2>
plt.plot(t_u.numpy(), t_c.numpy(), 'o')
plt.savefig("temp_unknown_plot.png", format="png")  # bookskip
%matplotlib inline
from matplotlib import pyplot as plt

fig = plt.figure(dpi=200)
plt.xlabel("Measurement")
plt.ylabel("Temperature (°Celsius)")
plt.plot(t_u.numpy(), t_c.numpy(), 'o')

plt.savefig("temp_data_plot.png", format="png")

Using autograd...

In [None]:
# The `params` returned by the manual training loop is a plain tensor
# (requires_grad=False), so calling backward() on a loss built from it raises.
# Start from a fresh leaf tensor that autograd tracks.
params = torch.tensor([1.0, 0.0], requires_grad=True)

loss = loss_fn(model(t_u, *params), t_c)
loss.backward()  # fills params.grad with d(loss)/d(params)

params.grad

In [None]:
# A freshly created leaf tensor has no gradient until backward() has been called
params = torch.tensor([1.0, 0.0], requires_grad=True)
params.grad is None

In [None]:
# backward() accumulates into .grad, so reset it in place before the next
# backward pass (skipped while .grad is still None)
if params.grad is not None:
   params.grad.zero_()

In [None]:
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Gradient descent where autograd supplies the gradient.

    Args:
        n_epochs: number of update steps.
        learning_rate: step size.
        params: leaf tensor ``[w, b]`` created with ``requires_grad=True``;
            it is updated in place.
        t_u: model inputs.
        t_c: targets.

    Returns:
        The same ``params`` tensor after training.
    """
    for epoch in range(1, n_epochs + 1):
        # gradients accumulate across backward() calls, so clear the old ones
        stale_grad = params.grad
        if stale_grad is not None:
            stale_grad.zero_()

        loss = loss_fn(model(t_u, *params), t_c)
        loss.backward()

        # the update itself must not be recorded in the autograd graph
        with torch.no_grad():
            params -= learning_rate * params.grad

        if not epoch % 500:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params

In [None]:
# Train with the autograd-based loop; the returned tensor is echoed as the cell output
training_loop(
    n_epochs = 5000,
    learning_rate = 1e-2,
    params = torch.tensor([1.0, 0.0], requires_grad=True), # must require grad so backward() can fill .grad
    t_u = t_un, # rescaled inputs (0.1 * t_u)
    t_c = t_c)

↩ SOLUTION: optimizer...

In [None]:
import torch.optim as optim
# List the names exported by torch.optim (the available optimizers among them)
dir(optim)

In [None]:
# Let's use the optimizer to train the model
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-5
# The optimizer keeps a reference to params and updates it when step() is called
optimizer = optim.SGD([params], lr=learning_rate)
t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)
# Fill params.grad; the update itself is applied by optimizer.step() in the next cell
loss.backward()

In [None]:
# Let's use an optimizer to update the parameters
# Apply the gradient computed by the backward() call of the previous cell
optimizer.step()
# Start over: fresh parameters and a new optimizer with a larger learning rate
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Forward pass and loss on the rescaled inputs
t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)

optimizer.zero_grad() # clear gradients first: backward() accumulates into .grad
loss.backward()
optimizer.step()

# Training loop driven by a torch.optim optimizer
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Train ``params`` by letting ``optimizer`` apply the updates.

    Args:
        n_epochs: number of update steps.
        optimizer: optimizer that was constructed over ``params``.
        params: tensor ``[w, b]`` unpacked into the model call.
        t_u: model inputs.
        t_c: targets.

    Returns:
        ``params`` after training.
    """
    for epoch in range(1, n_epochs + 1):
        loss = loss_fn(model(t_u, *params), t_c)

        optimizer.zero_grad()  # drop gradients left over from the previous step
        loss.backward()
        optimizer.step()

        if not epoch % 500:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params

# Let's train the model using the training loop with an optimizer
training_loop(
    n_epochs = 5000,
    optimizer = optimizer,
    params = params, # must be the same tensor object the optimizer was built with
    t_u = t_un,
    t_c = t_c)

↩ TODO: optimizer...

In [None]:
# Let's use an optimizer to update the parameters


# define a training loop that uses the optimizer


# Let's train the model using the training loop with an optimizer


# 🐍 MLP

**Example 1:** Celsius vs. Fahrenheit...

In [None]:
# Import necessary libraries
import numpy as np
import torch
import torch.optim as optim

# Compact tensor printing: 2 edge items per dimension, lines limited to 75 characters
torch.set_printoptions(edgeitems=2, linewidth=75)

In [None]:
# Same measurements as before, stored as column vectors of shape (11, 1):
# nn.Linear expects one row per sample and one column per feature.
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0])[:, None]
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4])[:, None]

t_u.shape

In [None]:
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)  # 20% of the samples, rounded down
n_train = n_samples - n_val

# Shuffle the indices
shuffled_indices = torch.randperm(n_samples)

# Split the indices into training and validation sets.
# An explicit boundary is used because `[:-n_val]` / `[-n_val:]` would give
# an empty training set (and put every sample in validation) when n_val is 0.
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

# show the indices
train_indices, val_indices

In [None]:
#  Split the data into training and validation sets using the shuffled indices
t_u_train = t_u[train_indices]
t_c_train = t_c[train_indices]

t_u_val = t_u[val_indices]
t_c_val = t_c[val_indices]

# Rescale the inputs by 0.1 (same normalization as t_un in the linear-model section)
t_un_train = 0.1 * t_u_train
t_un_val = 0.1 * t_u_val

In [None]:
import torch.nn as nn

# Define a simple linear model: 1 input feature -> 1 output feature
linear_model = nn.Linear(1, 1)
# Forward pass on the validation inputs; no training has happened yet
linear_model(t_un_val)

In [None]:
# Inspect the learnable parameters of the layer
print(linear_model.weight)
print(linear_model.bias)

In [None]:
# Batched call: 10 samples with 1 feature each, giving one output row per sample
x = torch.ones(10, 1)
linear_model(x)

In [None]:
# Re-create the model and hand its parameters to plain SGD
linear_model = nn.Linear(1, 1)
optimizer = optim.SGD(linear_model.parameters(), lr=1e-2)

In [None]:
def training_loop(n_epochs, optimizer, model, loss_fn, t_u_train, t_u_val, t_c_train, t_c_val):
    """Train ``model`` and report training and validation loss.

    Args:
        n_epochs: number of optimization steps.
        optimizer: optimizer built over ``model.parameters()``.
        model: callable mapping inputs to predictions.
        loss_fn: callable ``(predictions, targets) -> scalar loss``.
        t_u_train, t_c_train: training inputs and targets.
        t_u_val, t_c_val: validation inputs and targets.

    Returns:
        None. The model is updated in place through the optimizer, and
        progress is printed on epoch 1 and every 1000 epochs.
    """
    for epoch in range(1, n_epochs + 1):
        t_p_train = model(t_u_train)
        loss_train = loss_fn(t_p_train, t_c_train)

        # Validation is only monitored, never back-propagated, so it is
        # evaluated without building an autograd graph.
        with torch.no_grad():
            t_p_val = model(t_u_val)
            loss_val = loss_fn(t_p_val, t_c_val)

        optimizer.zero_grad()
        loss_train.backward()  # gradients come from the training loss only
        optimizer.step()

        if epoch == 1 or epoch % 1000 == 0:
            print(f"Epoch {epoch}, Training loss {loss_train.item():.4f},"f" Validation loss {loss_val.item():.4f}")


In [None]:
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``."""
    residual = t_p - t_c
    return (residual**2).mean()

linear_model = nn.Linear(1, 1) # fresh model, so training starts from new initial weights
optimizer = optim.SGD(linear_model.parameters(), lr=1e-2)

# Train with the hand-written MSE defined at the top of this cell
training_loop(
    n_epochs = 3000,
    optimizer = optimizer,
    model = linear_model,
    loss_fn = loss_fn,
    t_u_train = t_un_train,
    t_u_val = t_un_val,
    t_c_train = t_c_train,
    t_c_val = t_c_val)

# Learned slope and intercept
print()
print(linear_model.weight)
print(linear_model.bias)

In [None]:
# Same experiment as the previous cell, with the built-in loss module
linear_model = nn.Linear(1, 1)
optimizer = optim.SGD(linear_model.parameters(), lr=1e-2)

training_loop(
	n_epochs = 3000,
	optimizer = optimizer,
	model = linear_model,
	loss_fn = nn.MSELoss(), # built-in loss module in place of the hand-written loss_fn
	t_u_train = t_un_train,
	t_u_val = t_un_val,
	t_c_train = t_c_train,
	t_c_val = t_c_val)

# Learned slope and intercept
print()
print(linear_model.weight)
print(linear_model.bias)

↩ Solution: optimizer...

In [None]:
# sequential model: the listed modules are applied one after another
seq_model = nn.Sequential(
            nn.Linear(1, 13), # hidden layer: 1 input feature -> 13 hidden features
            nn.Tanh(),
            nn.Linear(13, 1)) # output layer: 13 hidden features -> 1 value
seq_model

In [None]:
# Shape of every learnable tensor in the network, in the order parameters() yields them
[param.shape for param in seq_model.parameters()]

In [None]:
# Let's use the optimizer to train the model
optimizer = optim.SGD(seq_model.parameters(), lr=1e-3) # smaller learning rate than the 1e-2 used for the linear model

# Run the training loop defined earlier on the sequential model
training_loop(
	n_epochs = 5000,
	optimizer = optimizer,
	model = seq_model,
	loss_fn = nn.MSELoss(),
	t_u_train = t_un_train,
	t_u_val = t_un_val,
	t_c_train = t_c_train,
	t_c_val = t_c_val)

# Compare predictions with the targets on the validation set
print('output', seq_model(t_un_val))
print('answer', t_c_val)
print('hidden', seq_model[0].weight) # seq_model[0] is the first nn.Linear
print('hidden', seq_model[0].bias) # bias of that same first layer



↩ TODO: optimizer...

In [None]:
# sequential model

In [None]:
# Let's use the optimizer to train the model

# Let's define a training loop that uses the optimizer


# Let's train the model using the training loop with an optimizer
# Report for the exercise: requires seq_model to have been defined and trained above
print('output', seq_model(t_un_val))
print('answer', t_c_val)
print('hidden', seq_model[0].weight) # seq_model[0] is the first layer of the network
print('hidden', seq_model[0].bias) # bias of that same first layer

In [None]:
from matplotlib import pyplot as plt

# Inputs 20, 21, ..., 89 as a column vector, used to draw the fitted curve
t_range = torch.arange(20., 90.).unsqueeze(1)

fig = plt.figure(dpi=200)
plt.xlabel("Fahrenheit")
plt.ylabel("Celsius")
plt.plot(t_u.numpy(), t_c.numpy(), 'o')  # measurements
# The network was trained on inputs scaled by 0.1, so apply the same scaling here
plt.plot(t_range.numpy(), seq_model(0.1 * t_range).detach().numpy(), 'c-')  # fitted curve
plt.plot(t_u.numpy(), seq_model(0.1 * t_u).detach().numpy(), 'kx')  # predictions at the measured inputs
plt.show()

**Example 2:** A nonlinear dataset.

Use the dataset for regression:
- define an MLP model
- instantiate model, define loss and optimizer
- train the model
- plot results


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Generate a synthetic non-linear dataset (seeded, so the noise is reproducible)
torch.manual_seed(0)
x = torch.linspace(-3, 3, 100).unsqueeze(1)  # inputs as a column vector, shape (100, 1)
noise = 0.2 * torch.randn_like(x)            # Gaussian noise scaled by 0.2
y = x**2 + torch.sin(x) + noise              # non-linear target with noise


In [None]:
# Define MLP model
class MLPRegression(nn.Module):
    """Fully connected regressor: 1 -> 64 -> 64 -> 1 with ReLU between the layers."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
        self.activation = nn.ReLU()  # non-linearity shared by both hidden layers

    def forward(self, x):
        """Map inputs with one feature in the last dimension to one output value each."""
        hidden = self.fc1(x)
        # every later layer receives the activated output of the previous one;
        # the final layer's output is returned without an activation
        for layer in (self.fc2, self.fc3):
            hidden = layer(self.activation(hidden))
        return hidden

# Instantiate model, define loss and optimizer
# NOTE: this rebinds the name `model`, which earlier cells use for the linear model function
model = MLPRegression()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
# Full-batch training: every step uses the whole dataset (x, y)
epochs = 500
for epoch in range(epochs):
    optimizer.zero_grad()  # clear the gradients of the previous step
    y_pred = model(x)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()

# Plot results: noisy samples as points, the network's prediction as a line
plt.scatter(x.numpy(), y.numpy(), label="True Data", alpha=0.5)
# detach() drops the autograd graph so the prediction can be converted to NumPy
plt.plot(x.numpy(), model(x).detach().numpy(), color='red', label="MLP Prediction")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Non-Linear Regression with MLP")
plt.legend()
plt.show()
