In [1]:
import torch

In [2]:
data = [[1, 2, 3], [4, 5, 6]]
my_tensor = torch.tensor(data)
print(my_tensor)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [3]:
shape = (2, 3)
ones = torch.ones(shape)
zeros = torch.zeros(shape)
random = torch.rand(shape)

print(random)

tensor([[0.1939, 0.3376, 0.9500],
        [0.8846, 0.3636, 0.9549]])


In [4]:
template = torch.tensor([[1, 2, 3], [4, 5, 6]])

rand_like = torch.rand_like(template, dtype=torch.float32)

print(rand_like)

tensor([[0.4719, 0.0345, 0.7594],
        [0.7681, 0.5464, 0.4059]])


In [5]:
tensor = torch.rand(3, 4)

print(f"Shape: {tensor.shape}")
print(f"Datatype: {tensor.dtype}")
print(f"Device: {tensor.device}")

Shape: torch.Size([3, 4])
Datatype: torch.float32
Device: cpu


In [6]:
x_data = torch.tensor([[1, 2, 3], [4, 5, 6]])
w = torch.rand_like(x_data, dtype=torch.float32, requires_grad=True)

print(f"Data tensor requires grad: {x_data.requires_grad}")
print(f"Weights tensor requires grad: {w.requires_grad}")

Data tensor requires grad: False
Weights tensor requires grad: True


y = a + b, 
z = x * y

In [8]:
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(3.0, requires_grad=True)
x = torch.tensor(4.0, requires_grad=True)

In [9]:
y = a + b
z = x * y

In [11]:
y, z

(tensor(5., grad_fn=<AddBackward0>), tensor(20., grad_fn=<MulBackward0>))

In [12]:
print(f"grad_fn for y: {y.grad_fn}")
print(f"grad_fn for z: {z.grad_fn}")

print(f"grad_fn for a: {a.grad_fn}")

grad_fn for y: <AddBackward0 object at 0x138c3c4f0>
grad_fn for z: <MulBackward0 object at 0x1079fc070>
grad_fn for a: None


In [13]:
a = torch.tensor([[1,2], [3, 4]])
b = torch.tensor([[5,6], [7, 8]])

a * b

tensor([[ 5, 12],
        [21, 32]])

In [14]:
a @ b

tensor([[19, 22],
        [43, 50]])

In [15]:
m1 = torch.tensor([[1, 2, 3], [4, 5, 6]])
m2 = torch.tensor([[7, 8], [9, 10], [11, 12]])

m1 @ m2

tensor([[ 58,  64],
        [139, 154]])

In [16]:
torch.matmul(m1, m2)

tensor([[ 58,  64],
        [139, 154]])

In [18]:
scores = torch.tensor([[10.0,20,30.0], [5.0,10.0,15.0]])
average = scores.mean()
average


tensor(15.)

In [24]:
avg_per_student = scores.mean(dim=1, keepdims=True)
avg_per_assignment = scores.mean(dim=0, keepdims=True)
avg_per_student, avg_per_student.shape, avg_per_assignment, avg_per_assignment.shape

(tensor([[20.],
         [10.]]),
 torch.Size([2, 1]),
 tensor([[ 7.5000, 15.0000, 22.5000]]),
 torch.Size([1, 3]))

In [25]:
scores = torch.tensor([
    [10, 0, 5, 20, 1],
    [1, 30, 2, 5, 0]
])

best_indices = scores.argmax(dim=1)
best_indices

tensor([3, 1])

In [28]:
scores.argmax(dim=0)

tensor([0, 1, 0, 0, 0])

In [30]:
data = torch.tensor([
    [10, 11, 12, 13],
    [20, 21, 22, 23],
    [30, 31, 32, 33]
])

indices_to_select = torch.tensor([[2], [0]])

selected_data = data.gather(dim=1, index=indices_to_select)
selected_data

tensor([[12],
        [20]])

In [31]:
selected_data = data.gather(dim=0, index=indices_to_select)
selected_data

tensor([[30],
        [10]])

# 1. Forward pass

## 1.1 Data (fake)


In [43]:
# batch size (data sample)
N = 10
# each data point will have D_in input feature and D_out output feature
D_in, D_out = 1, 1

torch.manual_seed(0)
# Create random input data X
X = torch.randn(N, D_in)

# create target output data Y by using "true" weights and adding some noise
true_w = torch.tensor([[2.0]])
true_b = torch.tensor(1.0)
torch.manual_seed(0)
y_true = X @ true_w + true_b + 0.1 * torch.randn(N, D_out)

In [44]:
y_true

tensor([[ 4.2361],
        [ 0.3838],
        [-3.5755],
        [ 2.1937],
        [-1.2775],
        [-1.9371],
        [ 1.8470],
        [ 2.7599],
        [-0.5104],
        [ 0.1530]])

In [45]:
# initialise weights and bias
torch.manual_seed(0)
W = torch.randn(D_in, D_out, requires_grad=True)
b = torch.randn(D_out, requires_grad=True)

In [46]:
print(f"Initial weights: {W}")
print(f"Initial bias: {b}")

Initial weights: tensor([[1.5410]], requires_grad=True)
Initial bias: tensor([-0.2934], requires_grad=True)


In [47]:
y_hat = X @ W + b # foward pass

In [48]:
y_hat, y_true

(tensor([[ 2.0812],
         [-0.7456],
         [-3.6509],
         [ 0.5825],
         [-1.9647],
         [-2.4487],
         [ 0.3281],
         [ 0.9980],
         [-1.4018],
         [-0.9150]], grad_fn=<AddBackward0>),
 tensor([[ 4.2361],
         [ 0.3838],
         [-3.5755],
         [ 2.1937],
         [-1.2775],
         [-1.9371],
         [ 1.8470],
         [ 2.7599],
         [-0.5104],
         [ 0.1530]]))

# 2. backward pass
`loss.backward` tells pytorch tpo travel backward from the loss and calculate gradients for all parameters with `requires_grad=True`

In [49]:
error = (y_hat - y_true)
squared_error = error ** 2
loss = squared_error.mean()
loss

tensor(1.6601, grad_fn=<MeanBackward0>)

In [50]:
loss.backward() ## compute gradients

In [51]:
print(f"Gradient of W: {W.grad}")
print(f"Gradient of b: {b.grad}")

Gradient of W: tensor([[-0.6594]])
Gradient of b: tensor([-2.2820])


# 3. Training loop

In [52]:
# Hyperparameters
learning_rate, epochs = 0.01, 100

# Re-initialize weights and bias
torch.manual_seed(0)
W, b = torch.randn(D_in, D_out, requires_grad=True), torch.randn(D_out, requires_grad=True)

# Training loop
for epoch in range(epochs):
    # Forward pass: compute predicted y
    y_hat = X @ W + b

    # compute and print loss
    loss = torch.mean((y_hat - y_true) ** 2)

    # Backward pass
    loss.backward()

    # Update parameters using gradient descent
    with torch.no_grad():
        W -= learning_rate * W.grad
        b -= learning_rate * b.grad

        # Manually zero the gradients after updating weights
        W.grad.zero_()
        b.grad.zero_()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: loss = {loss.item()}, W = {W.squeeze().item()}, b = {b.item()}")
        print(f"True W = {true_w.squeeze().item()}, True b = {true_b.item()}")

Epoch 0: loss = 1.66008722782135, W = 1.5475902557373047, b = -0.2706092894077301
True W = 2.0, True b = 1.0
Epoch 10: loss = 1.1799218654632568, W = 1.611316204071045, b = -0.06423696875572205
True W = 2.0, True b = 1.0
Epoch 20: loss = 0.840751051902771, W = 1.6703565120697021, b = 0.10745744407176971
True W = 2.0, True b = 1.0
Epoch 30: loss = 0.6002843976020813, W = 1.7241860628128052, b = 0.25056958198547363
True W = 2.0, True b = 1.0
Epoch 40: loss = 0.42928481101989746, W = 1.772666096687317, b = 0.3700633943080902
True W = 2.0, True b = 1.0
Epoch 50: loss = 0.3073905110359192, W = 1.8159093856811523, b = 0.46999427676200867
True W = 2.0, True b = 1.0
Epoch 60: loss = 0.22033162415027618, W = 1.8541849851608276, b = 0.5536853671073914
True W = 2.0, True b = 1.0
Epoch 70: loss = 0.15805687010288239, W = 1.8878511190414429, b = 0.6238675713539124
True W = 2.0, True b = 1.0
Epoch 80: loss = 0.11345567554235458, W = 1.917310118675232, b = 0.6827914714813232
True W = 2.0, True b = 1.

# 4. Torch.nn Module

In [54]:
# Input, output dims
D_in = 1
D_out = 1

# Create the Linear layer LEGO brick
linear_layer = torch.nn.Linear(in_features=D_in, out_features=D_out)

# Look inside the parameters the torch.nn.Linear() API created
print(f"layer's WWEight (W): {linear_layer.weight}")
print(f"layer's Bias (b): {linear_layer.bias}")

# Use the layer to perform a forward pass on input data X
y_hat_nn = linear_layer(X)

print(f"Predicted y for first 3 inputs (using torch.nn.Linear): {y_hat_nn[:3]}")

layer's WWEight (W): Parameter containing:
tensor([[-0.0198]], requires_grad=True)
layer's Bias (b): Parameter containing:
tensor([0.7929], requires_grad=True)
Predicted y for first 3 inputs (using torch.nn.Linear): tensor([[0.7624],
        [0.7987],
        [0.8361]], grad_fn=<SliceBackward0>)


In [56]:
list(linear_layer.parameters())

[Parameter containing:
 tensor([[-0.0198]], requires_grad=True),
 Parameter containing:
 tensor([0.7929], requires_grad=True)]

In [57]:
linear_layer.state_dict()

OrderedDict([('weight', tensor([[-0.0198]])), ('bias', tensor([0.7929]))])

## Non-linearity
ReLu, tanh, sigmoid


In [58]:
relu = torch.nn.ReLU()
sample_date = torch.tensor([-1.0, 2.0, -3.0, 4.0])
activated_data = relu(sample_date)

print(f"original data: {sample_date}")
print(f"ReLU activated data: {activated_data}")

original data: tensor([-1.,  2., -3.,  4.])
ReLU activated data: tensor([0., 2., 0., 4.])


In [59]:
gelu = torch.nn.GELU()
sample_date = torch.tensor([-1.0, 2.0, -3.0, 4.0])
activated_data = gelu(sample_date)

print(f"original data: {sample_date}")
print(f"GELU activated data: {activated_data}")

original data: tensor([-1.,  2., -3.,  4.])
GELU activated data: tensor([-0.1587,  1.9545, -0.0040,  3.9999])


In [61]:
softmax = torch.nn.Softmax(dim=-1)
# Raw mmodel scores for 2 itesm, across 4 possible classes
logits = torch.tensor([[1.0, 3.0, 0.5, 1.5], [-1.0, 2.0, 1.0, 0.0]])
probabilities = softmax(logits)

print(f"output probabilities: {probabilities}")
print(f"Sum of probabilities per item: {probabilities.sum(dim=1)}")

output probabilities: tensor([[0.0939, 0.6942, 0.0570, 0.1549],
        [0.0321, 0.6439, 0.2369, 0.0871]])
Sum of probabilities per item: tensor([1.0000, 1.0000])


In [62]:
import torch
import torch.nn as nn

class LinearRegressionModel(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__() #<- almost everything in PyTorch inherits from nn.Module
        # define layers in the constructor
        self.linear_layer = nn.Linear(in_features=in_features,
                                      out_features=out_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # define the forward pass - connecting the layers
        return self.linear_layer(x)

# create an instance of the model
model = LinearRegressionModel(in_features=1, out_features=1)
print(model)

LinearRegressionModel(
  (linear_layer): Linear(in_features=1, out_features=1, bias=True)
)


In [64]:
import torch.optim as optim

# Hyperparameters
learning_rate = 0.01

# Create an Adam optimizer
# pass the model's parameters to optimize
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Setup loss function
loss_fn = nn.MSELoss()

In [65]:
## FInal clean teraining loop
torch.manual_seed(42)
epochs = 200

for epoch in range(epochs):
    model.train()  # set the model to training mode

    # 1. Forwar pass
    y_pred = model(X)

    # 2. Calculate the loss
    loss = loss_fn(y_pred, y_true)

    ### The three line mantra ###
    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Backpropagation compute gradients
    loss.backward()

    # 5. Step the optimizer (perform gradient descent)
    optimizer.step()

    #print progress every 20 epochs
    if epoch % 20 == 0:
        print(f"Epoch: {epoch:02d} | Loss: {loss.item():.4f}")
   

Epoch: 00 | Loss: 5.5114
Epoch: 20 | Loss: 4.5390
Epoch: 40 | Loss: 3.6983
Epoch: 60 | Loss: 2.9786
Epoch: 80 | Loss: 2.3716
Epoch: 100 | Loss: 1.8660
Epoch: 120 | Loss: 1.4501
Epoch: 140 | Loss: 1.1123
Epoch: 160 | Loss: 0.8418
Epoch: 180 | Loss: 0.6282
