In [1]:
import torch


- NumPy: Ideal for general-purpose numerical computations and works well on CPUs. It lacks built-in support for GPU computation and automatic differentiation.
- PyTorch: Tailored for deep learning with built-in support for GPU acceleration and automatic differentiation. It is highly suitable for training neural networks.

## Football Pitch Example

In [6]:
# Define the coordinate vectors for the length and width of the pitch
length = torch.arange(1, 10)
width = torch.arange(1, 6)

# Generate coordinate grids
grid_x, grid_y = torch.meshgrid(length, width)

print(f"grid_x: \n {grid_x}")

print(f"grid_y: \n {grid_y}")

grid_x: 
 tensor([[1, 1, 1, 1, 1],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 3],
        [4, 4, 4, 4, 4],
        [5, 5, 5, 5, 5],
        [6, 6, 6, 6, 6],
        [7, 7, 7, 7, 7],
        [8, 8, 8, 8, 8],
        [9, 9, 9, 9, 9]])
grid_y: 
 tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])


### Explanation
- grid_x contains the x-coordinates (length) repeated along the columns.
- grid_y contains the y-coordinates (width) repeated along the rows.

Each element (grid_x[i, j], grid_y[i, j]) corresponds to a specific coordinate on the pitch.

*Example: Locating a Football Player*

Let's say we want to place a football player at the coordinate (3, 2). This means the player is standing at the 3rd position in length and the 2nd position in width.

Using the grid indices:

i corresponds to the length index.
j corresponds to the width index.

In [8]:
i = 2
j = 1

player_position = (grid_x[i, j], grid_y[i, j])
print(f"The football player stands at coordinate: {player_position}")

The football player stands at coordinate: (tensor(3), tensor(2))


In [10]:
grid_y.shape

torch.Size([9, 5])

In [18]:
grid_x[i, 3]

tensor(3)

In [11]:
# Scalar tensors have no dimensions 
# and can be converted to standard Python numbers using .item().
grid_x[i, j].item()

3

In [12]:
grid_y[i, j].item()

2

In [14]:
# Visualizing player position

def visualize_player_pos(grid_x, grid_y, player_position):
    for i in range(grid_x.shape[0]):
        row = []
        for j in range(grid_x.shape[1]):
            if (grid_x[i, j], grid_y[i,j]) == player_position:
                row.append(f"({grid_x[i, j].item()},{grid_y[i, j].item()})*")  # Mark player position
            else:
                row.append(f"({grid_x[i, j].item()},{grid_y[i, j].item()})")
        print(" ".join(row))

In [15]:
visualize_player_pos(grid_x, grid_y, player_position)

(1,1) (1,2) (1,3) (1,4) (1,5)
(2,1) (2,2) (2,3) (2,4) (2,5)
(3,1) (3,2)* (3,3) (3,4) (3,5)
(4,1) (4,2) (4,3) (4,4) (4,5)
(5,1) (5,2) (5,3) (5,4) (5,5)
(6,1) (6,2) (6,3) (6,4) (6,5)
(7,1) (7,2) (7,3) (7,4) (7,5)
(8,1) (8,2) (8,3) (8,4) (8,5)
(9,1) (9,2) (9,3) (9,4) (9,5)


In [22]:
# Check if GPU is available
if torch.cuda.is_available():
    torch_tensor = torch_tensor.to('cuda')
    print(torch_tensor)  # Tensor is now on the GPU


In [24]:
torch.cuda.is_available()

False

In [2]:
import torch
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
 
# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(f"ID of current CUDA device:{torch.cuda.current_device()}")
       
print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")

Is CUDA supported by this system? True
CUDA version: 11.8
ID of current CUDA device:0
Name of current CUDA device:NVIDIA GeForce RTX 4050 Laptop GPU


## Basic example for gradient

In [42]:
import torch 

# Create a tensor with requires_grad = True to track computations
x = torch.tensor(2.0, requires_grad=True)

# Define a simple function
y = x ** 2

# Calculate the gradient of y with respect to x
y.backward() # This computes the gradient dy/dx
print(f"Gradient of y with respect to x: {x.grad}")

Gradient of y with respect to x: 4.0


In [45]:
# Now let's use no_grad to perform operations without tracking gradients
with torch.no_grad():
    z = x + 1
    print(f'Value of z (x + 1) without tracking gradients: {z}')
    print(z.requires_grad)

Value of z (x + 1) without tracking gradients: 3.0
False


In [47]:
# Reset gradient to zero
x.grad.zero_()
x.grad

tensor(0.)

In [48]:
# Perform operations with gradients again
w = x ** 3 + 3 * x ** 2
w.backward() # This computes the gradient dw/dx
print(f'Gradient of w wrt x: {x.grad}')

Gradient of w wrt x: 24.0


In [49]:
# Now demonstrating the effect of torch.no_grad()
x.grad.zero_()

with torch.no_grad():
    q = x ** 3 + 3 * x ** 2
    q.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [50]:
x.grad

tensor(0.)

In [51]:
# Create a tensor with requires_grad=True to track computations
x = torch.tensor(2.0, requires_grad=True)

# First function: y = x^2
y = x ** 2
y.backward()
print(f'Gradient of y with respect to x after first backward: {x.grad}')  # Prints: 4.0

# Store the gradient from the first function
grad_y = x.grad.clone()

# Perform operations without tracking gradients using torch.no_grad()
with torch.no_grad():
    z = x ** 3 + 3 * x ** 2
    print(f'Value of z (x^3 + 3x^2) without tracking gradients: {z}')  # This will not affect x.grad

# Verify that the gradient from the first function is preserved
print(f'Stored gradient of y with respect to x: {grad_y}')  # Should still be 4.0

# Perform another operation with gradient tracking enabled
w = x ** 4
w.backward()
print(f'Gradient of w with respect to x after third backward (without zeroing): {x.grad}')  # Prints: 36.0


Gradient of y with respect to x after first backward: 4.0
Value of z (x^3 + 3x^2) without tracking gradients: 20.0
Stored gradient of y with respect to x: 4.0
Gradient of w with respect to x after third backward (without zeroing): 36.0


Here, w is defined as x^4. The gradient dw/dx = 4x^3, so dw/dx is 32.0 when x = 2.0. Since we did not reset the gradient, x.grad now accumulates the gradient from the previous backward pass (4.0 for y = x^2) and the new backward pass (32.0 for w = x^4). Therefore, x.grad is 4.0 + 32.0 = 36.0.

# PyTorch Tutorial: Basics

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [2]:
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root = "data",
    train = True,
    download = True,
    transform = ToTensor()
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root = "data",
    train = False,
    download = True,
    transform = ToTensor()
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz


0.1%

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw


100.0%


Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz
Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz



10.4%

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%

Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw






In computer vision and deep learning, N, C, H, and W are commonly used abbreviations that represent dimensions in multi-dimensional arrays or tensors. Here's what they typically stand for:

N: Stands for the batch size or the number of samples in a batch. In deep learning, it is common to process data in batches rather than individual samples. The batch size determines how many samples are processed simultaneously during training or inference.

C: Represents the number of channels. In computer vision, an image can have multiple channels, such as Red, Green, and Blue (RGB) channels. The number of channels can also represent different features or filters in a convolutional neural network (CNN).

H: Denotes the height of an image or the number of rows in a tensor. In the context of computer vision, it refers to the vertical dimension of an image or the height of a feature map in a CNN.

W: Represents the width of an image or the number of columns in a tensor. In computer vision, it refers to the horizontal dimension of an image or the width of a feature map in a CNN

In [6]:
# Passing the 'Dataset' as an argument to 'DataLoader' 
# This wraps an iterable over our dataset, and support automatic batcvhing, sampling etc.
# Here we define batch size of 64 -> a botch of 64 features and labels
batch_size = 64

# Create data loaders
train_dataloader = DataLoader(training_data, batch_size = batch_size)
test_dataloader = DataLoader(test_data, batch_size = batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W] {X.shape}") 
    print(f"Shape of Y {y.shape}")
    print(X)
    break

Shape of X [N, C, H, W] torch.Size([64, 1, 28, 28])
Shape of Y torch.Size([64])
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
        

In [10]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device}")

# Defining NN model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )
    
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
model = NeuralNetwork().to(device)
print(model)

Using cuda
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [13]:
model.train()

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

## Optimizing Model Parameters

In [16]:
loss_fn = nn.CrossEntropyLoss() # loss function
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3) # stochastic GD

In [17]:
# In a single training loop, the model makes predictions on the training dataset
# fed to it in batches
# and backpropagates the prediciton error to adjust model's parameters

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediciton error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

In [18]:
# We also check the model's performance against the test dataset to ensure its learning
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(
        f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n"
        )

In deep learning:

- Epoch: One complete pass through the entire training dataset.
- Batch: A subset of the training data used to update model weights in one iteration.

*TL;DR: Epoch = full dataset pass, Batch = data subset for one update.*

In [22]:
%time
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

CPU times: total: 0 ns
Wall time: 0 ns
Epoch 1
-------------------------------
loss: 2.182179 [   64/60000]
loss: 2.160292 [ 6464/60000]
loss: 2.114250 [12864/60000]
loss: 2.138345 [19264/60000]
loss: 2.080538 [25664/60000]
loss: 2.031904 [32064/60000]
loss: 2.064902 [38464/60000]
loss: 1.978102 [44864/60000]
loss: 1.993188 [51264/60000]
loss: 1.938186 [57664/60000]
Test Error: 
 Accuracy: 52.8%, Avg loss: 1.919589 

Epoch 2
-------------------------------
loss: 1.949327 [   64/60000]
loss: 1.903763 [ 6464/60000]
loss: 1.800116 [12864/60000]
loss: 1.856154 [19264/60000]
loss: 1.737408 [25664/60000]
loss: 1.687487 [32064/60000]
loss: 1.729351 [38464/60000]
loss: 1.612513 [44864/60000]
loss: 1.645349 [51264/60000]
loss: 1.556500 [57664/60000]
Test Error: 
 Accuracy: 58.9%, Avg loss: 1.556003 

Epoch 3
-------------------------------
loss: 1.619138 [   64/60000]
loss: 1.570495 [ 6464/60000]
loss: 1.429782 [12864/60000]
loss: 1.516708 [19264/60000]
loss: 1.392405 [25664/60000]
loss: 1.3788

In [None]:
# Saving model
torch.save(model.state_dict(), "model.pth")
print("Saved as model.pth")

In [None]:
# Loading model
model = NeuralNetwork().to(device)
model.load_state_dict(torch.load("model.pth"))

In [24]:
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"


# Tensors

What does .squeeze()?

In [1]:
import torch

# Example output tensor with an extra dimension
outputs = torch.tensor([[0.8], [0.3], [0.9], [0.1]])
print(outputs.shape)  # Output: torch.Size([4, 1])

# Using squeeze() to remove the extra dimension
outputs_squeezed = outputs.squeeze()
print(outputs_squeezed.shape)  # Output: torch.Size([4])

torch.Size([4, 1])
torch.Size([4])


In [2]:
print(f"Outputs: {outputs}")
print(f"Outputs Squeezed: {outputs_squeezed}")

Outputs: tensor([[0.8000],
        [0.3000],
        [0.9000],
        [0.1000]])
Outputs Squeezed: tensor([0.8000, 0.3000, 0.9000, 0.1000])


In [4]:
(outputs_squeezed > 0.2).int()

tensor([1, 1, 1, 0], dtype=torch.int32)

In [5]:
outputs.size(0)

4

What does .item()?

In [6]:
import torch
import torch.nn as nn

# Example: Binary classification problem
# Suppose we have a batch of 3 predictions and corresponding labels
predictions = torch.tensor([[0.8], [0.3], [0.9]])  # logits
labels = torch.tensor([1, 0, 1])  # true labels

# Binary cross-entropy loss
criterion = nn.BCEWithLogitsLoss()
loss = criterion(predictions.squeeze(), labels.float())  # Compute loss

# Print the tensor returned by the loss function
print("Loss as tensor:", loss)  # This is a tensor with a single scalar value

# Convert the tensor to a Python float using .item()
print("Loss as scalar (Python float):", loss.item())

# Accuracy calculation
preds = (predictions.squeeze() > 0.5).int()  # Convert logits to binary predictions
correct_predictions = (preds == labels).sum()  # Count correct predictions

# Print the tensor result of the sum
print("Correct predictions as tensor:", correct_predictions)

# Convert to a Python scalar
print("Correct predictions as scalar:", correct_predictions.item())

# Use for metrics calculation
total_samples = labels.size(0)
accuracy = correct_predictions.item() / total_samples
print("Accuracy:", accuracy)


Loss as tensor: tensor(0.5222)
Loss as scalar (Python float): 0.522203266620636
Correct predictions as tensor: tensor(3)
Correct predictions as scalar: 3
Accuracy: 1.0


In [8]:
predictions.item()

RuntimeError: a Tensor with 3 elements cannot be converted to Scalar

## Why do we need squeze and float?

In [9]:
import torch
import torch.nn as nn

# Define a simple batch of model outputs (logits) and labels
logits = torch.tensor([[0.8], [0.3], [0.9], [0.1]])  # Shape: (4, 1)
labels_int = torch.tensor([1, 0, 1, 0])  # Integer labels, shape: (4,)

# Define the loss function
criterion = nn.BCEWithLogitsLoss()

# Case 1: Incorrect usage (no squeeze, no float conversion)
try:
    batch_loss_incorrect = criterion(logits, labels_int)
    print("Loss without squeeze and float:", batch_loss_incorrect.item())
except Exception as e:
    print("Error without squeeze and float:", e)

# Case 2: Incorrect usage (squeeze but no float conversion)
try:
    batch_loss_incorrect_float = criterion(logits.squeeze(), labels_int)
    print("Loss with squeeze but no float:", batch_loss_incorrect_float.item())
except Exception as e:
    print("Error with squeeze but no float:", e)

# Case 3: Correct usage (squeeze logits and convert labels to float)
try:
    batch_loss_correct = criterion(logits.squeeze(), labels_int.float())
    print("Correct loss (squeeze + float):", batch_loss_correct.item())
except Exception as e:
    print("Error in correct case:", e)

# Case 4: Correct usage (labels already as floats, just squeeze logits)
labels_float = labels_int.float()
try:
    batch_loss_correct_float = criterion(logits.squeeze(), labels_float)
    print("Correct loss (labels already float):", batch_loss_correct_float.item())
except Exception as e:
    print("Error with float labels:", e)


Error without squeeze and float: Target size (torch.Size([4])) must be the same as input size (torch.Size([4, 1]))
Error with squeeze but no float: result type Float can't be cast to the desired output type Long
Correct loss (squeeze + float): 0.577751636505127
Correct loss (labels already float): 0.577751636505127


In [15]:
# Using view
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 32
seq_len = 28
emb_dim = 128
num_heads = 4
head_dim = emb_dim // num_heads

Q = torch.rand(batch_size, seq_len, num_heads * head_dim)

In [6]:
Q.view(batch_size, seq_len, num_heads, head_dim)

tensor([[[[9.4619e-02, 9.0232e-01, 6.6131e-01,  ..., 5.8428e-02,
           6.6578e-01, 4.8033e-01],
          [6.2897e-01, 3.9584e-01, 7.3918e-01,  ..., 4.4627e-01,
           5.3433e-01, 2.7637e-01],
          [1.1914e-01, 5.3952e-01, 1.2978e-01,  ..., 7.8426e-01,
           3.1282e-01, 8.7217e-01],
          [2.5570e-01, 6.3406e-01, 4.3206e-01,  ..., 1.5554e-01,
           5.1569e-01, 6.8033e-01]],

         [[1.6979e-01, 2.3182e-02, 9.8731e-01,  ..., 3.7517e-01,
           5.8407e-01, 8.7816e-01],
          [7.4487e-01, 5.6965e-01, 6.2576e-01,  ..., 5.4415e-01,
           6.9752e-01, 4.9220e-01],
          [9.6668e-01, 9.9252e-01, 5.9463e-01,  ..., 5.8988e-01,
           9.5338e-01, 5.5533e-01],
          [2.6364e-01, 7.8745e-01, 5.5403e-01,  ..., 8.8815e-01,
           4.4679e-01, 1.3823e-01]],

         [[1.9543e-01, 1.8294e-01, 1.1664e-01,  ..., 4.0223e-01,
           7.7993e-01, 8.8495e-01],
          [1.3703e-01, 6.1146e-01, 4.5294e-01,  ..., 4.4185e-01,
           1.7802e-01,

In [8]:
Q.shape

torch.Size([32, 28, 128])

In [9]:
Q.view(batch_size, seq_len, num_heads, head_dim).shape

torch.Size([32, 28, 4, 32])

In [16]:
Q_linear = nn.Linear(emb_dim, num_heads * head_dim)

In [17]:
Q_hat = Q_linear(Q)

In [18]:
Q_hat.shape

torch.Size([32, 28, 128])

In [19]:
num_heads * head_dim

128

In [21]:
Q_hat.view(batch_size, seq_len, num_heads, head_dim).shape

torch.Size([32, 28, 4, 32])

In [30]:
Q = torch.rand(batch_size, seq_len, num_heads * head_dim)
Q_linear = nn.Linear(emb_dim, num_heads * head_dim)
Q_hat = Q_linear(Q)

Q_hat = Q_hat.view(batch_size, seq_len, num_heads, head_dim)


K = torch.rand(batch_size, seq_len, num_heads * head_dim)
K_linear = nn.Linear(emb_dim, num_heads * head_dim)
K_hat = K_linear(K)
K_hat = K.view(batch_size, seq_len, num_heads, head_dim)


key_out = torch.matmul(Q_hat, K_hat.transpose(-2, -1))

In [31]:
K_hat.transpose(-2, -1).shape

torch.Size([32, 28, 32, 4])

In [32]:
Q_hat.shape

torch.Size([32, 28, 4, 32])

In [29]:
Q.shape

torch.Size([32, 28, 128])

In [33]:
Q.transpose(-2,-1)

tensor([[[0.0909, 0.6132, 0.8577,  ..., 0.8918, 0.1224, 0.3543],
         [0.7814, 0.1341, 0.1922,  ..., 0.5481, 0.2141, 0.4154],
         [0.8447, 0.7150, 0.4266,  ..., 0.3919, 0.6560, 0.6337],
         ...,
         [0.2432, 0.4523, 0.3986,  ..., 0.9314, 0.6383, 0.5309],
         [0.4871, 0.3860, 0.2546,  ..., 0.0959, 0.9028, 0.9500],
         [0.4697, 0.7216, 0.0304,  ..., 0.7844, 0.2969, 0.1696]],

        [[0.9752, 0.4729, 0.5563,  ..., 0.9844, 0.1385, 0.8637],
         [0.8090, 0.1349, 0.0985,  ..., 0.7581, 0.6185, 0.0483],
         [0.0466, 0.5107, 0.6249,  ..., 0.0528, 0.6865, 0.6051],
         ...,
         [0.2516, 0.3157, 0.0164,  ..., 0.7983, 0.2466, 0.9173],
         [0.6927, 0.2813, 0.5569,  ..., 0.6450, 0.2963, 0.7299],
         [0.6675, 0.6297, 0.7989,  ..., 0.9333, 0.2770, 0.3121]],

        [[0.8325, 0.5633, 0.5766,  ..., 0.1106, 0.3868, 0.6065],
         [0.3926, 0.4704, 0.4585,  ..., 0.8406, 0.2138, 0.1335],
         [0.1487, 0.6410, 0.3817,  ..., 0.7092, 0.0149, 0.

# Positional encoding

In [36]:
batch = [
    ["I", "love", "deep", "learning", "<PAD>"],  # Sentence 1 (padded to length 5)
    ["Transformers", "are", "powerful", "<PAD>", "<PAD>"],  # Sentence 2
    ["PyTorch", "is", "amazing", "<PAD>", "<PAD>"]  # Sentence 3
]


In [39]:
torch.tensor(batch)

ValueError: too many dimensions 'str'

In [40]:
batch = torch.tensor([
    [1, 2, 3, 4, 0],  # Sentence 1
    [5, 6, 7, 0, 0],  # Sentence 2
    [8, 9, 10, 0, 0]  # Sentence 3
])

In [41]:
batch_size, seq_len = batch.shape

In [53]:
torch.arange(seq_len) + 1

tensor([1, 2, 3, 4, 5])

In [47]:
[torch.arange(seq_len).squeeze() for i in range(batch_size)]

[tensor([0, 1, 2, 3, 4]), tensor([0, 1, 2, 3, 4]), tensor([0, 1, 2, 3, 4])]

In [50]:
torch.arange(seq_len).unsqueeze(0).expand(batch_size, seq_len)


tensor([[0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4]])

In [54]:
torch.arange(seq_len).expand(batch_size, seq_len) + 1


tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])

In [55]:
torch.arange(seq_len).expand(batch_size, seq_len) + 1

tensor([[1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5]])

In [56]:
torch.arange(seq_len)

tensor([0, 1, 2, 3, 4])

In [61]:
torch.arange(seq_len).expand(3, seq_len)

tensor([[0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4]])

# Difference between permute and view

In [62]:
import torch

# Create a 3D tensor
tensor = torch.arange(24).reshape(2, 3, 4)  # Shape: (2, 3, 4)
print("Original Tensor:")
print(tensor)
print("Shape:", tensor.shape)

# Using .view to reshape (keeps original dimension order)
viewed_tensor = tensor.view(6, 4)  # Shape: (6, 4)
print("\nViewed Tensor (reshaped using .view):")
print(viewed_tensor)
print("Shape:", viewed_tensor.shape)
# NOTE: The dimension order (2, 3, 4) remains intact, just reshaped to (6, 4).

# Using .permute to change dimension order
permuted_tensor = tensor.permute(1, 2, 0)  # Shape: (3, 4, 2)
print("\nPermuted Tensor (dimensions reordered):")
print(permuted_tensor)
print("Shape:", permuted_tensor.shape)
# NOTE: The order of dimensions has been rearranged: (2, 3, 4) -> (3, 4, 2)

# Comparing the data layout
print("\nTensor Elements in Memory:")
print("Original Tensor Flattened:", tensor.flatten())
print("Viewed Tensor Flattened:", viewed_tensor.flatten())
print("Permuted Tensor Flattened:", permuted_tensor.flatten())
# NOTE: The .view tensor retains the same data order in memory,
# while .permute alters how the data is accessed (new dimension order).


Original Tensor:
tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
Shape: torch.Size([2, 3, 4])

Viewed Tensor (reshaped using .view):
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]])
Shape: torch.Size([6, 4])

Permuted Tensor (dimensions reordered):
tensor([[[ 0, 12],
         [ 1, 13],
         [ 2, 14],
         [ 3, 15]],

        [[ 4, 16],
         [ 5, 17],
         [ 6, 18],
         [ 7, 19]],

        [[ 8, 20],
         [ 9, 21],
         [10, 22],
         [11, 23]]])
Shape: torch.Size([3, 4, 2])

Tensor Elements in Memory:
Original Tensor Flattened: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23])
Viewed Tensor Flattened: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 1

In [64]:
tensor.view(12, 2, 1)

tensor([[[ 0],
         [ 1]],

        [[ 2],
         [ 3]],

        [[ 4],
         [ 5]],

        [[ 6],
         [ 7]],

        [[ 8],
         [ 9]],

        [[10],
         [11]],

        [[12],
         [13]],

        [[14],
         [15]],

        [[16],
         [17]],

        [[18],
         [19]],

        [[20],
         [21]],

        [[22],
         [23]]])