In [3]:
import torch
import torch.nn as nn

### PatchEmbedding

In [4]:
# Toy configuration: a batch holding a single 8x8 RGB image, cut into 4x4 patches.
n_samples = 1    # number of images in the batch
in_channels = 3  # RGB image
img_size = 8     # height and width of the (square) image, in pixels
patch_size = 4   # height and width of each (square) patch, in pixels
embed_dim = 6    # output embedding dimension of each patch

# A strided convolution without padding silently ignores leftover border pixels,
# so make sure the patches tile the image exactly.
assert img_size % patch_size == 0, "img_size must be divisible by patch_size"


In [5]:
# Create a dummy image tensor of shape (n_samples, in_channels, img_size, img_size),
# i.e. (1, 3, 8, 8), filled with samples from a standard normal distribution.
# NOTE(review): no seed is set in the visible cells, so the values presumably
# differ on every run.
x = torch.randn(n_samples, in_channels, img_size, img_size)
x

tensor([[[[-0.2633, -1.8969,  1.4743, -0.3435, -0.2999, -1.3671,  0.0964,
            0.4794],
          [-0.3527,  0.2987, -2.4611,  1.0021,  0.8559, -0.1230, -1.3567,
           -0.0323],
          [-0.2216,  2.0662, -2.3393, -0.4756,  0.9114, -0.7342, -0.9184,
            0.4454],
          [-0.7126, -0.3743,  1.1080,  0.1548, -0.0492,  0.2147,  0.6575,
           -0.1207],
          [-1.7058, -0.2490, -0.9394, -0.2000,  1.0010, -0.1594,  1.5799,
            0.1848],
          [ 0.0182, -0.1107, -0.2300, -0.5372,  0.4597, -0.9512,  2.5565,
           -0.1111],
          [ 1.3373, -0.9067,  0.9606, -1.1488,  0.7076,  0.2331,  0.9773,
           -0.7673],
          [-0.3334, -0.6814, -0.9837, -0.3038, -1.0723,  1.3645, -1.9279,
           -2.3701]],

         [[ 2.1932,  0.9822,  1.3177, -1.6895,  0.4818,  1.3684,  0.6193,
           -1.0710],
          [ 0.2484,  1.4562,  1.3519, -1.2914,  0.8432,  1.6109, -1.9271,
           -1.0391],
          [-0.5147,  0.1961, -0.5115, -0.6465, -

In [6]:
# Patch embedding as a convolution: because the kernel size equals the stride,
# each non-overlapping patch_size x patch_size patch is projected to embed_dim values.
patch_embedding = nn.Conv2d(
    in_channels=in_channels,
    out_channels=embed_dim,
    kernel_size=patch_size,
    stride=patch_size,
)
patch_embedding

Conv2d(3, 6, kernel_size=(4, 4), stride=(4, 4))

In [7]:
# Project the image: 8 / 4 = 2 patches per side, so the output has shape
# (n_samples, embed_dim, 2, 2) -- one embed_dim-vector per patch.
x_proj = patch_embedding(x)
x_proj.shape

torch.Size([1, 6, 2, 2])

In [8]:
# Inspect the projected values: for each of the 6 embedding channels, a 2x2 grid
# holding one value per patch.
x_proj

tensor([[[[ 1.1967, -0.4240],
          [ 0.2409,  0.3412]],

         [[ 0.2370, -0.0734],
          [ 0.2239, -0.7524]],

         [[ 1.0445,  0.0903],
          [-0.2994, -0.4388]],

         [[ 0.4680, -0.8679],
          [ 0.3407, -0.9931]],

         [[ 0.8672,  0.6471],
          [-0.0216,  0.4335]],

         [[ 0.8862, -0.2435],
          [-0.8852,  0.4248]]]], grad_fn=<ConvolutionBackward0>)

In [9]:
# Merge the two spatial dimensions (dims 2 and 3) into a single patch dimension:
# (1, 6, 2, 2) -> (1, 6, 4)
x_flatten = x_proj.flatten(2)
x_flatten.shape

torch.Size([1, 6, 4])

In [10]:
# Swap the channel and patch dimensions so that each row is one patch embedding:
# (n_samples, embed_dim, n_patches) -> (n_samples, n_patches, embed_dim)
x_flatten.transpose(1, 2)

tensor([[[ 1.1967,  0.2370,  1.0445,  0.4680,  0.8672,  0.8862],
         [-0.4240, -0.0734,  0.0903, -0.8679,  0.6471, -0.2435],
         [ 0.2409,  0.2239, -0.2994,  0.3407, -0.0216, -0.8852],
         [ 0.3412, -0.7524, -0.4388, -0.9931,  0.4335,  0.4248]]],
       grad_fn=<TransposeBackward0>)

## LayerNorm

In PyTorch: `nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True)`

In [11]:
# Toy batch of 3 samples with 2 features each.
# NOTE(review): the name `input` shadows Python's built-in `input()`; renaming it
# would require updating every later cell that uses it.
input = torch.tensor([[0, 4.], [-1, 7], [3, 5]])
input

tensor([[ 0.,  4.],
        [-1.,  7.],
        [ 3.,  5.]])

In [12]:
# Unpack the shape: 3 samples, 2 features. Note that this rebinds `n_samples`,
# which was 1 in the patch-embedding section.
n_samples, n_features = input.shape

In [13]:
# Normalize over the last dimension (of size n_features).
# elementwise_affine=False: no learnable scale (weight) or shift (bias).
layernorm = nn.LayerNorm(n_features, elementwise_affine=False)

In [14]:
# Count the trainable parameters of layernorm: the sum of element counts over all
# parameters that require gradients (none here, since the affine transform is off).
sum(p.numel() for p in layernorm.parameters() if p.requires_grad)

0

In [15]:
# With elementwise_affine=False the layer holds no weight or bias, so both are None.
layernorm.weight, layernorm.bias

(None, None)

In [16]:
# Mean over the last dimension, i.e. one mean per sample (row).
input.mean(-1)

tensor([2., 3., 4.])

In [17]:
"""
If unbiased=False, the standard deviation is computed using N (population standard deviation).
If unbiased=True, the standard deviation is computed using N-1 (sample standard deviation, also called Bessel’s correction).

When calculating the standard deviation of a sample, dividing by N-1 corrects the bias in estimating the population standard deviation.
This is useful in statistics when working with small sample sizes.

When to Use Each?
Use unbiased=True (default) when working with samples and need an unbiased estimator of population std.
Use unbiased=False when working with the full dataset (population statistics)
"""
input.std(-1, unbiased=False)

tensor([2., 4., 1.])

In [18]:
# Apply Layer Normalization to the input, then check that every sample now has
# mean 0 and (population) standard deviation 1 along the last dimension.
layernorm(input).mean(-1), layernorm(input).std(-1, unbiased=False)

(tensor([0., 0., 0.]), tensor([1.0000, 1.0000, 1.0000]))

In [19]:
# elementwise_affine=True (the default): adds a learnable per-feature scale
# (weight) and shift (bias).
layernorm2 = nn.LayerNorm(n_features, elementwise_affine=True)

In [20]:
# Count the trainable parameters of layernorm2:
# weight (n_features) + bias (n_features) = 2 + 2 = 4.
sum(p.numel() for p in layernorm2.parameters() if p.requires_grad)

4

In [21]:
# The weight starts as ones and the bias as zeros, so before any training the
# layer behaves like plain normalization.
layernorm2.weight, layernorm2.bias

(Parameter containing:
 tensor([1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0.], requires_grad=True))

In [22]:
# layernorm2 has learnable parameters, so both results carry a grad_fn: they are
# part of PyTorch's autograd computational graph. With the initial weight=1 and
# bias=0, the statistics match the non-affine layer (mean 0, std 1).
layernorm2(input).mean(-1), layernorm2(input).std(-1, unbiased=False)

(tensor([0., 0., 0.], grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))

# CLS token

In [23]:
# Parameters for the CLS-token demo.
# Note: `embed_dim` is rebound here; it was 6 in the patch-embedding section.
batch_size = 4      # number of images in a batch
n_patches = 10      # number of patches per image
embed_dim = 8       # embedding dimension for each patch/token


In [24]:
# Simulate patch embeddings for a batch of images with random values.
# Shape: (batch_size, n_patches, embed_dim)
patch_embeddings = torch.randn(batch_size, n_patches, embed_dim)
print("Patch embeddings shape:", patch_embeddings.shape)  # (4, 10, 8)

Patch embeddings shape: torch.Size([4, 10, 8])


In [25]:
# Inspect the raw simulated values.
# NOTE(review): no seed is set in the visible cells, so these presumably differ
# on every run.
patch_embeddings

tensor([[[ 1.6950e+00, -7.2475e-01,  6.8917e-01, -1.2845e+00,  1.0524e+00,
           7.2389e-01,  5.6165e-01, -9.3830e-01],
         [-9.1075e-01, -8.0083e-01,  2.2623e+00,  3.3333e-02, -9.1884e-02,
           1.4499e+00,  3.6943e-01,  1.1622e+00],
         [-1.4999e+00,  4.4801e-01, -1.3272e+00, -8.3523e-01, -2.6751e-01,
           7.2506e-01, -1.3675e+00,  1.8649e-01],
         [ 1.1008e+00,  2.8005e-01, -1.6325e-01,  1.5789e+00,  5.5332e-01,
          -1.9090e-01, -1.3704e+00, -1.1637e+00],
         [ 8.9358e-01, -2.4815e-01,  7.7868e-01,  4.9711e-01, -7.8787e-01,
          -1.0976e+00, -8.3230e-01, -3.2411e-01],
         [-1.0501e+00,  4.1691e-04,  2.0845e+00, -4.0261e-01,  1.5550e+00,
           2.0501e+00,  3.8149e-01, -9.6061e-01],
         [-1.0522e+00,  8.1602e-01, -9.9924e-01, -2.4195e+00,  3.5525e-01,
          -6.8756e-02,  4.3238e-02,  2.4476e+00],
         [-1.1990e-01,  5.2804e-02, -2.7216e-01, -1.0275e+00, -3.4242e-01,
          -3.5289e-01,  3.7096e-01, -4.1317e-01],


In [26]:
# Initialize the learnable classification token (cls_token).
# It is defined with shape (1, 1, embed_dim), meaning:
#   1: placeholder batch dimension (expanded to the batch size before use)
#   1: one token (the classification token itself)
#   embed_dim: the token's embedding dimension
# It starts out as all zeros.
cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
print("cls_token shape:", cls_token.shape)  # (1, 1, 8)


cls_token shape: torch.Size([1, 1, 8])


In [27]:
# One token (the classification token itself): embed_dim zeros, with
# requires_grad=True because it is wrapped in nn.Parameter.
cls_token

Parameter containing:
tensor([[[0., 0., 0., 0., 0., 0., 0., 0.]]], requires_grad=True)

In [56]:
# Side experiment: unsqueeze(3) inserts a new size-1 dimension at index 3,
# (1, 1, 8) -> (1, 1, 8, 1). The result is not stored.
cls_token.unsqueeze(3).shape

torch.Size([1, 1, 8, 1])

In [48]:
# To prepend the cls_token to every image in the batch, expand it along the
# batch dimension: (1, 1, embed_dim) -> (batch_size, 1, embed_dim).
# expand() does not create new data; every batch item views the same parameter.
# -1 means "keep the original size for that dimension".
expanded_cls_token = cls_token.expand(batch_size, -1, -1)
print("Expanded cls_token shape:", expanded_cls_token.shape)  # (4, 1, 8)

Expanded cls_token shape: torch.Size([4, 1, 8])


In [49]:
# Side experiment: passing more sizes than the tensor has dimensions adds new
# leading dimensions, (1, 1, 8) -> (4, 1, 1, 8). The result is not stored.
cls_token.expand(batch_size, -1, -1, -1).shape

torch.Size([4, 1, 1, 8])

In [29]:
# Concatenate the cls_token with the patch embeddings along the token dimension (dim=1).
# The resulting tensor shape will be (batch_size, n_patches + 1, embed_dim).
#
# The cls_token is prepended as one extra token in front of every image's patches,
# so each image now has n_patches + 1 tokens; like every other token, the
# cls_token has embed_dim features.
tokens = torch.cat([expanded_cls_token, patch_embeddings], dim=1)
print("Tokens shape after concatenation:", tokens.shape)  # (4, 11, 8)

Tokens shape after concatenation: torch.Size([4, 11, 8])


In [30]:
# Next, create the learnable positional embeddings that are meant to be added to
# the tokens: one embed_dim-vector per token position (cls_token + patches).
# Note: there is no batch dimension, so the same positional embeddings are
# shared across the batch.
# NOTE(review): the addition itself (tokens + positional_embeddings) is not
# performed in the cells visible here.
n_positions = n_patches + 1  # number of tokens
positional_embeddings = nn.Parameter(torch.randn(n_positions, embed_dim))
print("Positional embeddings shape:", positional_embeddings.shape)  # (11, 8)

Positional embeddings shape: torch.Size([11, 8])


# torch.rand() and torch.randn()

torch.rand()
- Generates random numbers from a uniform distribution between 0 and 1.
- Every number in the range [0, 1) has an equal probability of being sampled.
- Useful when you need random values bounded within a fixed range.

In [37]:
# Two uniform samples from [0, 1), returned as a 1x2 tensor.
torch.rand((1, 2))

tensor([[0.2029, 0.4354]])

torch.randn()
- Generates random numbers from a normal (Gaussian) distribution with:
    - Mean = 0
    - Standard deviation = 1
- Values are centered around 0, with both positive and negative values.
- Useful for initializing weights in neural networks and sampling from normal distributions.

In [35]:
# Two standard-normal samples (mean 0, std 1), returned as a 1x2 tensor.
torch.randn((1, 2))

tensor([[-1.0208,  0.0275]])