In [2]:
import torch
import torch.nn as nn

### PatchEmbedding

In [3]:
# Toy setup: a batch holding a single 8x8 RGB image, cut into 4x4 patches.
n_samples, in_channels = 1, 3   # batch size; channels of an RGB image
img_size, patch_size = 8, 4     # image side length; patch side length
embed_dim = 6                   # size of each patch's output embedding


In [4]:
# Create a dummy image tensor (batch_size=1, channels=3, height=8, width=8).
# Values are drawn from a standard normal distribution; no seed is set in the
# visible cells, so the numbers shown below will differ on every run.
x = torch.randn(n_samples, in_channels, img_size, img_size)
x

tensor([[[[-3.9691e-01,  1.1692e+00, -7.5482e-01,  1.4442e+00,  1.6648e+00,
            1.4124e+00, -1.3579e+00, -3.1046e-01],
          [ 4.0357e-01, -1.4947e+00,  2.2118e-01, -1.4521e-01, -1.7081e+00,
           -1.7768e-01, -8.0303e-01, -4.6541e-01],
          [-2.3015e-01, -1.6651e-01, -6.9584e-01,  1.7793e+00, -3.2247e-01,
           -1.7106e-01,  1.0860e+00, -8.9969e-01],
          [-1.3071e-01, -5.7178e-02, -6.1293e-01, -1.5832e-02, -2.7760e-01,
            8.5831e-01, -2.0073e+00,  7.9900e-01],
          [ 6.9629e-01, -6.6179e-01, -1.5444e+00,  2.1999e+00,  1.6338e+00,
           -6.5353e-02,  1.0792e+00,  5.6553e-01],
          [-7.6608e-01,  1.0479e+00, -1.9106e-01,  9.7363e-01,  8.7288e-01,
            1.2817e-01,  1.1497e+00,  2.5757e+00],
          [ 1.9283e-01, -9.0032e-01,  9.7503e-01,  8.6702e-01, -2.5543e+00,
           -1.5097e+00,  8.7910e-01, -7.5360e-02],
          [-3.8971e-01, -6.9738e-02,  1.3826e+00, -1.6055e+00,  1.1987e+00,
            1.6613e-01, -1.0631e+00

In [5]:
# Patch embedding as a strided convolution: because kernel_size == stride ==
# patch_size, the kernel visits each patch exactly once (no overlap) and maps
# it to an embed_dim-channel vector.
patch_embedding = nn.Conv2d(
    in_channels=in_channels,
    out_channels=embed_dim,
    kernel_size=patch_size,
    stride=patch_size,
)
patch_embedding

Conv2d(3, 6, kernel_size=(4, 4), stride=(4, 4))

In [6]:
# Project the image: each non-overlapping 4x4 patch becomes one 6-dim vector,
# giving (batch, embed_dim, H / patch_size, W / patch_size) = (1, 6, 2, 2).
x_proj = patch_embedding(x)
x_proj.shape

torch.Size([1, 6, 2, 2])

In [7]:
# Show the projection itself: six 2x2 maps, one per embedding channel.
x_proj

tensor([[[[-0.0823, -0.0094],
          [ 0.1171,  0.0246]],

         [[-0.2331,  0.0752],
          [ 0.4173,  0.0734]],

         [[-0.3296, -0.2042],
          [-0.0456, -0.3677]],

         [[-1.2090,  0.5632],
          [ 0.5150,  0.6201]],

         [[-0.0713,  0.0829],
          [-0.2245, -0.2623]],

         [[ 0.0262,  0.7794],
          [ 0.2296, -0.7697]]]], grad_fn=<ConvolutionBackward0>)

In [8]:
# flatten(2) merges dimensions 2 and 3 (the 2x2 patch grid) into a single
# dimension of 4 patches: (1, 6, 2, 2) -> (1, 6, 4).
x_flatten = x_proj.flatten(2)
x_flatten.shape

torch.Size([1, 6, 4])

In [9]:
# Swap the last two axes so each row is one patch's embedding:
# (1, 6, 4) -> (1, 4, 6), i.e. (batch, n_patches, embed_dim).
# The result is only displayed here, not assigned.
x_flatten.transpose(1, 2)

tensor([[[-0.0823, -0.2331, -0.3296, -1.2090, -0.0713,  0.0262],
         [-0.0094,  0.0752, -0.2042,  0.5632,  0.0829,  0.7794],
         [ 0.1171,  0.4173, -0.0456,  0.5150, -0.2245,  0.2296],
         [ 0.0246,  0.0734, -0.3677,  0.6201, -0.2623, -0.7697]]],
       grad_fn=<TransposeBackward0>)

## LayerNorm

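In PyTorch: `nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True)`. Each sample is normalized to zero mean and unit variance over its last `normalized_shape` dimensions; `eps` is added to the variance for numerical stability, and `elementwise_affine` controls whether a learnable scale and shift are applied afterwards.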

In [10]:
# Three samples (rows) with two features each.
# NOTE: the name `input` shadows Python's builtin input() for the rest of the
# notebook; it is left unchanged because the following cells refer to it.
input = torch.tensor([[0, 4.], [-1, 7], [3, 5]])
input

tensor([[ 0.,  4.],
        [-1.,  7.],
        [ 3.,  5.]])

In [11]:
# Unpack (rows, columns) = (3, 2). This rebinds n_samples, which was 1 in the
# patch-embedding section above.
n_samples, n_features = input.shape

In [12]:
# Normalize over the last dimension (of size n_features).
# elementwise_affine=False: the layer has no learnable weight/bias parameters.
layernorm = nn.LayerNorm(n_features, elementwise_affine=False)

In [13]:
# Count the trainable parameters of `layernorm`: sum the element counts of
# every parameter that requires gradients (0 here, since the layer is non-affine).
sum(p.numel() for p in layernorm.parameters() if p.requires_grad)

0

In [14]:
# Without the affine transform, both attributes are None.
layernorm.weight, layernorm.bias

(None, None)

In [15]:
# Per-sample mean, taken over the last (feature) dimension.
input.mean(-1)

tensor([2., 3., 4.])

In [16]:
"""
If unbiased=False, the standard deviation is computed using N (population standard deviation).
If unbiased=True, the standard deviation is computed using N-1 (sample standard deviation, also called Bessel’s correction).

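When estimating spread from a sample, dividing by N-1 instead of N (Bessel's correction) makes the variance estimate unbiased; its square root, the standard deviation, is still slightly biased, but less so than with N.
The difference matters most for small sample sizes and becomes negligible as N grows.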

When to Use Each?
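Use unbiased=True (default) when the data is a sample and you want to estimate the spread of the underlying population.
Use unbiased=False when the data is the full population (population statistics); this is also the estimator nn.LayerNorm uses internally, which is why it is used below.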
"""
# Population standard deviation (divide by N) of each row, taken over the last
# dimension. unbiased=False matches the biased estimator that nn.LayerNorm
# normalizes with, so these are the values the layer divides by (up to eps).
input.std(-1, unbiased=False)

tensor([2., 4., 1.])

In [17]:
# Apply layer normalization, then check the per-row statistics over the last
# dimension: every row should now have mean 0 and (population) std of about 1.
layernorm(input).mean(-1), layernorm(input).std(-1, unbiased=False)

(tensor([0., 0., 0.]), tensor([1.0000, 1.0000, 1.0000]))

In [18]:
# Same layer, but with a learnable per-feature scale (weight) and shift (bias).
layernorm2 = nn.LayerNorm(n_features, elementwise_affine=True)

In [19]:
# Count the trainable parameters of `layernorm2`:
# n_features weights + n_features biases = 2 + 2 = 4.
sum(p.numel() for p in layernorm2.parameters() if p.requires_grad)

4

In [20]:
# Initial values: weight is all ones and bias all zeros, so before any training
# the affine step leaves the normalized values unchanged.
layernorm2.weight, layernorm2.bias

(Parameter containing:
 tensor([1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0.], requires_grad=True))

In [21]:
# Same statistics as the non-affine layer (mean 0, std of about 1). Both results
# now carry a grad_fn: layernorm2's weight and bias require gradients, so
# autograd tracks the computation.
layernorm2(input).mean(-1), layernorm2(input).std(-1, unbiased=False)

(tensor([0., 0., 0.], grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))

# CLS token

In [22]:
# Sizes for the CLS-token walkthrough.
# NOTE: embed_dim is rebound here (it was 6 in the patch-embedding section).
batch_size, n_patches = 4, 10   # images per batch; patches per image
embed_dim = 8                   # embedding dimension of each patch/token


In [23]:
# Simulate patch embeddings for a batch of images.
# Shape: (batch_size, n_patches, embed_dim)
patch_embeddings = torch.randn(batch_size, n_patches, embed_dim)
print("Patch embeddings shape:", patch_embeddings.shape)  # (4, 10, 8)

Patch embeddings shape: torch.Size([4, 10, 8])


In [24]:
# Display the raw values: 4 images x 10 patches x 8 embedding features.
patch_embeddings

tensor([[[ 1.9474e+00,  5.3887e-01,  1.3133e+00, -2.7690e+00,  9.5592e-01,
           4.3726e-01,  5.3443e-01, -1.1833e+00],
         [ 9.7502e-01,  5.1019e-01,  8.5312e-01, -1.3665e+00,  1.5091e+00,
           3.9398e-02, -4.6469e-02, -1.4468e+00],
         [ 1.0401e+00,  7.1583e-01,  3.8244e-01,  2.2281e-01, -6.0818e-01,
           2.5477e-01, -6.4580e-01, -2.5676e-01],
         [ 4.9179e-01,  8.5362e-01, -2.4100e+00, -1.7105e+00, -3.3873e-01,
           1.0078e+00, -2.0807e-01,  7.6522e-01],
         [-2.7206e-01,  6.1063e-02, -4.2747e-01,  5.7891e-02,  1.9277e+00,
           8.3726e-01,  3.9668e-02, -9.4248e-01],
         [ 9.0130e-01, -3.4441e-01, -2.1394e+00,  1.3755e-01, -1.5094e+00,
           2.2980e+00, -3.9261e-01, -5.2776e-01],
         [ 1.6774e-01,  2.0639e-01,  6.8004e-01, -4.5908e-01, -1.3388e+00,
           5.9610e-01, -4.7307e-01, -3.0141e+00],
         [ 2.4464e-01,  1.0840e+00, -5.7899e-01, -1.3965e+00, -6.8272e-01,
           9.1381e-01, -4.6850e-01,  2.1077e-01],


In [25]:
# Initialize the learnable classification token (cls_token).
# Its shape is (1, 1, embed_dim):
#   1: batch placeholder (expanded to batch_size before concatenation)
#   1: one token (the classification token itself)
#   embed_dim: the token's embedding dimension
cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
print("cls_token shape:", cls_token.shape)  # (1, 1, 8)


cls_token shape: torch.Size([1, 1, 8])


In [26]:
# The single classification token: zero-initialized, with requires_grad=True.
cls_token

Parameter containing:
tensor([[[0., 0., 0., 0., 0., 0., 0., 0.]]], requires_grad=True)

In [27]:
# unsqueeze(3) inserts a size-1 axis at index 3: (1, 1, 8) -> (1, 1, 8, 1).
# Shape experiment only; cls_token itself is not modified.
cls_token.unsqueeze(3).shape

torch.Size([1, 1, 8, 1])

In [28]:
# To prepend the cls_token to every image in the batch, expand it along the
# batch dimension. expand() copies no data: it returns a view in which every
# batch entry refers to the same underlying parameter.
# -1 means "keep the original size for that dimension".
expanded_cls_token = cls_token.expand(batch_size, -1, -1)
print("Expanded cls_token shape:", expanded_cls_token.shape)  # (4, 1, 8)

Expanded cls_token shape: torch.Size([4, 1, 8])


In [29]:
# Passing more sizes than the tensor has dimensions adds new leading
# dimensions: (1, 1, 8) -> (4, 1, 1, 8). Shape experiment only.
cls_token.expand(batch_size, -1, -1, -1).shape

torch.Size([4, 1, 1, 8])

In [30]:
# Concatenate the cls_token with the patch embeddings along the token dimension (dim=1).
# The resulting tensor shape is (batch_size, n_patches + 1, embed_dim).
#
# The cls_token comes first in the list, so every image's sequence becomes
# [cls, patch_1, ..., patch_n]: n_patches + 1 tokens, each of size embed_dim.
tokens = torch.cat([expanded_cls_token, patch_embeddings], dim=1)
print("Tokens shape after concatenation:", tokens.shape)  # (4, 11, 8)

Tokens shape after concatenation: torch.Size([4, 11, 8])


In [31]:
# Next, create the learnable positional embeddings: one embed_dim-sized vector
# per token position (the cls token included).
# Note: this cell only initializes the parameter; it does not add it to `tokens`.
# The embeddings have no batch dimension, so they are shared across the batch.
n_positions = n_patches + 1  # number of tokens
positional_embeddings = nn.Parameter(torch.randn(n_positions, embed_dim))
print("Positional embeddings shape:", positional_embeddings.shape)  # (11, 8)

Positional embeddings shape: torch.Size([11, 8])


# torch.rand() and torch.randn()

torch.rand()
- Generates random numbers from a uniform distribution between 0 and 1.
- Every number in the range [0, 1) has an equal probability of being sampled.
- Useful when you need random values bounded within a fixed range.

In [32]:
# Two samples from the uniform distribution on [0, 1), returned with shape (1, 2).
torch.rand((1, 2))

tensor([[0.4325, 0.4274]])

torch.randn()
- Generates random numbers from a normal (Gaussian) distribution with:
    - Mean = 0
    - Standard deviation = 1
- Values are centered around 0, with both positive and negative values.
- Useful for initializing weights in neural networks and sampling from normal distributions.

In [33]:
# Two samples from the standard normal distribution, returned with shape (1, 2).
torch.randn((1, 2))

tensor([[-0.0177, -0.0786]])