In [24]:
import torch
import torch.nn as nn

## PatchEmbedding

In [25]:
# Toy configuration: a batch holding one 8x8 RGB image, cut into 4x4 patches.
n_samples, in_channels = 1, 3   # batch size, colour channels (R, G, B)
img_size, patch_size = 8, 4     # image side length, patch side length (pixels)
embed_dim = 6                   # length of the embedding produced for each patch


In [26]:
# Seed PyTorch's global RNG so the random image below (and any randomly
# initialised layer created afterwards) is identical on every
# Restart Kernel -> Run All.
torch.manual_seed(0)

# Dummy image batch of shape (n_samples, in_channels, img_size, img_size),
# i.e. (1, 3, 8, 8) with the toy configuration above.
x = torch.randn(n_samples, in_channels, img_size, img_size)
x

tensor([[[[-0.2316,  0.2791,  0.6735,  1.4968,  0.0726,  0.2044, -2.5776,
            1.4435],
          [ 0.3110,  0.9913,  0.2106, -0.0137, -1.6977, -0.7567,  1.0226,
            0.4551],
          [ 0.8429,  0.5899, -0.5186, -0.5533,  0.0640, -0.9399,  1.0161,
            0.5079],
          [-0.9939,  0.8087, -0.5042, -0.7606,  0.5438,  0.7888,  0.7569,
           -1.1863],
          [-0.7801,  0.3140, -0.3803, -0.3533, -0.7400, -0.1064,  0.4969,
           -0.5699],
          [-0.1939, -1.0817,  0.2256, -0.9048,  0.7253, -0.4405,  0.0091,
           -0.0549],
          [-0.5883, -0.2510,  0.6320,  0.0982, -1.4043,  0.3159,  0.2555,
            0.2755],
          [ 0.6407, -0.6237, -2.3314, -0.0574, -0.1351, -0.8191,  0.5546,
           -0.7069]],

         [[-0.0329,  0.7785, -0.6229,  0.5435, -1.8386, -1.9724,  0.4170,
            1.2891],
          [ 1.4359, -0.6483, -0.7035,  0.3878,  1.3952,  1.5196, -0.2788,
           -2.0343],
          [-0.1489, -0.5094,  2.2068,  1.3553, -

In [27]:
# Patch embedding as a strided convolution: kernel_size == stride == patch_size,
# so the kernel lands on each non-overlapping patch_size x patch_size patch
# exactly once and maps it to embed_dim output channels.
patch_embedding = nn.Conv2d(
    in_channels=in_channels,
    out_channels=embed_dim,
    kernel_size=patch_size,
    stride=patch_size,
)
patch_embedding

Conv2d(3, 6, kernel_size=(4, 4), stride=(4, 4))

In [28]:
# Project the image with the patch-embedding convolution. The 8x8 image and
# 4x4 patches give a 2x2 grid of patches, each described by embed_dim channels.
x_proj = patch_embedding(x)
x_proj.shape

torch.Size([1, 6, 2, 2])

In [29]:
# Inspect the projected values: 6 embedding channels, each a 2x2 map with one entry per patch.
x_proj

tensor([[[[ 0.8924,  0.3836],
          [ 0.5082,  0.2890]],

         [[ 0.5248,  0.5434],
          [-0.5994,  0.9152]],

         [[ 0.0940, -0.4738],
          [ 0.2473,  0.6958]],

         [[-0.0423, -0.1050],
          [ 0.3391, -0.3166]],

         [[ 0.3676,  0.1056],
          [-0.5193, -0.1797]],

         [[ 0.1127,  0.1437],
          [ 0.1190, -0.2127]]]], grad_fn=<ConvolutionBackward0>)

In [30]:
# Collapse the two spatial axes (dims 2 and 3) into a single patch axis:
# (1, 6, 2, 2) -> (1, 6, 4).
x_flatten = x_proj.flatten(start_dim=2)
x_flatten.shape

torch.Size([1, 6, 4])

In [31]:
# Swap the channel and patch axes, (1, 6, 4) -> (1, 4, 6), so that each row is
# the embedding vector of one patch.
torch.transpose(x_flatten, 1, 2)

tensor([[[ 0.8924,  0.5248,  0.0940, -0.0423,  0.3676,  0.1127],
         [ 0.3836,  0.5434, -0.4738, -0.1050,  0.1056,  0.1437],
         [ 0.5082, -0.5994,  0.2473,  0.3391, -0.5193,  0.1190],
         [ 0.2890,  0.9152,  0.6958, -0.3166, -0.1797, -0.2127]]],
       grad_fn=<TransposeBackward0>)

## LayerNorm

In PyTorch: `nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True)`

In [32]:
# Toy batch for the LayerNorm walkthrough: 3 samples (rows) x 2 features (columns).
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the notebook; it is kept because the following cells refer to it.
input = torch.tensor(
    [
        [0.0, 4.0],
        [-1.0, 7.0],
        [3.0, 5.0],
    ]
)
input

tensor([[ 0.,  4.],
        [-1.,  7.],
        [ 3.,  5.]])

In [41]:
# Unpack the 2-D shape of the toy batch: 3 samples (rows), 2 features (columns).
# NOTE: this rebinds n_samples, which the PatchEmbedding section had set to 1.
n_samples, n_features = input.shape

In [42]:
# elementwise_affine=False switches off the learnable scale and shift, leaving
# a layer that only normalises and has no parameters.
layernorm = nn.LayerNorm(normalized_shape=n_features, elementwise_affine=False)

In [None]:
# Count the trainable scalars held by `layernorm`: add up the element count of
# every parameter tensor that requires gradients (0 here, the layer has none).
sum(param.numel() for param in layernorm.parameters() if param.requires_grad)

0

In [52]:
# With elementwise_affine=False the layer has no scale/shift, so both attributes are None.
layernorm.weight, layernorm.bias

(None, None)

In [44]:
# Per-row mean: reduce over the last dimension, i.e. across the features of each sample.
input.mean(dim=-1)

tensor([2., 3., 4.])

In [45]:
# Population vs. sample standard deviation
# ----------------------------------------
# unbiased=False -> divide by N      (population standard deviation)
# unbiased=True  -> divide by N - 1  (sample standard deviation, also called
#                                     Bessel's correction); this is the default
#
# Dividing by N - 1 removes the bias of the *variance* estimate when the data
# are a sample drawn from a larger population; the effect matters most for
# small sample sizes.
#
# When to use each:
#   * unbiased=True  - the values are a sample and you want to estimate the
#                      spread of the population they were drawn from.
#   * unbiased=False - the values are the full dataset (population statistics).
#                      This is the variant that matches LayerNorm, which
#                      normalises each row by that row's own statistics.
#
# (PyTorch >= 2.0 documents the same switch as correction=1 / correction=0.)
input.std(-1, unbiased=False)  # per-row population standard deviation

tensor([2., 4., 1.])

In [53]:
# Sanity check: after LayerNorm every row should have mean 0 and population
# standard deviation 1 (both statistics taken over the last dimension).
torch.mean(layernorm(input), dim=-1), torch.std(layernorm(input), dim=-1, unbiased=False)

(tensor([0., 0., 0.]), tensor([1.0000, 1.0000, 1.0000]))

In [48]:
# Same layer, this time with the learnable per-feature scale (weight) and shift (bias) enabled.
layernorm2 = nn.LayerNorm(normalized_shape=n_features, elementwise_affine=True)

In [None]:
# Count the trainable scalars held by `layernorm2`: add up the element count of
# every parameter tensor that requires gradients (2 weights + 2 biases = 4).
sum(param.numel() for param in layernorm2.parameters() if param.requires_grad)

4

In [51]:
# The affine parameters start at weight = 1 and bias = 0, one value per feature.
layernorm2.weight, layernorm2.bias

(Parameter containing:
 tensor([1., 1.], requires_grad=True),
 Parameter containing:
 tensor([0., 0.], requires_grad=True))

In [None]:
# layernorm2 owns learnable weight and bias (requires_grad=True), so its output
# is tracked by autograd: both statistics below carry a grad_fn. With the
# initial weight = 1 and bias = 0 the values are still mean 0 and std 1.
torch.mean(layernorm2(input), dim=-1), torch.std(layernorm2(input), dim=-1, unbiased=False)

(tensor([0., 0., 0.], grad_fn=<MeanBackward1>),
 tensor([1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>))