In [1]:
import torch
import torch.nn as nn

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
class DummyTransformerBlock(nn.Module):
    """Stand-in transformer block that passes its input through unchanged."""

    def __init__(self, cfg):
        # `cfg` is accepted but not used by this placeholder; no submodules
        # or parameters are created.
        super().__init__()

    def forward(self, x):
        """Return ``x`` as-is (identity placeholder)."""
        return x

In [3]:
# Seed the RNG first: both the random batch and the Linear layer's initial
# weights are drawn from it, in that order, so the printed values are reproducible.
torch.manual_seed(123)

# Two example rows with five features each.
batch_example = torch.randn((2, 5))

# Linear projection from 5 to 6 features followed by a ReLU.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


Before applying layer normalization to these outputs, let's examine the mean and variance of each row.

In [6]:
# Per-row statistics over the last dimension; keepdim=True keeps a trailing
# dimension of size 1 so the results broadcast against `out`.
mean = torch.mean(out, dim=-1, keepdim=True)
# No `unbiased` argument is passed, so torch's default variance estimator is used.
var = torch.var(out, dim=-1, keepdim=True)
print('Mean\n',mean)
print('Var\n',var)

Mean
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Var
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


Note: using `keepdim=True` in operations like mean or variance ensures that the output tensor retains the same number of dimensions as the input tensor, even though the operation reduces the tensor along the dimension specified via `dim`.

In [7]:
# Recompute the per-row statistics from `out` under their own names instead of
# reading the `mean`/`var` globals: this cell rebinds `mean` and `var` below,
# so relying on them would make a second run of the cell normalize with the
# statistics of the already-normalized tensor.
row_mean = out.mean(dim=-1,keepdim=True)
row_var = out.var(dim=-1,keepdim=True)
out_norm = (out-row_mean)/torch.sqrt(row_var)

# Statistics of the normalized outputs (expected: mean ~0, variance ~1 per row).
mean = out_norm.mean(dim=-1,keepdim=True)
var = out_norm.var(dim=-1,keepdim=True)
print("Normalized layer outputs:\n",out_norm)
print("Mean\n",mean)
print("Variance\n",var)

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean
 tensor([[9.9341e-09],
        [0.0000e+00]], grad_fn=<MeanBackward1>)
Variance
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


To improve readability, we can also turn off scientific notation when printing tensor values by setting `sci_mode=False`.

In [8]:
# Print small values as plain decimals instead of scientific notation.
# This changes torch's global print options for the rest of the session.
torch.set_printoptions(sci_mode=False)
for label, stat in (('Mean:\n', mean), ('Variance:\n', var)):
    print(label, stat)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [9]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; ``scale`` and
            ``shift`` each hold one trainable value per position along it.
        eps: Small constant added to the variance before taking the square
            root, so the division stays finite when a row has zero variance.
            Defaults to ``1e-5``.
    """

    def __init__(self,emb_dim,eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialized to ones/zeros so the layer starts out as plain normalization.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self,x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        mean = x.mean(dim=-1,keepdim=True)
        # unbiased=False divides by n rather than n-1 (no Bessel's correction).
        var = x.var(dim=-1,keepdim=True,unbiased=False)
        norm_x = (x-mean)/torch.sqrt(var + self.eps)
        return self.scale*norm_x + self.shift
        

This specific implementation of layer normalization operates on the last dimension of the input tensor x, which represents the embedding dimension (`emb_dim`).
* We add a small constant epsilon (`eps`) to the variance to prevent division by zero during normalization.
* The scale and shift are two trainable parameters (of the same dimension as the input) that the LLM automatically adjusts during training if doing so would improve the model's performance on its training task.
* This allows the model to learn appropriate scaling and shifting that best suit the data it is processing.

Note on biased variance:
* In our variance calculation, we have made a deliberate implementation choice by setting `unbiased=False`.
* With this setting, we divide by the number of inputs n in the variance formula. This approach does not apply Bessel's correction, which uses n-1 instead of n in the denominator to adjust for bias in sample variance estimation.

Let's now put our new layer into practice.

In [10]:
# Build the layer for 5-feature rows and normalize the example batch.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)

# Per-row statistics of the result, using the divide-by-n variance
# (unbiased=False), the same estimator LayerNorm.forward uses.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, unbiased=False, keepdim=True)
print("Mean\n",mean)
print("Var:\n",var)

Mean
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Var:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
