In [17]:
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math

In [4]:
# Toy problem sizes: a single sequence of four positions, 512 features each.
batch_size, sequence_length = 1, 4
input_dim = d_model = 512
# Random input of shape (batch_size, sequence_length, input_dim).
x = torch.randn(batch_size, sequence_length, input_dim)

In [5]:
# x was created with shape (batch_size, sequence_length, input_dim).
x.size()

torch.Size([1, 4, 512])

In [6]:
# One fused projection emitting 3 * d_model features per position; the result
# is later split into queries, keys and values.
qkv_layer = nn.Linear(in_features=input_dim, out_features=3 * d_model)

In [10]:
# Project every position to its concatenated query/key/value features.
qkv = qkv_layer(x)
qkv.shape

torch.Size([1, 4, 1536])

In [13]:
num_heads = 8
head_dim = d_model // num_heads  # features per head for each of q, k and v
# Split the feature axis into heads, then move the head axis in front of the
# sequence axis: (batch, seq, heads, 3 * head_dim) -> (batch, heads, seq, 3 * head_dim).
qkv = qkv.reshape(batch_size, sequence_length, num_heads, 3 * head_dim).permute(0, 2, 1, 3)
qkv.shape

torch.Size([1, 8, 4, 192])

In [15]:
# Cut the last axis into three equal slices: queries, keys and values.
q, k, v = torch.chunk(qkv, chunks=3, dim=-1)
q.shape, k.shape, v.shape

(torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]),
 torch.Size([1, 8, 4, 64]))

In [18]:
d_k = q.shape[-1]
# Query/key dot products for every pair of positions, divided by sqrt(d_k).
scaled = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)
scaled.shape

torch.Size([1, 8, 4, 4])

In [21]:
# Additive mask: -inf strictly above the diagonal, 0 on and below it.
mask = torch.full(scaled.shape, float('-inf')).triu(diagonal=1)
mask[0][0]

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

In [33]:
# Masked scores for the first head of the first batch element; entries above
# the diagonal are -inf.
(scaled+mask)[0][0]

tensor([[ 0.5285,    -inf,    -inf,    -inf],
        [ 0.0594, -0.0691,    -inf,    -inf],
        [-0.4009,  0.2142,  0.2983,    -inf],
        [-0.2991, -0.1059,  0.0385, -0.3227]], grad_fn=<SelectBackward0>)

In [35]:
# Hand-check of the softmax for row 1 of the masked scores shown above, which
# has two finite entries (0.0594 and -0.0691).
np.exp(0.0594) / (np.exp(0.0594) + np.exp(-0.0691))

0.532080868200827

In [30]:
# Add the mask out-of-place: `scaled` keeps the raw scores, so this cell gives
# the same result however often (and in whatever order) it is re-run.
attention = F.softmax(scaled + mask, dim=-1)

In [31]:
# Softmax leaves the shape of the score tensor unchanged.
attention.shape

torch.Size([1, 8, 4, 4])

In [36]:
# Weighted sum of the value vectors, using the attention weights.
values = attention @ v
values.shape

torch.Size([1, 8, 4, 64])

In [37]:
def scaled_dot_product(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor; the size of its last dimension is used as d_k.
        k: Key tensor; its last two dimensions are swapped before it is
            matrix-multiplied with ``q``.
        v: Value tensor, matrix-multiplied by the attention weights.
        mask: Optional additive mask (0 to keep a score, ``-inf`` to block it)
            that is broadcast against the score tensor. ``None`` disables
            masking.

    Returns:
        A ``(values, attention)`` tuple: the attention-weighted values and the
        softmax-normalized attention weights.
    """
    d_k = q.shape[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    # Identity check: comparing a tensor with `!=` is not a reliable None test.
    if mask is not None:
        # Out-of-place add so a mask with extra leading dimensions can
        # broadcast the scores up instead of failing an in-place write.
        scaled = scaled + mask
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [40]:
# Same computation as the step-by-step cells, now through the helper.
values, attention = scaled_dot_product(q, k, v, mask=mask)
# Masked attention weights for the first head of the first batch element.
attention[0][0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5321, 0.4679, 0.0000, 0.0000],
        [0.2057, 0.3805, 0.4138, 0.0000],
        [0.2178, 0.2642, 0.3053, 0.2127]], grad_fn=<SelectBackward0>)

In [45]:
class MultiheadAttention(nn.Module):
    """Multi-head self-attention over a (batch, sequence, feature) input.

    A single linear layer produces the concatenated query/key/value features
    for all heads; attention is computed per head with ``scaled_dot_product``
    and the heads are concatenated again before a final linear layer.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        d_model: Total feature size across all heads (also the output size).
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(input_dim, 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch_size, sequence_length, input_dim).
            mask: Optional additive mask forwarded to ``scaled_dot_product``.

        Returns:
            Tensor of shape (batch_size, sequence_length, d_model).
        """
        batch_size, sequence_length, input_dim = x.shape
        print(f"x.shape: {x.shape}")
        qkv = self.qkv_layer(x)
        print(f"qkv.shape: {qkv.shape}")
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        print(f"qkv.shape: {qkv.shape}")
        # (batch, seq, heads, 3 * head_dim) -> (batch, heads, seq, 3 * head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        print(f"qkv.shape: {qkv.shape}")
        q, k, v = qkv.chunk(3, dim=-1)
        print(f"q.shape: {q.shape} k.shape: {k.shape}, v.shape: {v.shape}")
        values, attention = scaled_dot_product(q, k, v, mask)
        print(f"values.shape: {values.shape}, attention.shape: {attention.shape}")
        # Move the head axis back next to head_dim BEFORE merging them.
        # Reshaping (batch, heads, seq, head_dim) directly would interleave
        # different sequence positions into one output row.
        values = values.permute(0, 2, 1, 3).reshape(
            batch_size, sequence_length, self.num_heads * self.head_dim
        )
        print(f"values.shape: {values.shape}")
        out = self.linear_layer(values)
        print(f"out.shape: {out.shape}")
        return out

In [46]:
# Exercise the module on a larger random batch; here input_dim differs from d_model.
input_dim = 1024
d_model = 512
num_heads = 8

batch_size = 30
sequence_length = 10
x = torch.randn((batch_size, sequence_length, input_dim))

model = MultiheadAttention(input_dim, d_model, num_heads)
# Call the module itself rather than .forward() so nn.Module hooks are run.
out = model(x)

x.shape: torch.Size([30, 10, 1024])
qkv.shape: torch.Size([30, 10, 1536])
qkv.shape: torch.Size([30, 10, 8, 192])
qkv.shape: torch.Size([30, 8, 10, 192])
q.shape: torch.Size([30, 8, 10, 64]) k.shape: torch.Size([30, 8, 10, 64]), v.shape: torch.Size([30, 8, 10, 64])
values.shape: torch.Size([30, 8, 10, 64]), attention.shape: torch.Size([30, 8, 10, 10])
values.shape: torch.Size([30, 10, 512])
out.shape: torch.Size([30, 10, 512])


### Positional Encoding

In [53]:
max_sequence_length = 10
d_model = 6

# Even feature indices 0, 2, 4, ... and their 10000^(i / d_model) scale.
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_denominator = torch.pow(10000, even_i / d_model)
print(even_denominator)

# Odd feature indices 1, 3, 5, ...; subtracting 1 maps each onto its even
# neighbour, so both denominators come out identical.
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
print(odd_i)
odd_denominator = torch.pow(10000, (odd_i - 1) / d_model)
print(odd_denominator)

tensor([  1.0000,  21.5443, 464.1590])
tensor([1., 3., 5.])
tensor([  1.0000,  21.5443, 464.1590])


In [54]:
# Column vector holding the positions 0 .. max_sequence_length - 1.
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [60]:
denominator = even_denominator
# The position column broadcasts against the denominator row, giving one
# angle per (position, index pair).
a = torch.div(position, denominator)
print(a[1])
# Sine fills the even feature slots, cosine the odd ones.
even_PE = a.sin()
odd_PE = a.cos()
odd_PE.shape

tensor([1.0000, 0.0464, 0.0022])


torch.Size([10, 3])

In [72]:
# Pair each sine with its cosine along a new last axis.
stacked = torch.stack([even_PE, odd_PE], dim=2)
print(stacked[1])

tensor([[0.8415, 0.5403],
        [0.0464, 0.9989],
        [0.0022, 1.0000]])


In [75]:
# Merge the last two axes so the sine and cosine columns interleave.
PE = torch.flatten(stacked, start_dim=1, end_dim=2)
PE

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

In [76]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    Args:
        d_model: Number of encoding features per position.
        max_sequence_length: Number of positions (rows) to generate.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the encoding table.

        Returns:
            Float tensor of shape (max_sequence_length, d_model) whose even
            columns hold sines and odd columns hold cosines of
            ``position / 10000^(i / d_model)`` for i = 0, 2, 4, ...
        """
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i / self.d_model)
        # Float positions, matching the dtype of the denominator.
        position = torch.arange(self.max_sequence_length, dtype=torch.float).reshape(
            self.max_sequence_length, 1
        )
        angles = position / denominator
        stacked = torch.stack([torch.sin(angles), torch.cos(angles)], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleave yields d_model + 1 columns; drop
        # the surplus trailing cosine so the width is always d_model.
        return PE[:, :self.d_model]

In [77]:
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
# Call the module itself rather than .forward() so nn.Module hooks are run.
pe()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

In [80]:
# Two positions with three features each, built as (batch, sequence, embedding).
inputs = torch.tensor([[[0.2, 0.1, 0.3], [0.5, 0.1, 0.1]]])
batch_size, sequence_length, embedding_length = inputs.shape
# Swap the batch and sequence axes. permute moves whole rows; reshape would
# only re-chunk the flat data and mix rows across examples once batch_size > 1.
inputs = inputs.permute(1, 0, 2).contiguous()
inputs.shape

torch.Size([2, 1, 3])

### Layer Normalization

In [81]:
# Learnable scale (ones) and shift (zeros), shaped like the last two axes of `inputs`.
parameter_shape = inputs.size()[-2:]
gamma = nn.Parameter(torch.ones(*parameter_shape))
beta = nn.Parameter(torch.zeros(*parameter_shape))


In [82]:
# Both parameters take the shape of the last two axes of `inputs`.
gamma.shape, beta.shape

(torch.Size([1, 3]), torch.Size([1, 3]))

In [83]:
# Negative indices of the trailing axes to normalize over, innermost first.
dims = list(range(-1, -len(parameter_shape) - 1, -1))
dims

[-1, -2]

In [85]:
print(inputs)
# Mean over the trailing axes, kept as size-1 axes so it broadcasts against `inputs`.
mean = torch.mean(inputs, dim=dims, keepdim=True)
mean

tensor([[[0.2000, 0.1000, 0.3000]],

        [[0.5000, 0.1000, 0.1000]]])


tensor([[[0.2000]],

        [[0.2333]]])

In [87]:
# Mean of squared deviations over the same axes.
var = (inputs - mean).pow(2).mean(dim=dims, keepdim=True)
print(var)
eps = 1e-5  # added to the variance before the square root
std = torch.sqrt(var + eps)
std

tensor([[[0.0067]],

        [[0.0356]]])


tensor([[[0.0817]],

        [[0.1886]]])

In [88]:
# Standardize, then apply the learnable scale and shift.
y = torch.div(inputs - mean, std)
print(y)
out = gamma * y + beta
print(out)

tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]])
tensor([[[ 0.0000, -1.2238,  1.2238]],

        [[ 1.4140, -0.7070, -0.7070]]], grad_fn=<AddBackward0>)


In [96]:
class LayerNormalization(nn.Module):
    """Layer normalization over the trailing ``len(parameters_shape)`` axes.

    Args:
        parameters_shape: Shape of the trailing axes to normalize over; also
            the shape of the learnable ``gamma`` (scale) and ``beta`` (shift).
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        # Subclassing nn.Module registers gamma/beta as parameters, in line
        # with the other modules in this notebook.
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, input):
        """Normalize ``input`` and apply the learnable scale and shift.

        Args:
            input: Tensor whose trailing axes match ``parameters_shape``.

        Returns:
            Tensor with the same shape as ``input``.
        """
        dims = [-(i+1) for i in range(len(self.parameters_shape))]
        # Use the argument, not a notebook-level variable of a similar name.
        mean = input.mean(dim=dims, keepdim=True)
        print(f"Mean shape: ({mean.shape}) \n {mean}")
        var = ((input - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        print(f"Standard deviation shape ({std.shape}) \n {std}")
        y = (input - mean) / std
        out = self.gamma * y + self.beta
        print(f"out shape ({out.shape}) \n {out}")
        return out

In [100]:
# Fresh random batch created as (sequence_length, batch_size, embedding_length).
sequence_length, batch_size, embedding_length = 3, 2, 6
inputs = torch.randn((sequence_length, batch_size, embedding_length))
print(inputs.shape)
print(inputs)

torch.Size([3, 2, 6])
tensor([[[ 0.0760, -0.3607,  2.2615,  0.3486, -0.3669, -1.2119],
         [ 1.4088, -0.3075, -0.8317, -0.1709,  0.3680,  0.0418]],

        [[-0.7911,  0.1768,  0.4920, -0.9347,  0.9171,  0.9248],
         [-0.6117,  0.1830, -0.3850,  1.1610,  0.4953, -0.5414]],

        [[ 0.3171, -0.4679,  0.2267,  1.0981, -1.0797, -0.4287],
         [ 1.8826,  0.5825, -0.1983,  0.9911,  0.3762,  1.7467]]])


In [101]:
# Normalize jointly over the last two axes of `inputs`.
layer_norm = LayerNormalization(parameters_shape=inputs.size()[-2:])
out = layer_norm.forward(inputs)

Mean shape: (torch.Size([3, 1, 1])) 
 tensor([[[0.1046]],

        [[0.0905]],

        [[0.4205]]])
Standard deviation shape (torch.Size([3, 1, 1])) 
 tensor([[[0.9028]],

        [[0.6960]],

        [[0.8613]]])
out shape (torch.Size([3, 2, 6])) 
 tensor([[[-0.0316, -0.5154,  2.3890,  0.2703, -0.5222, -1.4581],
         [ 1.4446, -0.4565, -1.0371, -0.3052,  0.2918, -0.0695]],

        [[-1.2667,  0.1239,  0.5769, -1.4730,  1.1875,  1.1987],
         [-1.0088,  0.1329, -0.6832,  1.5380,  0.5816, -0.9079]],

        [[-0.1202, -1.0315, -0.2251,  0.7867, -1.7418, -0.9860],
         [ 1.6976,  0.1880, -0.7184,  0.6624, -0.0514,  1.5398]]],
       grad_fn=<AddBackward0>)
