Positional Encoding

In [2]:
import numpy as np
import math
def positional_encoding(max_len, d_model):
    # max_len: độ dài tối đa của 1 chuỗi
    # d_model: kích thước embedding
    pe = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            angle = pos / (10000 ** (i / d_model))
            # Vị trí chẵn
            pe[pos, i] = math.sin(angle)
            # Vị trí lẻ
            pe[pos, i+1] = math.cos(angle)
    return pe



In [3]:
max_len = 5
d_model = 4
pe = np.zeros((5, 4))
print(f"ma trận ban đầu:\n{pe}")
print(f"------------------")
for pos in range(max_len):
    for i in range(0, d_model, 2):
        angle = pos / (10000 ** (i / d_model))
        pe[pos][i] = math.sin(angle)
        pe[pos][i+1] = math.cos(angle)
print(f"ma trận positional encoding:\n{pe}")
print(f"--------------------------")
for i in range(max_len):
    print(f"vector PE cho vị trí thứ: {i} là:\n{pe[i]}")
    print(f"------------------------------")


ma trận ban đầu:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
------------------
ma trận positional encoding:
[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]
 [ 0.14112001 -0.9899925   0.0299955   0.99955003]
 [-0.7568025  -0.65364362  0.03998933  0.99920011]]
--------------------------
vector PE cho vị trí thứ: 0 là:
[0. 1. 0. 1.]
------------------------------
vector PE cho vị trí thứ: 1 là:
[0.84147098 0.54030231 0.00999983 0.99995   ]
------------------------------
vector PE cho vị trí thứ: 2 là:
[ 0.90929743 -0.41614684  0.01999867  0.99980001]
------------------------------
vector PE cho vị trí thứ: 3 là:
[ 0.14112001 -0.9899925   0.0299955   0.99955003]
------------------------------
vector PE cho vị trí thứ: 4 là:
[-0.7568025  -0.65364362  0.03998933  0.99920011]
------------------------------


Sau khi tính PE, ta cộng vào embedding ban đầu: X' = X + PE

In [4]:
# Giả sử vector X ban đầu có giá trị:
x = np.random.randn(max_len, d_model)
print(f"vector embedding của input:\n{x}")
x = x + pe
print(f"---------------------------------")
print(f"vector x khi thêm positional encoding:\n{x}")

vector embedding của input:
[[-0.21644298  0.78184617  1.0156912  -0.8887838 ]
 [ 0.61199426 -0.04717827 -0.1691698  -0.02770232]
 [ 0.61060641 -1.36513898  0.47786632 -0.0236784 ]
 [-0.62626462  0.1456579   0.42518529  0.29744258]
 [ 0.76555819 -0.09552212  0.39565961  0.81810394]]
---------------------------------
vector x khi thêm positional encoding:
[[-0.21644298  1.78184617  1.0156912   0.1112162 ]
 [ 1.45346524  0.49312403 -0.15916996  0.97224768]
 [ 1.51990383 -1.78128582  0.49786499  0.97612161]
 [-0.48514461 -0.84433459  0.45518079  1.29699262]
 [ 0.00875569 -0.74916574  0.43564894  1.81730405]]


Multihead Attention

Residual and Normalization Layer

In [5]:
import torch
import torch.nn as nn
# Layer normalization
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        # gamma và beta
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased= False)

        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_hat + self.beta

In [6]:
d_model = 5
gamma1 = nn.Parameter(torch.zeros(d_model))
gamma2 = torch.zeros(d_model)
print(f"gamma1:\n{gamma1}")
print(f"gamma2:\n{gamma2}")
print(f"---------------------------")
matrix = torch.zeros(3,4)
print(matrix)

gamma1:
Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)
gamma2:
tensor([0., 0., 0., 0., 0.])
---------------------------
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [7]:
# Tạo input gồm 2 câu batch_size = 2
batch_size = 2
seq_len = 3
d_model = 5
x = torch.randn(batch_size,seq_len, d_model)
print(x)
mean = x.mean(dim=-1, keepdim=True)
print(f"Trung bình của từng hàng: \n{mean}")
var = x.var(dim = -1, keepdim=True, unbiased=False)
print(f"Phương sai của từng hàng:\n{var}")

tensor([[[ 0.1762,  1.4007,  0.5833,  0.5168,  0.5854],
         [ 0.4201,  0.3855,  0.1350, -1.1465, -1.2245],
         [-0.7071,  0.3370, -1.4839, -0.9447,  0.2987]],

        [[ 1.8559, -0.0228, -0.4383, -0.1640, -1.1650],
         [ 0.8335, -1.3620,  0.5914, -0.1554,  0.0164],
         [ 0.3649, -0.1550, -1.7266,  1.4740, -1.5907]]])
Trung bình của từng hàng: 
tensor([[[ 0.6525],
         [-0.2861],
         [-0.5000]],

        [[ 0.0132],
         [-0.0152],
         [-0.3267]]])
Phương sai của từng hàng:
tensor([[[0.1629],
         [0.5496],
         [0.5094]],

        [[1.0041],
         [0.5845],
         [1.4615]]])


In [8]:
gamma = torch.randn(d_model)
print(f"gamma:\n{gamma}")
x_hat = (x - mean) / torch.sqrt(var + 1e-6)
print(f"Chuẩn hóa các giá trị của input:\n{x_hat}")
ans = gamma * x_hat + gamma
print(f"Đầu ra layernorm của input:\n{ans}")

gamma:
tensor([ 1.0498,  2.7358, -0.9048,  0.3726, -1.2780])
Chuẩn hóa các giá trị của input:
tensor([[[-1.1800,  1.8540, -0.1714, -0.3363, -0.1663],
         [ 0.9526,  0.9059,  0.5680, -1.1606, -1.2658],
         [-0.2902,  1.1727, -1.3785, -0.6231,  1.1190]],

        [[ 1.8390, -0.0359, -0.4505, -0.1768, -1.1758],
         [ 1.1101, -1.7615,  0.7934, -0.1834,  0.0414],
         [ 0.5720,  0.1420, -1.1579,  1.4895, -1.0456]]])
Đầu ra layernorm của input:
tensor([[[-0.1890,  7.8081, -0.7497,  0.2473, -1.0655],
         [ 2.0498,  5.2141, -1.4187, -0.0599,  0.3397],
         [ 0.7452,  5.9441,  0.3424,  0.1405, -2.7081]],

        [[ 2.9805,  2.6375, -0.4972,  0.3067,  0.2247],
         [ 2.2152, -2.0833, -1.6227,  0.3043, -1.3309],
         [ 1.6504,  3.1243,  0.1429,  0.9276,  0.0582]]])


In [9]:
# Sublayer: Feed forward network: 2 hidden layer
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        output = self.linear2(x)
        return output

In [10]:
d_model = 4
d_ff = 6
linear1 = nn.Linear(d_model, d_ff)
print(linear1)

Linear(in_features=4, out_features=6, bias=True)


In [11]:
# Residual connection Layernorm
class SublayerConnection(nn.Module):
    # y = layernorm(x + sublayer(x))
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        output = self.norm(x + self.dropout(sublayer(x)))
        return output

In [12]:
dropout = nn.Dropout(0.3)
print(dropout)

Dropout(p=0.3, inplace=False)


In [13]:
# y = layernorm(x + sublayer(x))
# Giả sử input x là 1 câu có 4 từ, mỗi từ được mã hóa thành vector 6 chiều
# ma trận embedding
x = torch.randn(1, 4, 6)
print(f"input:\n{x}")
sublayer = FeedForward(d_model=6, d_ff=10)
sublayer_connection = SublayerConnection(6)
y = sublayer_connection(x, sublayer)
print(f"output của x khi đi qua residual + layer normalization:\n{y}")

input:
tensor([[[-0.5415, -0.1180,  1.8667, -1.2576, -1.2498,  1.0872],
         [-1.3945, -0.5030, -0.3373, -1.9713, -1.6880, -0.5745],
         [-0.5466, -0.8399, -0.2512, -1.1144,  0.2116, -0.9814],
         [-1.3205,  0.8312,  1.1146,  1.0951,  0.3491,  0.1112]]])
output của x khi đi qua residual + layer normalization:
tensor([[[-0.6559,  0.0169,  1.6040, -1.0853, -0.8916,  1.0119],
         [-0.8976,  0.7418,  1.3452, -1.0599, -0.9874,  0.8579],
         [-0.8217, -0.2124,  1.6676, -1.2697,  0.9030, -0.2669],
         [-2.0583,  0.8976,  0.0138,  0.9534,  0.2188, -0.0253]]],
       grad_fn=<AddBackward0>)


Cài đặt MHA

In [14]:
def softmax(x):
    # softmax theo hàng
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x/np.sum(e_x, axis=-1, keepdims=True)
def scaled_dot_product_attention(Q, K, V):
    # Q: (n, d_k) - n query vectors
    # K: (m, d_k) - m key vectors
    # V: (m, d_v) - m value vectors
    # Lấy ra d_k cho bước tính căn bậc 2
    d_k = Q.shape[1]
    # Tính score
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    # Tính trọng số attention
    weights = softmax(scores)
    # Tổ hợp tuyến tính với V
    output = np.matmul(weights, V)
    print("DEBUG - return:", output.shape, weights.shape)
    return output, weights

In [15]:
q = np.random.randn(4, 4)
k = np.random.randn(4, 4)
v = np.random.randn(4, 4)
ans, weight = scaled_dot_product_attention(q, k , v)
print(ans)

DEBUG - return: (4, 4) (4, 4)
[[-0.93752795  0.25275933 -0.46457028 -0.09316837]
 [-0.10914321  0.58239979 -1.60843841  0.39956429]
 [-0.28380045  0.06269882 -0.64974329 -0.25192324]
 [-0.08112379  0.64395618 -1.55195868  0.356688  ]]


In [16]:
class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        # d_model phải chia hết cho num_heads
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Khởi tạo ma trận trọng số
        self.W_Q = np.random.randn(d_model, d_model)
        self.W_K = np.random.randn(d_model, d_model)
        self.W_V = np.random.randn(d_model, d_model)
        # Ma trận trọng số được sử dụng cuối cùng
        self.W_O = np.random.randn(d_model, d_model)
    def split_heads(self, x):
        # x: (seq_len, d_model)
        # return: (num_heads, seq_len, d_k)
        seq_len = x.shape[0]
        # Tách X thành 1 mảng gồm seq_len ma trận, mỗi ma trận có num_heads hàng, d_k cột
        x = x.reshape(seq_len, self.num_heads, self.d_k)
        x = x.transpose(1, 0, 2)
        return x
    def combine_heads(self, x):
        # x: (num_heads, seq_len, d_k)
        # return: (seq_len, d_model)
        num_heads, seq_len, d_k = x.shape
        x = x.transpose(1, 0, 2).reshape(seq_len, self.d_model)
        return x
    def forward(self, Q, K, V):
        # linear projection
        Q_proj = Q @ self.W_Q
        K_proj = K @ self.W_K
        V_proj = V @ self.W_V
        # Chia thành nhiều head
        Q_heads = self.split_heads(Q_proj)
        K_heads = self.split_heads(K_proj)
        V_heads = self.split_heads(V_proj)
        # Attention trên từng head
        head_outputs = []
        for i in range(self.num_heads):
            out, weights = scaled_dot_product_attention(Q_heads[i], K_heads[i], V_heads[i])
            head_outputs.append(out)
         # Chuyển đổi sang numpy
        head_outputs = np.array(head_outputs)
        #Ghép lại các head
        concat_output = self.combine_heads(head_outputs)
        #Linear cuối
        output = np.matmul(concat_output, self.W_O)
        return output


Pipeline của Encoder trong transformer

In [17]:
import numpy as np
class EncoderLayer:
    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        # Residual + layer Normalize (2 lần: sau MHA và sau FFN)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        # self attention + residual và layer norm
        attn_ouput = self.self_attn.forward(x, x, x)
        x = torch.from_numpy(x).float()
        attn_ouput = torch.from_numpy(attn_ouput).float()
        x = self.norm1(x + attn_ouput)

        # ffn + residual và layer norm
        ff_output = self.feed_forward.forward(x)
        x = self.norm2(x + ff_output)
        return x

In [18]:
seq_len, d_model = 5, 9
d_ff = 16
num_heads = 3
# input
x = np.random.randn(seq_len, d_model)
print(f"input:\n{x}")

input:
[[ 0.50802976  0.23444936 -0.59159537 -0.36601918  1.67046587  0.75439201
   0.41214895 -1.89675187  1.12272172]
 [-0.09154087 -1.23318577  0.60780356 -1.4386419   0.85144728 -0.42111865
   1.38579297 -1.59434061  0.61999749]
 [ 0.28131394  1.45682223  0.26959276 -0.958653   -0.9690516  -0.67968528
  -0.15747996 -0.10191642 -0.85026526]
 [ 0.59283022 -0.34514973 -1.35365388 -0.12678413  0.55361445 -0.79478839
   0.83515877  0.38724302 -0.68556264]
 [ 1.10614352  0.65093095 -1.79333279 -0.30305034  0.14238003 -1.6305611
   0.03470773  0.87840114  0.16645505]]


In [19]:
# encoder layer
encoder_layer = EncoderLayer(d_model, d_ff, num_heads=num_heads)
# Forward
output = encoder_layer.forward(x)
print(f"output:\n{output}")

DEBUG - return: (5, 3) (5, 5)
DEBUG - return: (5, 3) (5, 5)
DEBUG - return: (5, 3) (5, 5)
output:
tensor([[ 0.0910,  0.0656, -0.5387, -1.2596,  0.4712,  1.7055, -0.2591,  1.2539,
         -1.5299],
        [ 1.1639, -0.6475, -1.4581,  0.0252,  0.2831,  0.4773, -0.0754,  1.6583,
         -1.4267],
        [ 0.9327, -0.8510, -1.4318,  0.3775, -0.3850,  0.2372,  0.3524,  1.8989,
         -1.1308],
        [-1.3812,  0.8436,  0.7653, -0.9898,  0.4205,  0.7697,  0.7205, -1.7717,
          0.6230],
        [ 1.0025, -0.8041, -1.6736,  1.5121,  0.1125, -1.3164,  0.5469,  0.1999,
          0.4203]], grad_fn=<AddBackward0>)


Encoder

In [20]:
# Masked Attention
import torch
import torch.nn.functional as F
def masked_scaled_dot_product_attention(Q, K, V, mask= None):
    # Q: (batch, seq_len, d_k)
    # K: (batch, seq_len, d_k)
    # V: (batch, seq_len, d_v)
    d_k = Q.size(-1)
    # Tính scores
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype= torch.float32))
    if mask is not None:
        # mask = 0 => j <= i
        # mask = -inf => j > i
        scores = scores + mask
    # softmax theo chiều seq_len
    weights = F.softmax(scores, dim= -1)
    # (batch, seq_len, d_v)
    output = torch.matmul(weights, V)
    return output, weights
def generate_subsequent_mask(seq_len):
    # Tạo mask tam giác dưới: (seq_len, seq_len)
    # 1. Tạo ma trận toàn 1 (seq_len, seq_len)
    mask = torch.triu(torch.ones(seq_len, seq_len), diagnol= 1)
    # 2. Các phần tử bằng 1 được thay bằng -inf
    mask = mask.masked_fill(mask == 1, float('-inf'))
    return mask

In [24]:
d_k, d_v = 4, 8
batch_size, seq_len = 2, 5
Q = torch.randn(batch_size, seq_len, d_k)
K = torch.randn(batch_size, seq_len, d_k)
V = torch.randn(batch_size, seq_len, d_v)
Q.size()

torch.Size([2, 5, 4])