Positional Encoding

In [32]:
import numpy as np
import math
def positional_encoding(max_len, d_model):
    # max_len: độ dài tối đa của 1 chuỗi
    # d_model: kích thước embedding
    pe = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            angle = pos / (10000 ** (i / d_model))
            # Vị trí chẵn
            pe[pos, i] = math.sin(angle)
            # Vị trí lẻ
            pe[pos, i+1] = math.cos(angle)
    return pe



In [33]:
max_len = 5
d_model = 4
pe = np.zeros((5, 4))
print(f"ma trận ban đầu:\n{pe}")
print(f"------------------")
for pos in range(max_len):
    for i in range(0, d_model, 2):
        angle = pos / (10000 ** (i / d_model))
        pe[pos][i] = math.sin(angle)
        pe[pos][i+1] = math.cos(angle)
print(f"ma trận positional encoding:\n{pe}")
print(f"--------------------------")
for i in range(max_len):
    print(f"vector PE cho vị trí thứ: {i} là:\n{pe[i]}")
    print(f"------------------------------")


ma trận ban đầu:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
------------------
ma trận positional encoding:
[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]
 [ 0.14112001 -0.9899925   0.0299955   0.99955003]
 [-0.7568025  -0.65364362  0.03998933  0.99920011]]
--------------------------
vector PE cho vị trí thứ: 0 là:
[0. 1. 0. 1.]
------------------------------
vector PE cho vị trí thứ: 1 là:
[0.84147098 0.54030231 0.00999983 0.99995   ]
------------------------------
vector PE cho vị trí thứ: 2 là:
[ 0.90929743 -0.41614684  0.01999867  0.99980001]
------------------------------
vector PE cho vị trí thứ: 3 là:
[ 0.14112001 -0.9899925   0.0299955   0.99955003]
------------------------------
vector PE cho vị trí thứ: 4 là:
[-0.7568025  -0.65364362  0.03998933  0.99920011]
------------------------------


Sau khi tính PE, ta cộng vào embedding ban đầu: X' = X + PE

In [34]:
# Giả sử vector X ban đầu có giá trị:
x = np.random.randn(max_len, d_model)
print(f"vector embedding của input:\n{x}")
x = x + pe
print(f"---------------------------------")
print(f"vector x khi thêm positional encoding:\n{x}")

vector embedding của input:
[[ 0.62113602  2.06516305 -1.37436111 -0.66050378]
 [-0.57817703  0.84638542  0.37200656 -0.6383473 ]
 [ 0.94573413 -1.80154467  0.71740834  0.83773453]
 [ 0.56026209 -1.22670534 -0.06983493  0.19584023]
 [ 0.47823442  1.20625307  0.73981294  0.1984902 ]]
---------------------------------
vector x khi thêm positional encoding:
[[ 0.62113602  3.06516305 -1.37436111  0.33949622]
 [ 0.26329395  1.38668772  0.38200639  0.3616027 ]
 [ 1.85503155 -2.21769151  0.73740701  1.83753453]
 [ 0.7013821  -2.21669783 -0.03983943  1.19539027]
 [-0.27856808  0.55260945  0.77980227  1.19769031]]


Multihead Attention

Residual and Normalization Layer

In [35]:
import torch
import torch.nn as nn
# Layer normalization
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        # gamma và beta
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased= False)

        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_hat + self.beta

In [36]:
d_model = 5
gamma1 = nn.Parameter(torch.zeros(d_model))
gamma2 = torch.zeros(d_model)
print(f"gamma1:\n{gamma1}")
print(f"gamma2:\n{gamma2}")
print(f"---------------------------")
matrix = torch.zeros(3,4)
print(matrix)

gamma1:
Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)
gamma2:
tensor([0., 0., 0., 0., 0.])
---------------------------
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [37]:
# Tạo input gồm 2 câu batch_size = 2
batch_size = 2
seq_len = 3
d_model = 5
x = torch.randn(batch_size,seq_len, d_model)
print(x)
mean = x.mean(dim=-1, keepdim=True)
print(f"Trung bình của từng hàng: \n{mean}")
var = x.var(dim = -1, keepdim=True, unbiased=False)
print(f"Phương sai của từng hàng:\n{var}")

tensor([[[ 0.7778, -0.3200, -0.1645, -1.4031,  0.7105],
         [ 0.0943, -1.0420,  0.4630, -0.3948, -0.2294],
         [ 0.6182, -0.3394,  0.1782, -2.8626, -0.3558]],

        [[-2.0792,  0.7995,  0.6266,  0.9787, -0.8089],
         [ 0.6942,  1.6814,  1.1254,  0.7986,  1.4140],
         [ 1.3712, -0.9537,  0.5689, -0.7108, -0.6038]]])
Trung bình của từng hàng: 
tensor([[[-0.0799],
         [-0.2218],
         [-0.5523]],

        [[-0.0967],
         [ 1.1427],
         [-0.0656]]])
Phương sai của từng hàng:
tensor([[[0.6352],
         [0.2543],
         [1.4650]],

        [[1.3841],
         [0.1367],
         [0.7923]]])


In [38]:
gamma = torch.randn(d_model)
print(f"gamma:\n{gamma}")
x_hat = (x - mean) / torch.sqrt(var + 1e-6)
print(f"Chuẩn hóa các giá trị của input:\n{x_hat}")
ans = gamma * x_hat + gamma
print(f"Đầu ra layernorm của input:\n{ans}")

gamma:
tensor([-0.0967, -0.8902,  0.3538,  1.0674,  3.1487])
Chuẩn hóa các giá trị của input:
tensor([[[ 1.0761, -0.3014, -0.1061, -1.6603,  0.9917],
         [ 0.6267, -1.6264,  1.3579, -0.3431, -0.0151],
         [ 0.9670,  0.1759,  0.6035, -1.9088,  0.1623]],

        [[-1.6852,  0.7618,  0.6148,  0.9140, -0.6054],
         [-1.2129,  1.4568, -0.0467, -0.9307,  0.7336],
         [ 1.6142, -0.9977,  0.7128, -0.7248, -0.6046]]])
Đầu ra layernorm của input:
tensor([[[-2.0079e-01, -6.2196e-01,  3.1625e-01, -7.0481e-01,  6.2712e+00],
         [-1.5733e-01,  5.5767e-01,  8.3424e-01,  7.0119e-01,  3.1012e+00],
         [-1.9025e-01, -1.0468e+00,  5.6732e-01, -9.7005e-01,  3.6598e+00]],

        [[ 6.6266e-02, -1.5684e+00,  5.7131e-01,  2.0431e+00,  1.2424e+00],
         [ 2.0592e-02, -2.1871e+00,  3.3727e-01,  7.3941e-02,  5.4586e+00],
         [-2.5284e-01, -2.0814e-03,  6.0601e-01,  2.9377e-01,  1.2450e+00]]])


In [39]:
# Sublayer: Feed forward network: 2 hidden layer
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        output = self.linear2(x)
        return output

In [40]:
d_model = 4
d_ff = 6
linear1 = nn.Linear(d_model, d_ff)
print(linear1)

Linear(in_features=4, out_features=6, bias=True)


In [41]:
# Residual connection Layernorm
class SublayerConnection(nn.Module):
    # y = layernorm(x + sublayer(x))
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        output = self.norm(x + self.dropout(sublayer(x)))
        return output

In [42]:
dropout = nn.Dropout(0.3)
print(dropout)

Dropout(p=0.3, inplace=False)


In [43]:
# y = layernorm(x + sublayer(x))
# Giả sử input x là 1 câu có 4 từ, mỗi từ được mã hóa thành vector 6 chiều
# ma trận embedding
x = torch.randn(1, 4, 6)
print(f"input:\n{x}")
sublayer = FeedForward(d_model=6, d_ff=10)
sublayer_connection = SublayerConnection(6)
y = sublayer_connection(x, sublayer)
print(f"output của x khi đi qua residual + layer normalization:\n{y}")

input:
tensor([[[ 1.0276, -0.6607,  1.6914,  2.0618, -0.1686, -0.4777],
         [ 1.4260, -0.6819,  0.2424, -0.0052,  0.8084, -0.8598],
         [ 0.1272,  1.4094, -0.8495, -0.4998, -0.4231,  0.8964],
         [ 1.1959, -1.6187, -1.3929,  0.5662, -1.2227, -0.0459]]])
output của x khi đi qua residual + layer normalization:
tensor([[[ 0.4049, -1.3846,  0.9127,  1.4354, -0.4727, -0.8957],
         [ 1.6320, -1.1942, -0.0676, -0.0407,  0.7970, -1.1265],
         [ 0.0808,  1.4333, -1.3950, -0.7915, -0.4190,  1.0914],
         [ 1.5844, -0.9524, -1.1510,  0.7987, -0.6787,  0.3991]]],
       grad_fn=<AddBackward0>)


Cài đặt MHA

In [44]:
def softmax(x):
    # softmax theo hàng
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x/np.sum(e_x, axis=-1, keepdims=True)
def scaled_dot_product_attention(Q, K, V):
    # Q: (n, d_k) - n query vectors
    # K: (m, d_k) - m key vectors
    # V: (m, d_v) - m value vectors
    # Lấy ra d_k cho bước tính căn bậc 2
    d_k = Q.shape[1]
    # Tính score
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    # Tính trọng số attention
    weights = softmax(scores)
    # Tổ hợp tuyến tính với V
    output = np.matmul(weights, V)
    print("DEBUG - return:", output.shape, weights.shape)
    return output, weights

In [45]:
class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        # d_model phải chia hết cho num_heads
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Khởi tạo ma trận trọng số
        self.W_Q = np.random.randn(d_model, d_model)
        self.W_K = np.random.randn(d_model, d_model)
        self.W_V = np.random.randn(d_model, d_model)
        # Ma trận trọng số được sử dụng cuối cùng
        self.W_O = np.random.randn(d_model, d_model)
    def split_heads(self, x):
        # x: (seq_len, d_model)
        # return: (num_heads, seq_len, d_k)
        seq_len = x.shape[0]
        # Tách X thành 1 mảng gồm seq_len ma trận, mỗi ma trận có num_heads hàng, d_k cột
        x = x.reshape(seq_len, self.num_heads, self.d_k)
    def combine_heads(self, x):
        # x: (num_heads, seq_len, d_k)
        # return: (seq_len, d_model)
        num_heads, seq_len, d_k = x.shape
        x = x.transpose(1, 0, 2).reshape(seq_len, self.d_model)
    def forward(self, Q, K, V):
        # linear projection
        Q_proj = Q @ self.W_Q
        K_proj = K @ self.W_K
        V_proj = V @ self.W_V
        # Chia thành nhiều head
        Q_heads = self.split_heads(Q_proj)
        K_heads = self.split_heads(K_proj)
        V_heads = self.split_heads(V_proj)
        # Attention trên từng head
        head_outputs = []
        for i in range(self.num_heads):
            out, weights = scaled_dot_product_attention(Q_heads[i], K_heads[i], V_heads[i])
            head_outputs.append(out)
         # Chuyển đổi sang numpy
        head_outputs = np.array(head_outputs)
        #Ghép lại các head
        concat_output = self.combine_heads(head_outputs)
        #Linear cuối
        output = np.matmul(concat_output, self.W_O)
        return output


Pipeline của Encoder trong transformer

In [46]:
import numpy as np
class EncoderLayer:
    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        # Residual + layer Normalize (2 lần: sau MHA và sau FFN)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        # self attention + residual và layer norm
        attn_ouput = self.self_attn.forward(x, x, x)
        x = self.norm(x + attn_ouput)

        # ffn + residual và layer norm
        ff_output = self.feed_forward.forward(x)
        x = self.norm2(x + ff_output)
        return x

In [47]:
seq_len, d_model = 5, 9
d_ff = 16
num_heads = 3
# input
x = np.random.randn(seq_len, d_model)
print(f"input:\n{x}")

input:
[[ 0.06882774  0.82809309  1.06931332 -1.06252672 -1.53268067  1.03814128
  -0.43039354 -2.2288991   0.80210487]
 [ 0.07738696  1.0420496   1.76247743 -0.58500801  1.87652543  0.58369817
  -0.33101837  0.85158475  0.58001963]
 [ 1.33636926  1.05904713 -0.35722422  1.69440999 -0.32165112 -0.95539771
   0.90607909 -0.05959185 -0.98546733]
 [-0.17041403  1.03521561 -0.90743032  1.31258955 -0.07040159 -1.56574003
  -0.50335999 -0.3210434  -1.08486514]
 [-0.58303137  0.77650346 -0.51738077 -0.06008071  0.2595841   1.00080186
   0.81948858  1.12151495  0.95443576]]


In [48]:
# encoder layer
encoder_layer = EncoderLayer(d_model, d_ff, num_heads=num_heads)
# Forward
output = encoder_layer.forward(x)
print(f"output:\n{output}")

TypeError: 'NoneType' object is not subscriptable