Positional Encoding

In [78]:
import numpy as np
import math
def positional_encoding(max_len, d_model):
    # max_len: độ dài tối đa của 1 chuỗi
    # d_model: kích thước embedding
    pe = np.zeros((max_len, d_model))
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            angle = pos / (10000 ** (i / d_model))
            # Vị trí chẵn
            pe[pos, i] = math.sin(angle)
            # Vị trí lẻ
            pe[pos, i+1] = math.cos(angle)
    return pe



In [79]:
max_len = 5
d_model = 4
pe = np.zeros((5, 4))
print(f"ma trận ban đầu:\n{pe}")
print(f"------------------")
for pos in range(max_len):
    for i in range(0, d_model, 2):
        angle = pos / (10000 ** (i / d_model))
        pe[pos][i] = math.sin(angle)
        pe[pos][i+1] = math.cos(angle)
print(f"ma trận positional encoding:\n{pe}")
print(f"--------------------------")
for i in range(max_len):
    print(f"vector PE cho vị trí thứ: {i} là:\n{pe[i]}")
    print(f"------------------------------")


ma trận ban đầu:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
------------------
ma trận positional encoding:
[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]
 [ 0.14112001 -0.9899925   0.0299955   0.99955003]
 [-0.7568025  -0.65364362  0.03998933  0.99920011]]
--------------------------
vector PE cho vị trí thứ: 0 là:
[0. 1. 0. 1.]
------------------------------
vector PE cho vị trí thứ: 1 là:
[0.84147098 0.54030231 0.00999983 0.99995   ]
------------------------------
vector PE cho vị trí thứ: 2 là:
[ 0.90929743 -0.41614684  0.01999867  0.99980001]
------------------------------
vector PE cho vị trí thứ: 3 là:
[ 0.14112001 -0.9899925   0.0299955   0.99955003]
------------------------------
vector PE cho vị trí thứ: 4 là:
[-0.7568025  -0.65364362  0.03998933  0.99920011]
------------------------------


Sau khi tính PE, ta cộng vào embedding ban đầu: X' = X + PE

In [80]:
# Giả sử vector X ban đầu có giá trị:
x = np.random.randn(max_len, d_model)
print(f"vector embedding của input:\n{x}")
x = x + pe
print(f"---------------------------------")
print(f"vector x khi thêm positional encoding:\n{x}")

vector embedding của input:
[[-2.51048054  0.91427526  0.38544417 -1.15885041]
 [ 0.1372257   0.11977357 -0.25662949 -0.51509956]
 [-2.48507099 -2.04366644 -0.92723553  0.24973802]
 [-0.40296381 -0.20171841  1.82388598  1.63357139]
 [-0.31936225  1.30005329 -0.19187889  0.0853536 ]]
---------------------------------
vector x khi thêm positional encoding:
[[-2.51048054  1.91427526  0.38544417 -0.15885041]
 [ 0.97869668  0.66007587 -0.24662966  0.48485044]
 [-1.57577356 -2.45981328 -0.90723687  1.24953802]
 [-0.2618438  -1.19171091  1.85388148  2.63312142]
 [-1.07616475  0.64640967 -0.15188955  1.08455371]]


Multihead Attention

Residual and Normalization Layer

In [81]:
import torch
import torch.nn as nn
# Layer normalization
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        # gamma và beta
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased= False)

        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_hat + self.beta

In [82]:
d_model = 5
gamma1 = nn.Parameter(torch.zeros(d_model))
gamma2 = torch.zeros(d_model)
print(f"gamma1:\n{gamma1}")
print(f"gamma2:\n{gamma2}")
print(f"---------------------------")
matrix = torch.zeros(3,4)
print(matrix)

gamma1:
Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)
gamma2:
tensor([0., 0., 0., 0., 0.])
---------------------------
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [83]:
# Tạo input gồm 2 câu batch_size = 2
batch_size = 2
seq_len = 3
d_model = 5
x = torch.randn(batch_size,seq_len, d_model)
print(x)
mean = x.mean(dim=-1, keepdim=True)
print(f"Trung bình của từng hàng: \n{mean}")
var = x.var(dim = -1, keepdim=True, unbiased=False)
print(f"Phương sai của từng hàng:\n{var}")

tensor([[[-1.5954, -1.0103,  0.0845,  0.7275,  0.0063],
         [ 0.2819, -2.6054, -0.3287, -1.2705, -2.9631],
         [ 1.1723, -0.9449,  1.3273,  1.0438, -1.9599]],

        [[ 1.6810, -0.8743, -1.0050,  0.3031, -0.1766],
         [-0.5185, -0.0095,  0.7464,  1.5546, -0.7502],
         [ 0.0041, -0.7962,  1.2341,  0.0677, -0.5324]]])
Trung bình của từng hàng: 
tensor([[[-0.3575],
         [-1.3772],
         [ 0.1277]],

        [[-0.0144],
         [ 0.2045],
         [-0.0045]]])
Phương sai của từng hàng:
tensor([[[0.6927],
         [1.5774],
         [1.7756]],

        [[0.9444],
         [0.7193],
         [0.4890]]])


In [84]:
gamma = torch.randn(d_model)
print(f"gamma:\n{gamma}")
x_hat = (x - mean) / torch.sqrt(var + 1e-6)
print(f"Chuẩn hóa các giá trị của input:\n{x_hat}")
ans = gamma * x_hat + gamma
print(f"Đầu ra layernorm của input:\n{ans}")

gamma:
tensor([-0.1320,  0.4723,  1.3455,  0.8358, -1.9937])
Chuẩn hóa các giá trị của input:
tensor([[[-1.4874, -0.7843,  0.5311,  1.3036,  0.4370],
         [ 1.3210, -0.9780,  0.8348,  0.0849, -1.2627],
         [ 0.7839, -0.8050,  0.9003,  0.6875, -1.5667]],

        [[ 1.7445, -0.8849, -1.0194,  0.3266, -0.1669],
         [-0.8526, -0.2524,  0.6389,  1.5918, -1.1258],
         [ 0.0124, -1.1321,  1.7713,  0.1033, -0.7549]]])
Đầu ra layernorm của input:
tensor([[[ 0.0643,  0.1019,  2.0601,  1.9254, -2.8650],
         [-0.3064,  0.0104,  2.4687,  0.9068,  0.5238],
         [-0.2355,  0.0921,  2.5568,  1.4104,  1.1297]],

        [[-0.3623,  0.0544, -0.0261,  1.1088, -1.6609],
         [-0.0195,  0.3531,  2.2051,  2.1663,  0.2508],
         [-0.1336, -0.0624,  3.7289,  0.9221, -0.4887]]])


In [85]:
# Sublayer: Feed forward network: 2 hidden layer
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        output = self.linear2(x)
        return output

In [86]:
d_model = 4
d_ff = 6
linear1 = nn.Linear(d_model, d_ff)
print(linear1)

Linear(in_features=4, out_features=6, bias=True)


In [87]:
# Residual connection Layernorm
class SublayerConnection(nn.Module):
    # y = layernorm(x + sublayer(x))
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.norm = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        output = self.norm(x + self.dropout(sublayer(x)))
        return output

In [88]:
dropout = nn.Dropout(0.3)
print(dropout)

Dropout(p=0.3, inplace=False)


In [89]:
# y = layernorm(x + sublayer(x))
# Giả sử input x là 1 câu có 4 từ, mỗi từ được mã hóa thành vector 6 chiều
# ma trận embedding
x = torch.randn(1, 4, 6)
print(f"input:\n{x}")
sublayer = FeedForward(d_model=6, d_ff=10)
sublayer_connection = SublayerConnection(6)
y = sublayer_connection(x, sublayer)
print(f"output của x khi đi qua residual + layer normalization:\n{y}")

input:
tensor([[[ 0.3565,  0.9443, -1.1438, -0.2553, -1.4977, -0.3394],
         [ 0.0317, -1.2267, -1.8146,  0.2450,  0.4963, -0.2686],
         [-0.0608,  0.9713, -0.1718,  0.7234,  0.2029,  0.8809],
         [-0.8522,  0.6999,  1.6019,  2.5095,  0.4683, -1.1485]]])
output của x khi đi qua residual + layer normalization:
tensor([[[ 6.4081e-01,  1.6274e+00, -7.7806e-01, -1.7715e-01, -1.5057e+00,
           1.9266e-01],
         [ 3.5039e-01, -9.5322e-01, -1.7534e+00,  7.3252e-01,  9.5247e-01,
           6.7120e-01],
         [-1.0468e+00,  1.1493e+00, -1.2594e+00, -9.7621e-02, -1.4764e-01,
           1.4021e+00],
         [-1.0436e+00, -6.4312e-05,  1.0593e+00,  1.3934e+00, -5.0756e-02,
          -1.3582e+00]]], grad_fn=<AddBackward0>)


Cài đặt MHA

In [90]:
def softmax(x):
    # softmax theo hàng
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e_x/np.sum(e_x, axis=-1, keepdims=True)
def scaled_dot_product_attention(Q, K, V):
    # Q: (n, d_k) - n query vectors
    # K: (m, d_k) - m key vectors
    # V: (m, d_v) - m value vectors
    # Lấy ra d_k cho bước tính căn bậc 2
    d_k = Q.shape[1]
    # Tính score
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    # Tính trọng số attention
    weights = softmax(scores)
    # Tổ hợp tuyến tính với V
    output = np.matmul(weights, V)
    print("DEBUG - return:", output.shape, weights.shape)
    return output, weights

In [91]:
q = np.random.randn(4, 4)
k = np.random.randn(4, 4)
v = np.random.randn(4, 4)
ans, weight = scaled_dot_product_attention(q, k , v)
print(ans)

DEBUG - return: (4, 4) (4, 4)
[[ 0.31239633  0.09678679 -0.68516955  1.13982061]
 [ 0.24402395 -0.19830626 -0.75061639  0.99637655]
 [-0.23419394 -0.36439287 -0.31383449  0.85832369]
 [-0.22604182 -0.41325785 -0.21459471  0.63481584]]


In [92]:
class MultiHeadAttention:
    def __init__(self, d_model, num_heads):
        # d_model phải chia hết cho num_heads
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Khởi tạo ma trận trọng số
        self.W_Q = np.random.randn(d_model, d_model)
        self.W_K = np.random.randn(d_model, d_model)
        self.W_V = np.random.randn(d_model, d_model)
        # Ma trận trọng số được sử dụng cuối cùng
        self.W_O = np.random.randn(d_model, d_model)
    def split_heads(self, x):
        # x: (seq_len, d_model)
        # return: (num_heads, seq_len, d_k)
        seq_len = x.shape[0]
        # Tách X thành 1 mảng gồm seq_len ma trận, mỗi ma trận có num_heads hàng, d_k cột
        x = x.reshape(seq_len, self.num_heads, self.d_k)
        x = x.transpose(1, 0, 2)
        return x
    def combine_heads(self, x):
        # x: (num_heads, seq_len, d_k)
        # return: (seq_len, d_model)
        num_heads, seq_len, d_k = x.shape
        x = x.transpose(1, 0, 2).reshape(seq_len, self.d_model)
        return x
    def forward(self, Q, K, V):
        # linear projection
        Q_proj = Q @ self.W_Q
        K_proj = K @ self.W_K
        V_proj = V @ self.W_V
        # Chia thành nhiều head
        Q_heads = self.split_heads(Q_proj)
        K_heads = self.split_heads(K_proj)
        V_heads = self.split_heads(V_proj)
        # Attention trên từng head
        head_outputs = []
        for i in range(self.num_heads):
            out, weights = scaled_dot_product_attention(Q_heads[i], K_heads[i], V_heads[i])
            head_outputs.append(out)
         # Chuyển đổi sang numpy
        head_outputs = np.array(head_outputs)
        #Ghép lại các head
        concat_output = self.combine_heads(head_outputs)
        #Linear cuối
        output = np.matmul(concat_output, self.W_O)
        return output


Pipeline của Encoder trong transformer

In [93]:
import numpy as np
class EncoderLayer:
    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        # Residual + layer Normalize (2 lần: sau MHA và sau FFN)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        # self attention + residual và layer norm
        attn_ouput = self.self_attn.forward(x, x, x)
        x = torch.from_numpy(x).float()
        attn_ouput = torch.from_numpy(attn_ouput).float()
        x = self.norm1(x + attn_ouput)

        # ffn + residual và layer norm
        ff_output = self.feed_forward.forward(x)
        x = self.norm2(x + ff_output)
        return x

In [94]:
seq_len, d_model = 5, 9
d_ff = 16
num_heads = 3
# input
x = np.random.randn(seq_len, d_model)
print(f"input:\n{x}")

input:
[[ 0.77472802 -0.08647975 -1.45733034  0.74714996 -0.77795334 -0.49143968
  -0.58541248  0.35992965  0.60493472]
 [ 1.43347545 -0.33980881  1.87000663 -1.05037168  0.79688901 -1.33239766
  -1.62198809 -0.48038467  1.2206296 ]
 [ 0.77125802  1.93978138  0.19991508 -0.53569585 -1.51229057 -1.20003037
   0.39999264 -0.22251599 -0.1348209 ]
 [-1.1684963  -0.26625375 -1.12345459 -1.07408403  0.2065336   1.14968316
   0.14735544 -1.31236736 -1.63588371]
 [-1.19218815  0.87773835 -1.23193794  1.35666971  1.23455314  1.28359339
  -1.7580598  -0.55558014  1.41733379]]


In [95]:
# encoder layer
encoder_layer = EncoderLayer(d_model, d_ff, num_heads=num_heads)
# Forward
output = encoder_layer.forward(x)
print(f"output:\n{output}")

DEBUG - return: (5, 3) (5, 5)
DEBUG - return: (5, 3) (5, 5)
DEBUG - return: (5, 3) (5, 5)
output:
tensor([[ 0.1950,  0.4481, -0.7200, -0.2458, -0.1987,  1.3861, -0.7900,  1.6352,
         -1.7099],
        [-0.6894,  0.5175,  0.6528, -1.1169,  1.1802,  0.7534,  0.2656,  0.5087,
         -2.0720],
        [ 0.0608,  0.1853,  0.4344, -0.2785,  0.0940,  1.1306, -0.6521,  1.3168,
         -2.2912],
        [-1.8131,  1.3276, -1.1554, -0.5013, -0.0845,  0.9658,  0.3745, -0.2242,
          1.1106],
        [ 0.3105,  0.4312,  0.3239, -0.2927,  0.5611,  1.0954, -0.8275,  0.7872,
         -2.3891]], grad_fn=<AddBackward0>)
