<a href="https://colab.research.google.com/github/dl-ub-summer-school/2023/blob/main/DLUB2023_language_model_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import os

In [2]:
!git clone https://github.com/dl-ub-summer-school/2023.git

Cloning into '2023'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 84 (delta 25), reused 57 (delta 11), pack-reused 0[K
Unpacking objects: 100% (84/84), 10.41 MiB | 5.37 MiB/s, done.


In [3]:
os.chdir('2023/seminar_transformers/src')

In [4]:
import torch
import torch.nn as nn
import math
import utils
import dataloader
from torch.utils.data import DataLoader
from torch.optim import Adam

In [5]:
os.chdir('../../../')

In [6]:
os.listdir('./')

['.config', '2023', 'sample_data']

In [7]:
# Below all hyper parameters assigned for the
# convenience of notebook, however, it is not a best practice
# as you should probably want to use hydra or argparse instead
gpu_id=0
seed=42
save=0

# device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
print(device)

num_dec_layers=6
max_len=20
model_dim=512
hidden_size=2048
d_k=64
d_v=64
n_head=8
d_prob=0.1
max_norm=5.0

# MASKED_VALUE = -1e9
MASKED_VALUE = float('-inf')

# attention is all you need hyper-params
n_epochs=100
batch_size=128
lr=1e-3
beta1=0.9
beta2=0.98
eps=1e-9
weight_decay=1e-3
# bluescore
k=4

cpu


# Implementing Transformer from scratch

dataloader and dataset_prep.py contents

### 1.2 Embedding layer

### 1.4 Implementation

In [8]:
class Embedding_Layer(nn.Module):

    def __init__(self, num_token, dim_model, max_seq_len, d_prob):
        """Implementation of embedding and positional embeddings.

        Args:
            num_token (int): Number of token
            dim_model (_type_): Dimension of the model
            max_seq_len (_type_): Maximum sequence length
            d_prob (_type_): Dropout probability
        """
        super(Embedding_Layer, self).__init__()
        self.num_token = num_token
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.d_prob = d_prob
        self.emb = nn.Embedding(num_token, dim_model)
        self.drop_out = nn.Dropout(d_prob)
        self.pos_enc = torch.zeros((self.max_seq_len, self.dim_model))
        for pos in range(self.max_seq_len):
            for idx in range(0, self.dim_model, 2):
                self.pos_enc[pos, idx] = torch.sin(torch.tensor(pos / (10000.0) ** (float(idx) / self.dim_model)))
                self.pos_enc[pos, idx + 1] = torch.cos(torch.tensor(pos / (10000.0) ** (float(idx) / self.dim_model)))

    def forward(self, x):
        x = self.emb(x)
        x = x + self.pos_enc[:x.size(1)]
        x = self.drop_out(x)
        return x

In [9]:
# ### Test it out
# emb_test = Embedding_Layer(30, 10, 4, 0.1)
# input_test = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
# emb_test(input_test).size()

## 3. Transformer Decoder

### 3.1 Scaled Dot-Product Attention (Masked)

In [10]:
class MaskedScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, d_prob):
        super(MaskedScaledDotProductAttention, self).__init__()
        self.d_k = d_k
        self.softmax = nn.Softmax(dim=-1)
        self.drop_out = nn.Dropout(d_prob)

    def forward(self, x_q, x_k, x_v):
        dot_product = torch.matmul(x_q, x_k.transpose(-1, -2))
        scaled_dot_product = dot_product / math.sqrt(self.d_k)
        true_arr = torch.ones_like(scaled_dot_product)
        mask = torch.tril(true_arr).bool()
        scaled_dot_product = scaled_dot_product.masked_fill(mask==False, MASKED_VALUE)
        reg_scaled_dot_product = self.softmax(scaled_dot_product)
        reg_scaled_dot_product = self.drop_out(reg_scaled_dot_product)
        scaled_dot_product_attn = torch.matmul(reg_scaled_dot_product, x_v)
        return scaled_dot_product_attn

In [11]:
x = torch.randn(1, 5, 3)
masked_attn = MaskedScaledDotProductAttention(3, 0)
print(x)

tensor([[[ 0.8741,  0.4530, -0.3194],
         [ 1.7523, -0.4448,  0.0122],
         [-0.7552, -0.8099,  0.4971],
         [-1.3694,  0.1062,  0.0373],
         [ 1.8494, -0.2583, -1.1109]]])


In [12]:
masked_attn = MaskedScaledDotProductAttention(3, 0)
masked_attn(x, x, x)

tensor([[[ 0.8741,  0.4530, -0.3194],
         [ 1.5365, -0.2242, -0.0692],
         [-0.0932, -0.5623,  0.2951],
         [-0.8251, -0.1781,  0.1495],
         [ 1.6474, -0.2322, -0.6866]]])

In [13]:
true_arr = torch.ones(5,5)
mask = torch.tril(true_arr).bool()
print(mask)

tensor([[ True, False, False, False, False],
        [ True,  True, False, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True]])


### 3.2 Masked Multi-Head Attention

In [14]:
class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, dim_model, d_k, d_v, n_head, d_prob):
        super(MaskedMultiHeadAttention, self).__init__()
        self.dim_model = dim_model
        self.d_k = d_k
        self.d_v = d_v
        self.n_head = n_head

        self.w_q = nn.Linear(dim_model, n_head * d_k)
        self.w_k = nn.Linear(dim_model, n_head * d_k)
        self.w_v = nn.Linear(dim_model, n_head * d_v)
        self.w_o = nn.Linear(n_head * d_v, dim_model)

        self.masked_scaled_dot_prod = MaskedScaledDotProductAttention(d_k, d_prob)

    def forward(self, q, k, v):
        x_q = self.w_q(q).view(len(q), -1, self.n_head, self.d_k).transpose(1, 2)
        x_k = self.w_k(k).view(len(k), -1, self.n_head, self.d_k).transpose(1, 2)
        x_v = self.w_v(v).view(len(v), -1, self.n_head, self.d_v).transpose(1, 2)
        scaled_dot_product_attn = self.masked_scaled_dot_prod(x_q, x_k, x_v)
        scaled_dot_product_attn = scaled_dot_product_attn.transpose(1, 2)
        scaled_dot_product_attn = scaled_dot_product_attn.reshape(len(v), -1, self.d_v * self.n_head)
        # return scaled_dot_product_attn.shape, scaled_dot_product_attn
        output = self.w_o(scaled_dot_product_attn)
        return output

In [15]:
x = torch.rand(2,3)
print(x)

tensor([[0.3526, 0.3387, 0.2679],
        [0.0920, 0.1494, 0.5228]])


In [16]:
y = x

In [17]:
x += 10
print(y)

tensor([[10.3526, 10.3387, 10.2679],
        [10.0920, 10.1494, 10.5228]])


In [18]:
x = torch.rand(2,3)
print(y,x)

tensor([[10.3526, 10.3387, 10.2679],
        [10.0920, 10.1494, 10.5228]]) tensor([[0.9076, 0.7688, 0.7885],
        [0.4237, 0.1943, 0.4677]])


In [19]:
mod = MaskedMultiHeadAttention(4, 2, 2, 2, 0)
x = torch.randn(1, 5, 4)
print(x)

tensor([[[-0.8556, -0.7234, -1.3710, -0.7861],
         [-1.5740,  0.0348,  0.0625, -0.6335],
         [-0.2133,  0.2485,  0.8819, -0.8583],
         [ 0.8979,  1.4713, -0.7775, -1.2555],
         [-1.4898, -3.2574,  0.8289,  0.0620]]])


In [20]:
mod(x, x ,x)

tensor([[[ 0.2594, -0.2722, -0.8489,  0.2664],
         [ 0.1221, -0.1657, -0.6419,  0.3495],
         [ 0.0653, -0.2347, -0.3988,  0.4096],
         [ 0.0837, -0.4569, -0.1734,  0.4193],
         [ 0.0174, -0.1026, -0.3378,  0.3357]]], grad_fn=<ViewBackward0>)

In [21]:
class FFNN(nn.Module):
    def __init__(self, dim_model, dim_hidden, d_prob):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(dim_model, dim_hidden)
        self.fc2 = nn.Linear(dim_hidden, dim_model)

        self.relu = nn.ReLU()
        self.drop_out = nn.Dropout(d_prob)

    def forward(self, x):
        output = self.fc2(self.drop_out(self.relu(self.fc1(x))))
        return output

### 3.3 Decoder Layer

In [22]:
class DecoderLayer(nn.Module):
    def __init__(self, dim_model, d_k, d_v, n_head, dim_hidden, d_prob):
        super(DecoderLayer, self).__init__()
        self.dim_model = dim_model
        self.d_k = d_k
        self.d_v = d_v
        self.n_head = n_head
        self.dim_hidden = dim_hidden
        self.d_prob = d_prob

        self.masked_multi_head_attention = MaskedMultiHeadAttention(dim_model, d_k, d_v, n_head, d_prob)
        self.ffnn = FFNN(dim_model, dim_hidden, d_prob)

        self.layer_norm1 = nn.LayerNorm(dim_model)
        self.layer_norm2 = nn.LayerNorm(dim_model)

        self.drop_out = nn.Dropout(d_prob)

    def forward(self, x):
        x_residual = x
        x = self.masked_multi_head_attention(x, x, x)
        x = self.layer_norm1(x + x_residual)
        x_residual = x
        x = self.ffnn(x)
        x = self.layer_norm2(x + x_residual)
        return x

### 3.4 Decoder

In [23]:
class Decoder(nn.Module):
    def __init__(self, dim_model, d_k, d_v, n_head, dim_hidden, d_prob, n_dec_layer):
        super(Decoder, self).__init__()
        self.dim_model = dim_model
        self.d_k = d_k
        self.d_v = d_v
        self.n_head = n_head
        self.dim_hidden = dim_hidden
        self.d_prob = d_prob
        self.n_dec_layer = n_dec_layer

        self.dec_layers = nn.ModuleList([DecoderLayer(dim_model, d_k, d_v, n_head, dim_hidden, d_prob) for _ in range(n_dec_layer)])

    def forward(self, x):
        for layer in self.dec_layers:
            x = layer(x)
        return x

## 4. Transformer

In [24]:
class Transformer(nn.Module):
    def __init__(self, num_token, max_seq_len, dim_model, d_k=64, d_v=64, n_head=8, dim_hidden=2048, d_prob=0.1, n_dec_layer=6):
        super(Transformer, self).__init__()

        self.num_token = num_token
        self.max_seq_len = max_seq_len
        self.embed = Embedding_Layer(num_token=num_token, dim_model=dim_model, max_seq_len=max_seq_len, d_prob=d_prob)
        self.decoder = Decoder(dim_model, d_k, d_v, n_head, dim_hidden, d_prob, n_dec_layer)
        self.linear = nn.Linear(dim_model, num_token)
        self.logsoftmax = nn.LogSoftmax(dim=-1)

    def forward(self, src):
        positional_encoded_src = self.embed(src) # [128, 20, 512]
        # return positional_encoded_src
        dec_output = self.decoder(positional_encoded_src)
        outputs = self.logsoftmax(self.linear(dec_output))
        return outputs


In [25]:
batch = 4
voc_size = 2
max_len =10
x = torch.ones(batch, max_len).long()
model = Transformer(voc_size, max_len, 32, 8, 8, 4, 128, 0, 2)
y = model(x)
y.shape # batch, max_len, voc_size

torch.Size([4, 10, 2])

### Training Transformer

In [26]:
def train(dataloader, epochs, model, criterion, vocab, i2w):
	model.train()
	model.zero_grad()
	optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay, betas=(beta1, beta2), eps=eps)
	correct = 0

	cnt = 0
	total_score = 0.
	global_step = 0
	tr_loss = 0.
	for epoch in range(epochs):

		for idx, (src, tgt) in enumerate(dataloader):
			src, tgt = src.to(device), tgt.to(device) # Russian, English sentences

			src = tgt[:, :-1]
			tgt = tgt[:, 1:]
			# print(src.size(), tgt.size())
			# print(src)
			# print(tgt)
			# return 0
			optimizer.zero_grad()

			outputs = model(src)

			tgt = torch.flatten(tgt)
			outputs = outputs.reshape(len(tgt), -1)
			loss = criterion(outputs, tgt)
			tr_loss += loss.item()

			loss.backward()
			torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
			optimizer.step()
			global_step += 1

			pred = outputs.argmax(dim=1, keepdim=True)
			pred_acc = pred
			tgt_acc = tgt

			correct += pred_acc.eq(tgt_acc.view_as(pred_acc)).sum().item()

			cnt += tgt_acc.shape[0]
			score = 0.

			# with torch.no_grad():
			# 	pred = pred.reshape(batch_size, max_len, -1).detach().cpu().tolist()
			# 	tgt = tgt.reshape(batch_size, max_len).detach().cpu().tolist()
			# 	for p, t in zip(pred, tgt):
			# 		eos_idx = t.index(vocab['[PAD]']) if vocab['[PAD]'] in t else len(t)
			# 		p_seq = [i2w[i[0]] for i in p][:eos_idx]
			# 		t_seq = [i2w[i] for i in t][:eos_idx]
			# 		k = 4 if len(t_seq) > 4 else len(t_seq)
			# 		s = utils.bleu_score(p_seq, t_seq, k=k)
			# 		score += s
			# 		total_score += s

			# score /= batch_size

			print("\r[epoch {:3d}/{:3d}] [batch {:4d}/{:4d}] loss: {:.6f} acc: {:.4f} BLEU: {:.4f})".format(
				epoch, n_epochs, idx + 1, len(dataloader), loss, correct / cnt, score), end=' ')

	tr_loss /= cnt
	tr_acc = correct / cnt
	tr_score = total_score / len(dataloader.dataset) / epochs

	return tr_loss, tr_acc, tr_score

def eval(dataloader, model, lengths=None):
    # Implement th
		pass


In [None]:
utils.set_random_seed(42)
dataset_dir = '2023/seminar_transformers/data/wmt15_russian_to_english'
# dataset_dir = '../data/wmt16_turkish_to_english'
max_len=20
print(max_len)
trn_dataset = dataloader.MiniWMT15_en_ru_Dataset(max_len=20, src_filepath=f'{dataset_dir}/src_train.txt', tgt_filepath=f'{dataset_dir}/tgt_train.txt', vocab=(None, None), is_src=True, is_tgt=False, is_train=True)
test_dataset = dataloader.MiniWMT15_en_ru_Dataset(max_len=20, src_filepath=f'{dataset_dir}/src_test.txt', tgt_filepath=None, vocab=(trn_dataset.vocab, None), is_src=True, is_tgt=False, is_train=False)
vocab = trn_dataset.vocab

i2w = {v: k for k, v in vocab.items()}
trn_dataloader = DataLoader(trn_dataset, batch_size=128, shuffle=True, drop_last=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False, num_workers=2)
n_token = len(trn_dataset.vocab)
model = Transformer(num_token=n_token, max_seq_len=max_len, dim_model=model_dim, d_k=d_k, d_v=d_v, n_head=n_head, dim_hidden=hidden_size, d_prob=d_prob, n_dec_layer=num_dec_layers)
model = model.to(device)
criterion = nn.NLLLoss(ignore_index=vocab['[PAD]'])

n_epochs=5
tr_loss, tr_acc, tr_score = train(trn_dataloader, n_epochs, model, criterion, vocab, i2w)
# tr_loss, tr_acc, tr_score = train_debug(trn_dataloader, 1, model, criterion, vocab, i2w)
# print("tr: ({:.4f}, {:5.2f}, {:5.2f}) | ".format(tr_loss, tr_acc * 100, tr_score * 100), end='')


20
[epoch   0/  5] [batch    7/   7] loss: 5.645875 acc: 0.1004 BLEU: 0.0000) 

In [None]:
def generate_sentence():
  input_text = torch.zeros(1,1).long()

  output_text = []
  for i in range(20):
    outputs = model(input_text)
    pred = outputs[:, -1:].argmax(dim=-1)
    input_text = torch.concat([input_text, pred], dim=1)

    output_text.append(pred)
  return " ".join([i2w[i] for i in input_text.flatten().tolist()])
generate_sentence()