# Import necessary libraries

## Pip install

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install datasets evaluate sacrebleu transformers[torch]
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

## Others

In [None]:
import json
import os
import pandas as pd
import numpy as np
import torch

import torch.nn as nn
import torch.optim as optim
from torch.utils import data

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset

In [None]:
%cd "/content/drive/MyDrive/ML/MachineTranslation"

/content/drive/MyDrive/ML/MachineTranslation


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Config

In [None]:
# Fixed token length that every source/target sequence is padded or truncated to;
# also bounds the number of decoding steps.
max_token_length = 128
# Number of training sentence pairs randomly selected from the full training set
sample_used = 10000
# Number of training epochs passed to train()
epochs = 5

# Optimisation settings: DataLoader batch size and Adam learning rate / weight decay
batch_size = 64
learning_rate = 2e-5
weight_decay = 1e-3

# Architecture settings shared by the encoder and the decoder
embeddings_size = 300
hidden_size = 256
num_layers = 2
dropout_rate = .5


# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed
seq_len = 64

# Data preprocessing

## Preprocessing

In [None]:
path_train_data_en = os.path.join(os.getcwd(), "train.en")
path_train_data_vi = os.path.join(os.getcwd(), "train.vi")

# Read both sides of the parallel corpus, one sentence per line.
# The encoding is explicit because the Vietnamese text is non-ASCII and the
# platform default encoding is not guaranteed to be UTF-8.
with open(path_train_data_en, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_train_en = pd.DataFrame(text, columns=["english"])

with open(path_train_data_vi, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_train_vn = pd.DataFrame(text, columns=["vietnamese"])

# A column-wise concat silently pads the shorter side with NaN, so fail loudly
# if the two files are not line-aligned.
assert len(df_train_en) == len(df_train_vn), "train.en and train.vi must have the same number of lines"

df_train = pd.concat([df_train_vn, df_train_en], axis=1)
df_train

Unnamed: 0,vietnamese,english
0,Khoa học đằng sau một tiêu đề về khí hậu,Rachel Pike : The science behind a climate hea...
1,"Trong 4 phút , chuyên gia hoá học khí quyển Ra...","In 4 minutes , atmospheric chemist Rachel Pike..."
2,Tôi muốn cho các bạn biết về sự to lớn của nhữ...,I &apos;d like to talk to you today about the ...
3,Có những dòng trông như thế này khi bàn về biế...,Headlines that look like this when they have t...
4,Cả hai đều là một nhánh của cùng một lĩnh vực ...,They are both two branches of the same field o...
...,...,...
133312,Tôi muốn kết luận rằng hành động của hàng ngàn...,I want to end by saying it &apos;s been the ac...
133313,Rất cảm ơn đã lắng nghe .,Thank you very much for your time .
133314,,
133315,Paul Pholeros : Làm sao để bớt nghèo khổ ? Hãy...,Didier Sornette : How we can predict the next ...


In [None]:
path_val_data_en = os.path.join(os.getcwd(), "tst2012.en")
path_val_data_vi = os.path.join(os.getcwd(), "tst2012.vi")

# Read both sides of the validation corpus, one sentence per line.
# The encoding is explicit because the Vietnamese text is non-ASCII and the
# platform default encoding is not guaranteed to be UTF-8.
with open(path_val_data_en, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_val_en = pd.DataFrame(text, columns=["english"])

with open(path_val_data_vi, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_val_vn = pd.DataFrame(text, columns=["vietnamese"])

# A column-wise concat silently pads the shorter side with NaN, so fail loudly
# if the two files are not line-aligned.
assert len(df_val_en) == len(df_val_vn), "tst2012.en and tst2012.vi must have the same number of lines"

df_val = pd.concat([df_val_vn, df_val_en], axis=1)
df_val

Unnamed: 0,vietnamese,english
0,Làm sao tôi có thể trình bày trong 10 phút về ...,How can I speak in 10 minutes about the bonds ...
1,Câu chuyện này chưa kết thúc .,This is not a finished story .
2,Nó là một trò chơi ghép hình vẫn đang được xếp .,It is a jigsaw puzzle still being put together .
3,Hãy để tôi kể cho các bạn về vài mảnh ghép nhé .,Let me tell you about some of the pieces .
4,Hãy tưởng tượng mảnh đầu tiên : một người đàn ...,Imagine the first piece : a man burning his li...
...,...,...
1548,Đây không phải là vấn đề giữa quyền riêng tư v...,This is not a question between privacy against...
1549,Đây là một vấn đề giữa tự do và sự kiểm soát .,It &apos;s a question of freedom against contr...
1550,Và trong khi chúng ta vẫn tin tưởng vào chính ...,And while we might trust our governments right...
1551,"Và liệu chúng ta có còn tin tưởng , một cách m...","And do we trust , do we blindly trust , any fu..."


In [None]:
path_test_data_en = os.path.join(os.getcwd(), "tst2013.en")
path_test_data_vi = os.path.join(os.getcwd(), "tst2013.vi")

# Read both sides of the test corpus, one sentence per line.
# The encoding is explicit because the Vietnamese text is non-ASCII and the
# platform default encoding is not guaranteed to be UTF-8.
with open(path_test_data_en, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_test_en = pd.DataFrame(text, columns=["english"])

with open(path_test_data_vi, "r", encoding="utf-8") as file:
    text = [line.strip() for line in file]
    df_test_vn = pd.DataFrame(text, columns=["vietnamese"])

# A column-wise concat silently pads the shorter side with NaN, so fail loudly
# if the two files are not line-aligned.
assert len(df_test_en) == len(df_test_vn), "tst2013.en and tst2013.vi must have the same number of lines"

df_test = pd.concat([df_test_vn, df_test_en], axis=1)
df_test

Unnamed: 0,vietnamese,english
0,"Khi tôi còn nhỏ , Tôi nghĩ rằng BắcTriều Tiên ...","When I was little , I thought my country was t..."
1,Tôi đã rất tự hào về đất nước tôi .,And I was very proud .
2,"Ở trường , chúng tôi dành rất nhiều thời gian ...","In school , we spent a lot of time studying th..."
3,Mặc dù tôi đã từng tự hỏi không biết thế giới ...,Although I often wondered about the outside wo...
4,"Khi tôi lên 7 , tôi chứng kiến cảnh người ta x...","When I was seven years old , I saw my first pu..."
...,...,...
1263,"Tôi thực sự tin , nếu ta coi người khác như nh...","I truly believe , if we can see one another as..."
1264,Những tấm hình không phải là về bản thân vấnđề...,These images are not of issues . They are of p...
1265,Không có ngày nào mà tôi không nghĩ về những n...,There is not a day that goes by that I don &ap...
1266,Tôi hi vọng những tấm hình sẽ đánh thức một ng...,I hope that these images awaken a force in tho...


In [None]:
# Build one Hugging Face Dataset per split, each holding the "english" and
# "vietnamese" columns of the matching DataFrame.
hf_train, hf_val, hf_test = (
    Dataset.from_dict({column: frame[column] for column in ("english", "vietnamese")})
    for frame in (df_train, df_val, df_test)
)

In [None]:
import random

random.seed(42)

# Draw `sample_used` distinct row indices. The count is clamped to the dataset
# size because random.sample raises ValueError when asked for more items than
# the population holds.
num_samples = min(sample_used, len(hf_train))
random_indices = random.sample(range(len(hf_train)), num_samples)

# Keep only the sampled rows for training
hf_train = hf_train.select(random_indices)

In [None]:
from transformers import AutoTokenizer

# Only the tokenizer of the pretrained checkpoint is loaded in this cell
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Tokenizer vocabulary; its size is used as the vocabulary size of the encoder and decoder
vocab_dict = tokenizer.get_vocab()

In [None]:
source_lang = "english"
target_lang = "vietnamese"
# NOTE(review): `prefix` is defined but never prepended to the inputs below —
# confirm whether it is still wanted.
prefix = "translate English to Vietnamese: "


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into fixed-length id sequences.

    Args:
        examples: Batch mapping column name -> list of strings, as passed by
            `Dataset.map(..., batched=True)`; must contain the `source_lang`
            and `target_lang` columns.

    Returns:
        The tokenizer output for the batch: the encoded source sentences plus
        the encoded target sentences (passed as `text_target`), both padded or
        truncated to `max_token_length` tokens. Downstream code reads the
        `input_ids` and `labels` entries.
    """
    # The leftover debug `print(examples)` was removed: it dumped every raw
    # batch of the corpus into the cell output.
    inputs = list(examples[source_lang])
    targets = list(examples[target_lang])
    return tokenizer(inputs, text_target=targets, padding="max_length", max_length=max_token_length, truncation=True)

In [None]:
# Tokenize every split batch-wise with preprocess_function
hf_val_tokenized = hf_val.map(preprocess_function, batched=True)
hf_train_tokenized = hf_train.map(preprocess_function, batched=True)
hf_test_tokenized = hf_test.map(preprocess_function, batched=True)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` receives the checkpoint name string here rather than a model object,
# and `data_collator` is not referenced by the custom DataLoader/training code visible in
# this notebook — confirm whether this cell is still needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, padding="max_length", max_length=max_token_length)

## Data Loader

In [None]:
class CustomDataset(data.Dataset):
    """Map-style dataset yielding `(input_ids, labels)` tensor pairs.

    Args:
        data: Column-indexable container exposing `"input_ids"` and
            `"labels"`, each holding one token-id sequence per example.
    """

    def __init__(self, data):
        self.data = data["input_ids"]
        self.label = data["labels"]
        # Count examples from the column itself: len() of a plain dict would
        # return the number of columns rather than the number of rows.
        self.length = len(self.data)

    def __len__(self):
        """Return the number of examples."""
        return self.length

    def __getitem__(self, index):
        """Return the `index`-th example as two int64 tensors."""
        input_ids = torch.tensor(self.data[index], dtype=torch.int64)
        labels = torch.tensor(self.label[index], dtype=torch.int64)

        return (input_ids, labels)

In [None]:
# Wrap each tokenized split in the PyTorch dataset class defined in this notebook
torch_train_set = CustomDataset(hf_train_tokenized)
torch_val_set = CustomDataset(hf_val_tokenized)
torch_test_set = CustomDataset(hf_test_tokenized)

In [None]:
# Only the training loader is shuffled; validation and test batches keep the
# dataset order so that evaluation runs are repeatable.
torch_train_loader = data.DataLoader(torch_train_set, batch_size=batch_size, shuffle=True)
torch_val_loader = data.DataLoader(torch_val_set, batch_size=batch_size, shuffle=False)
torch_test_loader = data.DataLoader(torch_test_set, batch_size=batch_size, shuffle=False)

# Model

In [None]:
def init_lstm_weights(lstm, low=-0.08, high=0.08):
    """Re-initialise every parameter of `lstm` in place and return the module.

    Parameters whose name contains 'weight' are drawn uniformly from
    [low, high]; parameters whose name contains 'bias' are set to zero.

    Args:
        lstm: Module whose `named_parameters()` are re-initialised.
        low: Lower bound of the uniform range used for weights.
        high: Upper bound of the uniform range used for weights.

    Returns:
        The same `lstm` object, so the call can be used in an assignment.
    """
    for name, param in lstm.named_parameters():
        if 'weight' in name:
            nn.init.uniform_(param, low, high)
        elif 'bias' in name:
            nn.init.zeros_(param)

    # Return only after the loop has finished: returning from inside the loop
    # stopped after the very first parameter and left the rest untouched.
    return lstm

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the LSTM's final state."""

    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, drop_out):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(drop_out)
        # Token id 0 is declared as the padding entry of the embedding table
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            padding_idx=0,
            max_norm=None,
            norm_type=2.0,
            scale_grad_by_freq=False,
            sparse=False,
        )

        # Build the recurrent stack, then hand it to the notebook's LSTM initialiser
        recurrent = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_out,
        )
        self.rnn = init_lstm_weights(recurrent)

    def forward(self, x):
        """Encode a batch of source sequences.

        Args:
            x: Token ids, shape (seq_max_len, batch_size).

        Returns:
            `(hidden, cell)` — the state pair returned by the LSTM after the
            whole sequence has been consumed.
        """
        embedded = self.dropout(self.embeddings(x))
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """LSTM decoder that advances one time step per call and scores the vocabulary."""

    def __init__(self, vocab_size, embedding_size, hidden_size, num_layers, drop_out):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(drop_out)
        # Token id 0 is declared as the padding entry of the embedding table
        self.embeddings = nn.Embedding(
            vocab_size,
            embedding_size,
            padding_idx=0,
            max_norm=None,
            norm_type=2.0,
            scale_grad_by_freq=False,
            sparse=False,
        )

        # Build the recurrent stack, then hand it to the notebook's LSTM initialiser
        recurrent = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_out,
        )
        self.rnn = init_lstm_weights(recurrent)

        # Projects each LSTM output vector onto vocabulary-sized scores
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run a single decoding step.

        Args:
            x: Current input token ids, shape (batch_size).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            `(predictions, hidden, cell)` — the vocabulary scores for this
            step (leading step axis squeezed away) and the updated state.
        """
        # Add a leading axis of length 1 so the LSTM sees a one-step sequence
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embeddings(step_input))

        step_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        predictions = self.fc(step_output).squeeze(0)
        return predictions, hidden, cell

In [None]:
class Seq2SeqEncoderDecoder(nn.Module):
    """Encoder-decoder wrapper that decodes one token per step.

    Args:
        encoder: Module mapping `source` to an initial `(hidden, cell)` state.
        decoder: Module called as `decoder(tokens, hidden, cell)` and returning
            `(logits, hidden, cell)`.
        rev_words_dict: Mapping from token id to token; its size is used as
            the vocabulary size of the output tensor.
    """

    def __init__(self, encoder, decoder, rev_words_dict):
        super(Seq2SeqEncoderDecoder, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.rev_words_dict = rev_words_dict
        self.vocab_len = len(rev_words_dict)

    def forward(self, source, target=None, teacher_force_ratio=.5):
        """Decode a batch of source sequences.

        Args:
            source: Source token ids, shape (seq_len, batch_size).
            target: Optional target token ids, shape (target_len, batch_size).
                When given, it fixes the number of decoding steps and supplies
                the teacher-forced decoder inputs.
            teacher_force_ratio: Probability, evaluated independently at every
                step, of feeding `target[i]` instead of the model's own
                prediction as the next decoder input. Ignored without `target`.

        Returns:
            Float tensor of shape (target_len, batch_size, vocab_len) in which
            row `i` holds the scores predicted for target position `i`.
        """
        batch_size = source.shape[1]
        # Decode as many steps as there are target positions; without a target
        # fall back to the notebook-wide maximum length.
        target_len = max_token_length if target is None else target.shape[0]

        # Allocated on the input's device so the module does not depend on a
        # global device variable.
        outputs = torch.zeros(target_len, batch_size, self.vocab_len, device=source.device)

        hidden, cell = self.encoder(source)

        # First decoder input: token id 1 for every sequence of the batch. It
        # must be a 1-D tensor of length batch_size — a 0-d scalar makes the
        # LSTM treat its input as unbatched and reject the batched state.
        x = torch.full((batch_size,), 1, dtype=torch.long, device=source.device)

        for i in range(target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Step i predicts target position i, so it is stored in row i.
            # (Storing into a buffer that was later sliced with [1:] dropped
            # the first prediction and shifted all logits against the labels.)
            outputs[i] = output

            best_guess = output.argmax(1)

            if target is not None and random.random() < teacher_force_ratio:
                x = target[i]
            else:
                x = best_guess

        return outputs

# Train

## Initialize, load, save model

In [None]:
# Inverse of vocab_dict: maps each vocabulary value back to its key
rev_vocab_dict = {token_id: token for token, token_id in vocab_dict.items()}

In [None]:
def init_model():
    """Build a fresh model together with its loss and optimizer.

    All sizes and optimisation settings come from the notebook's config
    variables; the model is placed on `device`.

    Returns:
        Tuple `(model, criterion, optimizer)`: a `Seq2SeqEncoderDecoder`, a
        cross-entropy loss that skips padding positions, and an Adam optimizer
        over the model's parameters.
    """
    vocab_size = len(vocab_dict)
    encoder = Encoder(vocab_size, embeddings_size, hidden_size, num_layers, dropout_rate)
    decoder = Decoder(vocab_size, embeddings_size, hidden_size, num_layers, dropout_rate)

    model = Seq2SeqEncoderDecoder(encoder, decoder, rev_vocab_dict).to(device=device)

    # Targets are padded to a fixed length with the tokenizer's pad id; ignoring
    # those positions keeps the padding from dominating the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id).to(device=device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    return model, criterion, optimizer

def save_model(model, optimizer, epoch, path):
    """Write a checkpoint (epoch plus model and optimizer state dicts) to `path`."""
    checkpoint_state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint_state, path)

def load(model, optimizer, path):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module whose `load_state_dict` receives the stored model state.
        optimizer: Optimizer whose `load_state_dict` receives the stored state.
        path: Checkpoint file containing the keys `model_state_dict`,
            `optimizer_state_dict` and `epoch`.

    Returns:
        Tuple `(model, optimizer, epoch)`: the two objects passed in, now
        restored, and the epoch value stored in the checkpoint.
    """
    # map_location remaps the stored tensors onto the notebook's current device
    state = torch.load(path, map_location=torch.device(device))

    model.load_state_dict(state["model_state_dict"])
    optimizer.load_state_dict(state["optimizer_state_dict"])

    return model, optimizer, state["epoch"]

## Metrics + Summary

In [None]:
import evaluate

# SacreBLEU metric object; compute_metrics calls its `compute` method
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
def compute_metrics(dataset, model=None):
    """Compute the SacreBLEU score of `model` over all batches of a loader.

    Args:
        dataset: Iterable of `(input_ids, labels)` batches, each of shape
            (batch_size, seq_len).
        model: Model to evaluate. When None, a fresh model is built and its
            weights are loaded from `translation_seq2seq.pt` in the current
            working directory.

    Returns:
        Dict with the single key `"bleu"` holding the metric's score.
    """
    if model is None:
        MODEL_SAVE_PATH = os.path.join(os.getcwd(), "translation_seq2seq.pt")
        model, _, optimizer = init_model()
        model, _, _ = load(model, optimizer, MODEL_SAVE_PATH)

    model.eval()

    total_pred = []
    total_label = []

    # Pure evaluation: without no_grad every batch would build and keep an
    # autograd graph over the full logits tensor.
    with torch.no_grad():
        for batch_inputs, batch_labels in dataset:
            batch_inputs = batch_inputs.to(device=device)
            batch_labels = batch_labels.to(device=device)

            # The model works on (seq_len, batch_size) tensors
            source = torch.moveaxis(batch_inputs, 1, 0)
            target = torch.moveaxis(batch_labels, 1, 0)

            # teacher_force_ratio=0: the decoder is fed only its own predictions
            prob = model(source, target, 0)

            # Back to (batch_size, seq_len) so each row decodes to one sentence
            pred = torch.moveaxis(torch.argmax(prob, dim=2), 0, 1)

            total_pred.extend(tokenizer.batch_decode(pred, skip_special_tokens=True))
            total_label.extend(tokenizer.batch_decode(batch_labels, skip_special_tokens=True))

    result = metric.compute(predictions=total_pred, references=total_label)
    return {"bleu": result["score"]}

In [None]:
compute_metrics(torch_test_loader)

<class 'collections.OrderedDict'>


{'bleu': 0.0}

In [None]:
def summary(loader, model, criterion):
    """Evaluate `model` on `loader`: token accuracy, mean loss and BLEU.

    Args:
        loader: Iterable of `(input_ids, labels)` batches, each of shape
            (batch_size, seq_len); must support `len()`.
        model: Model to evaluate; it is switched to eval mode.
        criterion: Loss called with (batch, classes, seq_len) scores and
            (batch, seq_len) labels.

    Returns:
        Tuple `(acc, loss_avg, bleu_score)`: token-level accuracy in percent,
        loss averaged over batches, and the dict returned by `compute_metrics`.
    """
    num_correct = 0
    num_samples = 0
    loss_epoch = 0.0

    model.eval()

    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device=device)
            batch_labels = batch_labels.to(device=device)

            # The model expects (seq_len, batch_size) tensors, exactly as in
            # train(); feeding the loader's (batch_size, seq_len) layout
            # directly makes the prediction and label shapes disagree.
            source = torch.moveaxis(batch_inputs, 1, 0)
            target = torch.moveaxis(batch_labels, 1, 0)

            # No teacher forcing during evaluation (same as compute_metrics),
            # which also keeps the reported numbers deterministic.
            prob = model(source, target, 0)

            pred = torch.argmax(prob, dim=2)

            num_correct += (pred == target).sum().item()
            num_samples += pred.numel()

            # (seq_len, batch, vocab) -> (batch, vocab, seq_len) for the loss
            loss = criterion(torch.moveaxis(prob, (1, 2), (0, 1)), batch_labels)

            loss_epoch += loss.item()

        # Guard the averages against an empty loader
        acc = float(num_correct) / float(num_samples) * 100.0 if num_samples else 0.0
        loss_avg = loss_epoch / float(len(loader)) if len(loader) else 0.0

        bleu_score = compute_metrics(loader, model)
    return acc, loss_avg, bleu_score

## Actual training loop

In [None]:
def train(train_loader, val_loader, num_epochs, batch_print=40):
    """Train the seq2seq model, checkpointing regularly and validating every epoch.

    If `translation_seq2seq.pt` exists in the current working directory,
    training resumes from it: every epoch up to and including the epoch index
    stored in the checkpoint is skipped.

    Args:
        train_loader: Iterable of `(input_ids, labels)` batches, each of shape
            (batch_size, seq_len); must support `len()`.
        val_loader: Loader passed to `summary` after every epoch.
        num_epochs: Total number of epochs, counting epochs already completed
            by a loaded checkpoint.
        batch_print: Log and checkpoint every `batch_print` batches.

    Returns:
        Dict of per-epoch lists with the keys `train_acc`, `train_loss`,
        `val_acc`, `val_loss` and `val_bleu` (only epochs run by this call).
    """
    history = {
        "train_acc": [],
        "train_loss": [],
        "val_acc": [],
        "val_loss": [],
        "val_bleu": [],
    }

    cur_epoch = -1

    model, criterion, optimizer = init_model()

    MODEL_SAVE_PATH = os.path.join(os.getcwd(), "translation_seq2seq.pt")

    if os.path.exists(MODEL_SAVE_PATH):
        model, optimizer, cur_epoch = load(model, optimizer, path=MODEL_SAVE_PATH)

    # Start right after the last epoch recorded as completed
    for epoch in range(cur_epoch + 1, num_epochs):
        correct_samples = 0
        total_samples = 0

        loss_epoch = 0.0

        print("----------------------------------------")

        model.train()

        for batch_idx, (batch_inputs, batch_labels) in enumerate(train_loader):
            # Data to CUDA if possible
            batch_inputs = batch_inputs.to(device=device)
            batch_labels = batch_labels.to(device=device)

            # The model works on (seq_len, batch_size) tensors
            source = torch.moveaxis(batch_inputs, 1, 0)
            target = torch.moveaxis(batch_labels, 1, 0)

            optimizer.zero_grad()

            # No retain_grad() on the logits or the loss: it is not needed for
            # backward() and would keep a gradient copy of the full
            # (seq_len, batch, vocab) tensor in memory.
            prob = model(source, target)

            pred = torch.argmax(prob, dim=2)

            current_correct = (pred == target).sum().item()
            current_size = pred.numel()

            correct_samples += current_correct
            total_samples += current_size

            # (seq_len, batch, vocab) -> (batch, vocab, seq_len) for the loss
            loss = criterion(torch.moveaxis(prob, (1, 2), (0, 1)), batch_labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

            loss_epoch += loss.item()

            if batch_idx % batch_print == batch_print - 1:
                print(f"Batch {batch_idx + 1}: Accuracy: {float(current_correct) / float(current_size) * 100.0}")
                print(f"Loss: {float(loss.item())}")
                # Mid-epoch checkpoint: store the last *completed* epoch so that
                # a resumed run repeats this unfinished epoch instead of
                # skipping it.
                save_model(model=model, optimizer=optimizer, epoch=epoch - 1, path=MODEL_SAVE_PATH)

        # Validation
        val_acc, val_loss, bleu_score = summary(val_loader, model, criterion)

        history["train_acc"].append(float(correct_samples) / float(max(total_samples, 1)) * 100.0)
        history["train_loss"].append(float(loss_epoch) / float(max(len(train_loader), 1)))

        history["val_acc"].append(val_acc)
        history["val_loss"].append(val_loss)
        history["val_bleu"].append(bleu_score)

        # End-of-epoch checkpoint: this epoch is now complete
        save_model(model=model, optimizer=optimizer, epoch=epoch, path=MODEL_SAVE_PATH)

        print(f"Epoch {epoch + 1}:")

        print(f"Train accuracy: {history['train_acc'][-1]}%")
        print(f"Train loss: {history['train_loss'][-1]}")

        print(f"Val accuracy: {history['val_acc'][-1]}%")
        print(f"Val loss: {history['val_loss'][-1]}")
        print(f"Val bleu score: {history['val_bleu'][-1]}")

    return history

In [None]:
train(torch_train_loader, torch_val_loader, num_epochs=epochs, batch_print=50)

----------------------------------------
Batch 50: Accuracy: 0.6591796875
Loss: 10.346455574035645
Batch 100: Accuracy: 31.5673828125
Loss: 10.265756607055664
Batch 150: Accuracy: 48.6083984375
Loss: 10.139357566833496
Epoch 1:
Train accuracy: 20.732343750000002%
Train loss: 10.288811920554775
Val accuracy: 47.124014005151324%
Val loss: 10.047464866638183
----------------------------------------
Batch 50: Accuracy: 45.52001953125
Loss: 8.717680931091309
Batch 100: Accuracy: 47.59521484375
Loss: 6.274238109588623
Batch 150: Accuracy: 39.794921875
Loss: 5.552935600280762
Epoch 2:
Train accuracy: 43.3440625%
Train loss: 7.573926813283544
Val accuracy: 45.18975370251127%
Val loss: 5.527697486877441
----------------------------------------
Batch 50: Accuracy: 45.3369140625
Loss: 4.532596111297607
Batch 100: Accuracy: 47.5341796875
Loss: 3.9897632598876953
Batch 150: Accuracy: 44.25048828125
Loss: 3.7777466773986816
Epoch 3:
Train accuracy: 42.8659375%
Train loss: 4.417494684267956
Val accur

# Evaluation + Inference

In [None]:
compute_metrics(torch_test_loader)

In [None]:
text = "I have a cat"

In [None]:
def inference_directly(text):
    """Translate one sentence with the checkpointed model.

    A fresh model is built and its weights are loaded from
    `translation_seq2seq.pt` in the current working directory.

    Args:
        text: Source sentence as a plain string.

    Returns:
        The decoded prediction string (special tokens removed).
    """
    # Tokenize to exactly max_token_length ids; truncation keeps longer inputs
    # from exceeding the fixed length.
    inputs = tokenizer(text, padding="max_length", max_length=max_token_length, truncation=True, return_tensors="pt")

    MODEL_SAVE_PATH = os.path.join(os.getcwd(), "translation_seq2seq.pt")
    model, _, optimizer = init_model()
    model, _, _ = load(model, optimizer, MODEL_SAVE_PATH)
    # Inference mode: disables dropout
    model.eval()

    # (1, seq_len) -> (seq_len, 1), placed on the same device as the model
    source = torch.moveaxis(inputs["input_ids"], 1, 0).to(device=device)

    # Generate predictions
    with torch.no_grad():
        logits = model(source)

    # The model returns per-step vocabulary scores; pick the best token id at
    # every step and drop the batch axis before decoding.
    token_ids = logits.argmax(dim=2).squeeze(1)

    return tokenizer.decode(token_ids, skip_special_tokens=True)

In [None]:
inference_directly(text)

<class 'collections.OrderedDict'>
torch.Size([1, 128])


RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors