<a href="https://colab.research.google.com/github/ejmejm/multilingual-nmt-mt5/blob/main/nmt_full_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install the tokenizer/model/dataset libraries and the BLEU scorer into the
# current runtime. No versions are pinned here.
!pip install transformers sentencepiece datasets
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.8.2 sacrebleu-2.4.0


In [4]:
import os
import re
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sacrebleu
import seaborn as sns
import torch
from datasets import load_dataset
from torch import optim
from torch.nn import functional as F
from tqdm import tqdm
from tqdm import tqdm_notebook
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MT5Config, MT5ForConditionalGeneration, MT5Tokenizer
from transformers import get_linear_schedule_with_warmup

sns.set()

In [None]:
# Whitelist patterns: each one matches every character that is NOT a letter of
# the language's alphabet or whitespace, so substituting matches with '' keeps
# only letters and spaces. IGNORECASE lets the lowercase classes cover
# uppercase input as well.
# The Vietnamese class lists every tone-marked vowel, including 'ý'.
regex_vietnamese = re.compile(r'[^a-zđáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỵỷỹ\s]', re.IGNORECASE)
# The Spanish class includes 'ü' (as in "pingüino") next to the accented vowels and 'ñ'.
regex_spanish = re.compile(r'[^a-záéíóúüñ\s]', re.IGNORECASE)

def clean_text(text, regex):
    """Strip unwanted characters from ``text`` and normalise case and spacing.

    Args:
        text: Raw cell value. Anything falsy or not a ``str`` (e.g. NaN)
            yields an empty string.
        regex: Compiled pattern matching the characters to delete.

    Returns:
        The lower-cased text with matched characters removed and every run of
        whitespace collapsed to a single space, or ``""`` for unusable input.
    """
    if not text or not isinstance(text, str):
        return ""
    # Compose decomposed sequences (base letter + combining mark) into single
    # code points first; otherwise a whitelist of precomposed letters would
    # delete the combining mark and silently drop the accent.
    text = unicodedata.normalize('NFC', text)
    text = regex.sub('', text).lower().strip()
    return " ".join(text.split())

def load_and_prepare_data_vi_es(file_path):
    """Read the vi-es parallel corpus from CSV and clean both text columns.

    Args:
        file_path: Path of a CSV file that has a 'vi' and an 'es' column.

    Returns:
        A DataFrame with cleaned 'vi' / 'es' text, without rows that are
        missing either side, without duplicate rows, and with a fresh
        0..n-1 index.
    """
    df = pd.read_csv(file_path)
    df['vi'] = df['vi'].apply(lambda x: clean_text(x, regex_vietnamese))
    df['es'] = df['es'].apply(lambda x: clean_text(x, regex_spanish))

    # clean_text maps missing / non-string cells to "", so dropna() alone can
    # no longer detect them in these two columns. Filter on emptiness
    # explicitly so no sentence pair with an empty side survives.
    has_both_sides = (df['vi'] != "") & (df['es'] != "")
    df = df[has_both_sides].dropna().drop_duplicates().reset_index(drop=True)
    return df

# Load and clean the vi-es corpus from a Google Drive path
# (assumes Drive is mounted at /content/drive — TODO confirm).
df_vi_es = load_and_prepare_data_vi_es('/content/drive/MyDrive/Colab Notebooks/data/vi-es/df_vi_es.csv')

In [None]:
# Load the tokenizer published with the google/mt5-small checkpoint.
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Start from a default MT5Config; the model below is constructed from this
# config object rather than via from_pretrained.
config = MT5Config()

# Use the padding token id as the first token fed to the decoder.
config.decoder_start_token_id = config.pad_token_id

model = MT5ForConditionalGeneration(config)


In [7]:

# Language code -> tag token. The tag of the *target* language is prepended to
# each source sentence before it is encoded.
LANG_TOKEN_MAPPING = dict(vi='<vi>', es='<es>')

In [8]:
# Register the language tags as additional special tokens of the tokenizer,
# then resize the model's token embeddings to the enlarged vocabulary size.
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(250102, 512)

In [None]:
def encode_str(text, tokenizer, seq_len):
    """Encode ``text`` into a ``(1, seq_len)`` tensor of token ids.

    The text is tokenised with truncation at ``seq_len``; shorter results are
    right-padded with ``tokenizer.pad_token_id``.

    Args:
        text: String to tokenise.
        tokenizer: Object exposing ``encode(...)`` and ``pad_token_id``.
        seq_len: Fixed output length.

    Returns:
        A tensor with one row and ``seq_len`` columns.
    """
    token_ids = tokenizer.encode(text, max_length=seq_len, truncation=True, return_tensors="pt")

    missing = seq_len - token_ids.size(1)
    if missing <= 0:
        # Already at full length: nothing to pad.
        return token_ids

    pad_block = torch.full((1, missing), tokenizer.pad_token_id)
    return torch.cat([token_ids, pad_block], dim=1)

In [None]:


def encode_str_with_lang_tag(text, target_lang, tokenizer, seq_len):
    """Encode ``text`` prefixed with the tag token of ``target_lang``.

    Args:
        text: Source sentence.
        target_lang: Key into ``LANG_TOKEN_MAPPING`` naming the language to
            translate into.
        tokenizer: Tokenizer forwarded to ``encode_str``.
        seq_len: Fixed output length forwarded to ``encode_str``.

    Returns:
        Whatever ``encode_str`` returns for "<tag> <text>".
    """
    lang_tag = LANG_TOKEN_MAPPING[target_lang]
    tagged_text = ' '.join([lang_tag, text])
    return encode_str(tagged_text, tokenizer, seq_len)


def process_dataset_with_lang_tag(dataset, input_lang, target_lang, tokenizer, seq_len):
    """Encode one translation direction of a parallel-text DataFrame.

    The result is assembled in a new DataFrame, so ``dataset`` is left
    untouched. Callers pass ``iloc`` slices here, and assigning columns onto
    a slice would mutate shared data or trigger SettingWithCopyWarning.

    Args:
        dataset: DataFrame with one text column per language code.
        input_lang: Name of the source-language column.
        target_lang: Name of the target-language column; also selects the
            language tag prepended to every source sentence.
        tokenizer: Tokenizer forwarded to the encoding helpers.
        seq_len: Fixed length of every encoded sequence.

    Returns:
        A DataFrame with columns 'input_encoded' and 'target_encoded',
        indexed like ``dataset``.
    """
    return pd.DataFrame(
        {
            'input_encoded': dataset[input_lang].apply(
                lambda x: encode_str_with_lang_tag(x, target_lang, tokenizer, seq_len)
            ),
            'target_encoded': dataset[target_lang].apply(
                lambda x: encode_str(x, tokenizer, seq_len)
            ),
        },
        index=dataset.index,
    )

In [None]:
def process_batch(batch_df, input_lang, target_lang, tokenizer, seq_len):
    """Encode a single chunk of rows; delegates to ``process_dataset_with_lang_tag``."""
    encoded_batch = process_dataset_with_lang_tag(
        dataset=batch_df,
        input_lang=input_lang,
        target_lang=target_lang,
        tokenizer=tokenizer,
        seq_len=seq_len,
    )
    return encoded_batch
def process_and_save_batches(df, input_lang, target_lang, tokenizer, seq_len, batch_size, output_path):
    """Encode ``df`` in consecutive row chunks and write each chunk to its own CSV.

    Each file is named ``{output_path}_{start_row}_{end_row}.csv`` where the
    two numbers are the half-open row range covered by that chunk.

    Args:
        df: Parallel-text DataFrame.
        input_lang: Source-language column name.
        target_lang: Target-language column name.
        tokenizer: Tokenizer forwarded to ``process_batch``.
        seq_len: Fixed encoded length forwarded to ``process_batch``.
        batch_size: Maximum number of rows per output file.
        output_path: Prefix of the generated file names.
    """
    n_rows = df.shape[0]
    for start_row in range(0, n_rows, batch_size):
        end_row = start_row + batch_size
        if end_row > n_rows:
            # The last chunk may be shorter than batch_size.
            end_row = n_rows
        chunk_file = f'{output_path}_{start_row}_{end_row}.csv'
        encoded_chunk = process_batch(df.iloc[start_row:end_row], input_lang, target_lang, tokenizer, seq_len)
        encoded_chunk.to_csv(chunk_file, index=False)


In [None]:
batch_size = 100000  # maximum number of rows written to each intermediate CSV
seq_len = 101  # fixed token length every encoded sequence is truncated/padded to
# Encode both translation directions of the vi-es corpus into chunked CSV files.
process_and_save_batches(df_vi_es, 'vi', 'es', tokenizer, seq_len, batch_size, 'processed_vi_es')
process_and_save_batches(df_vi_es, 'es', 'vi', tokenizer, seq_len, batch_size, 'processed_es_vi')


In [None]:
def combine_csv_files(file_pattern, output_file):
    """Concatenate every file in the working directory whose name starts with a prefix.

    Args:
        file_pattern: File-name prefix to match (a plain prefix, not a glob).
        output_file: Path of the CSV that receives the combined rows.

    Returns:
        The combined DataFrame, indexed 0..n-1.

    Raises:
        ValueError: If no file matches ``file_pattern``.
    """
    # Find every matching file. Names are sorted so the row order is
    # reproducible (os.listdir returns entries in arbitrary order), and the
    # output file itself is excluded so re-running never folds a previous
    # result back into the new one.
    output_abspath = os.path.abspath(output_file)
    all_files = sorted(
        f for f in os.listdir('.')
        if f.startswith(file_pattern) and os.path.abspath(f) != output_abspath
    )
    if not all_files:
        raise ValueError(f"No files starting with {file_pattern!r} found in the current directory")

    # Read and merge all files; ignore_index avoids duplicate row labels.
    combined_df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

    # Save the result to a new file.
    combined_df.to_csv(output_file, index=False)

    return combined_df

In [None]:
# Merge the chunk files of each direction into one DataFrame / CSV per direction.
combined_vi_es = combine_csv_files('processed_vi_es', 'combined_vi_es.csv')
combined_es_vi = combine_csv_files('processed_es_vi', 'combined_es_vi.csv')

In [None]:
# Stack all six translation directions into one frame.
# NOTE(review): combined_vi_en, combined_en_vi, combined_en_es and combined_es_en
# are not defined in any cell visible here — presumably produced elsewhere;
# confirm, otherwise this cell fails on a fresh kernel.
combined_all = pd.concat([combined_vi_en, combined_en_vi, combined_en_es, combined_es_en, combined_vi_es, combined_es_vi])

In [None]:
# Persist the combined dataset to Google Drive, without the index column.
combined_all.to_csv('/content/drive/MyDrive/Colab Notebooks/data/song_ngu_datasets.csv', index=False)

In [10]:
# Reload the combined dataset from a Kaggle input path; from here on
# combined_all refers to this file's contents.
combined_all =  pd.read_csv("/kaggle/input/song-ngu-dataset/song_ngu_datasets.csv")


In [11]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
combined_all = combined_all.sample(frac=1, random_state=42).reset_index(drop=True)


In [15]:
def convert_string_to_tensor(string):
    """Parse the text form ``tensor([[a, b, ...]])`` back into a 1-D integer tensor.

    Args:
        string: Printed representation of a single-row 2-D tensor, possibly
            wrapped over several lines.

    Returns:
        A 1-D tensor holding the parsed integers.
    """
    # Collapse line breaks / alignment padding into single spaces.
    flattened = re.sub(r'\s+', ' ', string)
    # Drop the leading "tensor([[" and the trailing "]])" wrappers.
    payload = re.sub(r'tensor\(\[\[|\]\]\)', '', flattened).strip()
    values = list(map(int, payload.split(',')))
    return torch.tensor(values)

# Replace the stringified tensors read from CSV with real tensors, in place.
# NOTE(review): not idempotent — re-running this cell on the already-converted
# frame would hand tensors (not strings) to re.sub.
combined_all['input_encoded'] = combined_all['input_encoded'].apply(convert_string_to_tensor)
combined_all['target_encoded'] = combined_all['target_encoded'].apply(convert_string_to_tensor)


In [17]:
from torch.utils.data import Dataset, DataLoader, random_split
import torch

class TranslationDataset(Dataset):
    """Map-style dataset over a DataFrame of pre-encoded sentence pairs.

    Each item is a dict with the row's 'input_encoded' value under 'input'
    and its 'target_encoded' value under 'target'.
    """

    def __init__(self, dataframe):
        # The frame is stored by reference; rows are looked up positionally.
        self.dataframe = dataframe

    def __len__(self):
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        return dict(input=row['input_encoded'], target=row['target_encoded'])


In [18]:
def create_data_loaders(dataset, batch_size=15):
    """Split ``dataset`` 99.7% / 0.3% into train / test and wrap both in DataLoaders.

    The global torch seed is set to 42 before splitting so the partition is
    the same on every call. Neither loader shuffles.

    Args:
        dataset: Any sized, indexable dataset.
        batch_size: Batch size used by both loaders.

    Returns:
        ``(train_loader, test_loader)``.
    """
    torch.manual_seed(42)
    total = len(dataset)
    n_train = int(0.997 * total)
    splits = random_split(dataset, [n_train, total - n_train])

    train_loader, test_loader = (
        DataLoader(split, batch_size=batch_size, shuffle=False) for split in splits
    )
    return train_loader, test_loader

In [19]:
def tensor_to_string(tensor):
    """Decode a tensor of token ids to text with the notebook-level ``tokenizer``.

    The tensor is moved to the CPU and converted to a NumPy array first;
    special tokens are omitted from the decoded string.
    """
    return tokenizer.decode(tensor.cpu().numpy(), skip_special_tokens=True)


In [20]:
# Wrap the encoded frame in a Dataset and build the train/test loaders
# (default batch size of create_data_loaders).
dataset = TranslationDataset(combined_all)
train_loader, test_loader = create_data_loaders(dataset)

In [23]:
# AdamW over all model parameters with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)



In [28]:
# Checkpoint file to resume from.
checkpoint_path = '/kaggle/input/song-ngu-epoch-12-to-step-30000-epoch-14/_epoch_27.pt'
# Passed to train_and_save_model as save_path, i.e. the prefix that checkpoint
# file names are appended to.
model_path = '/kaggle/working/'

In [29]:
def test_bleu_on_subset_hf(model, test_loader, device, num_batches_to_test, tokenizer, max_length=400):
    """Compute BLEU on the first ``num_batches_to_test`` batches of ``test_loader``.

    Predictions are generated with ``model.generate``; both predictions and
    targets are decoded without special tokens and split on whitespace before
    being scored with the "bleu" metric.

    Args:
        model: Seq2seq model exposing ``generate`` and ``config.pad_token_id``.
        test_loader: Iterable of dicts with 'input' and 'target' id tensors.
        device: Device the batches are moved to.
        num_batches_to_test: Number of leading batches to evaluate.
        tokenizer: Tokenizer used to decode ids back to text.
        max_length: Generation length limit forwarded to ``generate``.

    Returns:
        The "bleu" entry of the metric's ``compute()`` result.
    """
    # Imported locally because the notebook's import cell does not provide it.
    # NOTE(review): load_metric is deprecated in recent `datasets` releases —
    # confirm the installed version still ships it.
    from datasets import load_metric

    model.eval()
    metric = load_metric("bleu")
    references = []
    hypotheses = []

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            if i >= num_batches_to_test:
                break
            input_ids = batch['input'].to(device)
            target_ids = batch['target'].to(device)
            # Inputs are padded to a fixed length; mask the padding exactly as
            # the training loop does so generation does not attend to it.
            attention_mask = (input_ids != model.config.pad_token_id).long()

            outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length)
            hypotheses.extend([tokenizer.decode(ids, skip_special_tokens=True).split() for ids in outputs])
            references_batch = [tokenizer.decode(ids, skip_special_tokens=True).split() for ids in target_ids]
            # The metric expects a list of references per prediction; there is one each.
            references.extend([ref] for ref in references_batch)

    for hypothesis, reference in zip(hypotheses, references):
        metric.add(prediction=hypothesis, references=reference)

    final_score = metric.compute()["bleu"]
    return final_score

# Hàm huấn luyện

In [30]:
def train_and_save_model(model, train_loader, test_loader, optimizer, device, num_epochs, save_path, tensor_to_string, start_epoch, start_global_step):
    """Train ``model`` and write checkpoints, optionally resuming mid-epoch.

    ``global_step`` counts batches within the current epoch. In the first
    epoch of the run, the first ``start_global_step`` batches are iterated but
    skipped; after every epoch both counters are reset to zero. A checkpoint
    is written every 10000 steps and at the end of every epoch.

    Args:
        model: Seq2seq model returning an object with a ``loss`` attribute and
            exposing ``config.pad_token_id``.
        train_loader: Iterable of dicts with 'input' and 'target' id tensors.
        test_loader: Unused; kept so existing calls keep working.
        optimizer: Optimizer stepping the model's parameters.
        device: Device the model and batches are moved to.
        num_epochs: Exclusive upper bound of the epoch index.
        save_path: Prefix that checkpoint file names are appended to.
        tensor_to_string: Unused; kept so existing calls keep working.
        start_epoch: Zero-based index of the first epoch to run.
        start_global_step: Number of leading batches to skip in that epoch.
    """
    global_step = 0
    model.to(device)

    for epoch in range(start_epoch, num_epochs):
        model.train()
        total_loss = 0
        interval_loss = 0
        steps_trained = 0   # batches actually optimised this epoch (skipped ones excluded)
        interval_steps = 0  # batches accumulated into interval_loss since its last reset
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Fast-forward over batches that were already trained before the
            # checkpoint being resumed was written.
            if global_step < start_global_step:
                global_step += 1
                continue
            input_ids = batch['input'].to(device)
            attention_mask = (input_ids != model.config.pad_token_id).long()
            target_ids = batch['target'].to(device)

            optimizer.zero_grad()
            # NOTE(review): target_ids still contains pad ids. Hugging Face
            # seq2seq losses only ignore label value -100, so padding likely
            # contributes to the loss — confirm whether that is intended.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            loss_item = loss.item()
            total_loss += loss_item
            interval_loss += loss_item
            steps_trained += 1
            interval_steps += 1
            global_step += 1

            if global_step % 5000 == 0:
                # Divide by the batches actually accumulated: after a resume
                # the first interval can hold fewer than 5000 of them.
                avg_interval_loss = interval_loss / interval_steps
                print(f"Average Loss over last 5000 steps at step {global_step}: {avg_interval_loss}")
                interval_loss = 0  # Reset interval loss for the next 5000 steps
                interval_steps = 0

            if global_step % 10000 == 0:
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'global_step': global_step,
                    'epoch': epoch
                }, f"{save_path}_step_{global_step}_epoch_{epoch}.pt")

            progress_bar.set_postfix({'loss': loss_item})

        # Average over the batches that were really trained; dividing by
        # len(train_loader) would understate the loss whenever leading batches
        # were skipped. A fully skipped epoch reports 0.0.
        avg_loss = total_loss / steps_trained if steps_trained else 0.0
        print(f"Trung bình Loss Epoch {epoch+1}: {avg_loss}")

        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step,
            'epoch': epoch
        }, f"{save_path}_epoch_{epoch}.pt")

        # Later epochs start from their first batch.
        start_global_step = 0
        global_step = 0

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_checkpoint(model, optimizer, checkpoint_path, device):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module that receives the saved 'model_state_dict'; it is moved
            to ``device`` first.
        optimizer: Optimizer that receives the saved 'optimizer_state_dict'.
        checkpoint_path: Path of a file written with ``torch.save`` holding
            the keys 'model_state_dict', 'optimizer_state_dict', 'epoch' and
            'global_step'.
        device: Device the stored tensors are mapped onto while loading.

    Returns:
        ``(epoch, global_step)`` as stored in the checkpoint.
    """
    model.to(device)
    state = torch.load(checkpoint_path, map_location=device)

    # Model first, then optimizer: each takes its own sub-dictionary.
    restore_plan = ((model, 'model_state_dict'), (optimizer, 'optimizer_state_dict'))
    for component, key in restore_plan:
        component.load_state_dict(state[key])

    return state['epoch'], state['global_step']

model.to(device)

# Restore weights and optimizer state, and read the position to resume from.
start_epoch, start_global_step = load_checkpoint(model, optimizer, checkpoint_path, device)

# Continue training up to (but not including) epoch index 30, writing checkpoints under model_path.
train_and_save_model(model, train_loader, test_loader, optimizer, device, 30, model_path, tensor_to_string, start_epoch, start_global_step)

