In [1]:
!pip install transformers -q
!pip install evaluate -q
!pip install sacrebleu -q
!pip install accelerate -q
!pip install unbabel-comet -q

import warnings
warnings.filterwarnings('ignore')

from transformers import MT5ForConditionalGeneration, MT5Tokenizer, T5Config
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
import os, shutil
import evaluate
import numpy as np
from accelerate import Accelerator
import matplotlib.pyplot as plt
from comet import download_model, load_from_checkpoint

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()

from google.colab import drive
drive.mount('/content/drive')

# Define Settings
batch_size = 2  # Reduced batch size to manage memory usage
num_epochs = 2
learning_rate = 5e-4  # Lower learning rate
model_path = "/content/drive/MyDrive/ColabNotebooks/model/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_folder(path_folder):
    if os.path.exists(path_folder):
        shutil.rmtree(path_folder)
    os.makedirs(path_folder)
    return None

# Define Data reader class
print("Using device:", device)

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.data = self.data.dropna()
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return f"translate English to French: {item['en']}", item['fr']

def collate_fn(batch):
    source_texts, target_texts = zip(*batch)
    source_encodings = tokenizer(list(source_texts), padding='max_length', truncation=True, max_length=200, return_tensors='pt')
    target_encodings = tokenizer(list(target_texts), padding='max_length', truncation=True, max_length=200, return_tensors='pt')
    return source_encodings['input_ids'].to(device), target_encodings['input_ids'].to(device)

class MT5WithPrompts(MT5ForConditionalGeneration):
    def __init__(self, config, prompt_length=20):
        super().__init__(config)
        self.prompt_length = prompt_length
        self.prompt_embeddings = torch.nn.Embedding(prompt_length, config.d_model)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        # Create prompt embeddings
        prompt_ids = torch.arange(self.prompt_length, device=input_ids.device).expand(input_ids.size(0), -1)
        prompt_embeddings = self.prompt_embeddings(prompt_ids)

        # Get input embeddings and concatenate prompts
        input_embeddings = self.get_input_embeddings()(input_ids)
        extended_embeddings = torch.cat([prompt_embeddings, input_embeddings], dim=1)

        # Adjust attention mask for prompt embeddings
        prompt_attention_mask = torch.ones_like(prompt_ids)
        extended_attention_mask = torch.cat([prompt_attention_mask, attention_mask], dim=1) if attention_mask is not None else None

        # Pass through the model
        outputs = super().forward(inputs_embeds=extended_embeddings, attention_mask=extended_attention_mask, **kwargs)
        return outputs

def load_model():
    model_name = "google/mt5-small"
    tokenizer = MT5Tokenizer.from_pretrained(model_name)
    config = T5Config.from_pretrained(model_name)
    model = MT5WithPrompts(config)

    # Freeze all original MT5 parameters
    for param in model.parameters():
        param.requires_grad = False

    # Only train prompt embeddings
    for param in model.prompt_embeddings.parameters():
        param.requires_grad = True

    model.to(device)
    return model, tokenizer

# data loader
def prepare_data(file_name='filtered-en_fr.csv', nrows=200000):
    df = pd.read_csv(file_name, nrows=nrows)  # reading only portion of data
    df_train = df.sample(frac=0.90, replace=False, random_state=1)  # 90% of data for training
    df_validation = df.loc[~df.index.isin(df_train.index)]  # Corrected to use df_train for exclusion
    print(f'Number of Training Dataset {df_train.shape[0]}, Number of Validation Dataset {df_validation.shape[0]}')
    # train
    train_dataset = TranslationDataset(df_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)  # Consider shuffle=True
    # validation
    validation_dataset = TranslationDataset(df_validation)  # Corrected to use df_validation
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_dataloader, validation_dataloader, df_validation

# model loader
def load_model():
    model_name = "google/mt5-small"
    tokenizer = MT5Tokenizer.from_pretrained(model_name, legacy=False)
    model = MT5ForConditionalGeneration.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).float()
    return model, tokenizer

# Load COMET model directly
comet_model_path = download_model("wmt20-comet-da")
comet_model = load_from_checkpoint(comet_model_path)

# trainer function
def trainer(model, tokenizer, optimizer, num_epochs, train_dataloader, validation_dataloader, freq=100):
    bleu_metric = evaluate.load("sacrebleu")
    train_loss = []
    val_loss = []
    bleu_scores = []
    comet_scores = []

    for epoch in range(num_epochs):
        print(f"Starting Epoch {epoch+1}")
        k = 0
        # train step
        model.train()
        total_loss = 0
        for input_ids, labels in train_dataloader:
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            # Check for NaNs in loss
            if torch.isnan(loss):
                print("NaN detected in loss. Skipping this batch.")
                continue

            # Check model outputs for NaNs
            if torch.isnan(outputs.logits).any():
                print("NaN detected in model outputs. Skipping this batch.")
                continue

            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            total_loss += loss.detach().float()
            train_loss.append(np.squeeze(loss.detach().cpu().numpy()).item())
            if k % freq == 0:
                print(f"Train Batch - Epoch {epoch+1}, Iter: {k}, Loss: {loss.item()}, Total Loss: {total_loss}")
            k += 1

        # evaluation step
        model.eval()
        eval_loss = 0
        eval_preds = []
        all_preds = []
        all_labels = []
        for input_ids, val_labels in validation_dataloader:
            with torch.no_grad():
                outputs = model(input_ids=input_ids, labels=val_labels)
                loss = outputs.loss

            # Check for NaNs in loss
            if torch.isnan(loss):
                print("NaN detected in evaluation loss. Skipping this batch.")
                continue

            # Check model outputs for NaNs
            if torch.isnan(outputs.logits).any():
                print("NaN detected in model outputs during evaluation. Skipping this batch.")
                continue

            val_loss.append(np.squeeze(loss.detach().cpu().numpy()).item())
            eval_loss += loss.detach().float()
            preds = tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            eval_preds.extend(preds)
            all_preds.extend(preds)
            decoded_labels = tokenizer.batch_decode(val_labels, skip_special_tokens=True)
            all_labels.extend(decoded_labels)
            bleu_metric.add_batch(predictions=preds, references=decoded_labels)

        eval_epoch_loss = eval_loss / len(validation_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

        bleu_results = bleu_metric.compute()
        bleu_scores.append(bleu_results['score'])
        print(f"epoch {epoch+1}, BLEU score: {bleu_results['score']:.2f}")

        # COMET evaluation
        comet_data = [{"src": "", "mt": mt, "ref": ref} for mt, ref in zip(all_preds, all_labels)]
        comet_scores_epoch = comet_model.predict(comet_data, batch_size=8, gpus=1)

        # Print the structure of comet_scores_epoch for debugging
        print(f"comet_scores_epoch: {comet_scores_epoch}")

        # Extract the scores
        comet_scores_epoch = comet_scores_epoch.scores

        comet_score_mean = np.mean(comet_scores_epoch)
        comet_scores.append(comet_score_mean)
        print(f"epoch {epoch+1}, COMET score: {comet_score_mean:.4f}")

        print('\n')

    return train_loss, val_loss, model, bleu_scores, comet_scores

def encode_str(text, tokenizer):
    input_ids = tokenizer.encode(
        text=text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,)
    return input_ids[0]

def random_model_test(model, tokenizer, df_validation, model_cache_path, n=10, verbose=True):
    random_test = df_validation.sample(n=n)
    with open(os.path.join(model_cache_path, 'random_test.txt'), 'w') as the_file:
        for i in range(len(random_test)):
            en_test_data = random_test.iloc[i].en
            fr_test_data = random_test.iloc[i].fr
            en = encode_str(en_test_data, tokenizer).unsqueeze(0).cuda()
            res = model.generate(en)
            res_decoded = tokenizer.decode(res[0], skip_special_tokens=True)
            if verbose:
                print('English Sentence:')
                print(en_test_data)
                print('French Sentence:')
                print(fr_test_data)
                print('Model Output:')
                print(res_decoded)
                print('--------\n')
            the_file.write(f'Test Case {i+1}:' + '\n')
            the_file.write('English Sentence:' + '\n')
            the_file.write(en_test_data + '\n')
            the_file.write('French Sentence:' + '\n')
            the_file.write(fr_test_data + '\n')
            the_file.write('Model Output:' + '\n')
            the_file.write(res_decoded + '\n')
            the_file.write('-------- \n')
    return None

# save model and tokenizer:
def save_model(model, tokenizer, model_cache_path):
    # Save model
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(model_cache_path, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(model_cache_path)
    return None

# save learning rates
def save_learning_rates(train_loss, val_loss, bleu_scores, comet_scores, nrows, num_epochs, model_cache_path):
    # save rates
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
    ax.plot(train_loss, color='blue', label='Training Loss')
    ax.plot(val_loss, color='red', label='Validation Loss')
    ax.set_xlabel('iteration')
    ax.set_ylabel('loss')
    ax.set_ylim([0, 10])
    plt.title(f'{nrows} Rows, {num_epochs} Epochs')
    plt.legend()
    plt.savefig(os.path.join(model_cache_path, 'learning_rates'))
    plt.close(fig)
    # save bleu and comet scores
    scores_pd = pd.DataFrame(columns=['Epoch', 'BLEU Score', 'COMET Score', 'nrows'], index=list(range(1, len(bleu_scores) + 1)))
    scores_pd['BLEU Score'] = bleu_scores
    scores_pd['COMET Score'] = comet_scores
    scores_pd['Epoch'] = list(range(1, len(bleu_scores) + 1))
    scores_pd['nrows'] = nrows
    scores_pd.to_csv(os.path.join(model_cache_path, 'scores.csv'), index=None)
    return None

for nrows in [50000]:
    print(f'===> Number of rows {nrows}')
    # load model
    model, tokenizer = load_model()
    # optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_dataloader, validation_dataloader, df_validation = prepare_data(file_name='/content/drive/MyDrive/ColabNotebooks/filtered_en-fr.csv', nrows=nrows)
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, validation_dataloader
    )
    # define save location
    model_cache_path = os.path.join(model_path, f'nrows_{nrows}')
    create_folder(model_cache_path)
    # train model
    train_loss, val_loss, model, bleu_scores, comet_scores = trainer(model, tokenizer, optimizer, num_epochs, train_dataloader, validation_dataloader, freq=1000)
    # random test
    random_model_test(model, tokenizer, df_validation, model_cache_path, n=20, verbose=False)
    # save model
    save_model(model, tokenizer, model_cache_path)
    # save learning rates
    save_learning_rates(train_loss, val_loss, bleu_scores, comet_scores, nrows, num_epochs, model_cache_path)





[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

wmt20-comet-da.tar.gz: 1.79GB [00:53, 33.5MB/s]                            
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

===> Number of rows 50000


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Number of Training Dataset 45000, Number of Validation Dataset 5000


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Starting Epoch 1
Train Batch - Epoch 1, Iter: 0, Loss: 62.56893539428711, Total Loss: 62.56893539428711
Train Batch - Epoch 1, Iter: 1000, Loss: 0.24563859403133392, Total Loss: 2009.037109375
Train Batch - Epoch 1, Iter: 2000, Loss: 0.22660936415195465, Total Loss: 2423.46435546875
Train Batch - Epoch 1, Iter: 3000, Loss: 0.11559908092021942, Total Loss: 2803.568359375
Train Batch - Epoch 1, Iter: 4000, Loss: 0.16081836819648743, Total Loss: 3159.4140625
Train Batch - Epoch 1, Iter: 5000, Loss: 0.28421393036842346, Total Loss: 3500.35498046875
Train Batch - Epoch 1, Iter: 6000, Loss: 0.15919290482997894, Total Loss: 3824.95751953125
Train Batch - Epoch 1, Iter: 7000, Loss: 0.1531621217727661, Total Loss: 4151.09375
Train Batch - Epoch 1, Iter: 8000, Loss: 0.4880807399749756, Total Loss: 4462.1376953125
Train Batch - Epoch 1, Iter: 9000, Loss: 0.2611546218395233, Total Loss: 4770.3515625
Train Batch - Epoch 1, Iter: 10000, Loss: 0.21777574717998505, Total Loss: 5078.16259765625
Train B

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 625/625 [01:23<00:00,  7.49it/s]


comet_scores_epoch: Prediction([('scores', [0.6736904978752136, -0.8216466307640076, -1.2157262563705444, -1.4137208461761475, -1.219907283782959, -0.7948532700538635, -1.2790464162826538, 0.4798136353492737, -1.5770759582519531, 1.0751678943634033, -1.4072545766830444, -0.5913949608802795, -1.310835361480713, -0.7709866762161255, -1.5297831296920776, -0.8669469952583313, -0.4558289647102356, -1.0431079864501953, -1.8915282487869263, -1.2748407125473022, -1.4828039407730103, -0.3472755253314972, -0.5499823689460754, -1.4916260242462158, -1.2232310771942139, -1.5260999202728271, 0.36327263712882996, -1.4047240018844604, -1.2427034378051758, -1.480255365371704, -1.315824031829834, -1.175252079963684, -1.5064091682434082, -0.9892016053199768, 0.6698414087295532, 1.0701310634613037, -1.0627576112747192, -0.28198015689849854, -0.7190660834312439, -1.4409139156341553, 0.9626983404159546, -1.2430696487426758, -1.323056697845459, -1.1590335369110107, -0.044086113572120667, -1.2634683847427368,

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


epoch 2, BLEU score: 22.58


Predicting DataLoader 0: 100%|██████████| 625/625 [01:24<00:00,  7.41it/s]
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


comet_scores_epoch: Prediction([('scores', [0.6736895442008972, -1.1572470664978027, -1.237973928451538, -1.3701082468032837, -1.151383638381958, -0.4962383210659027, -1.1795114278793335, 0.4798150062561035, -1.6845957040786743, 0.7781121730804443, -1.556243658065796, 0.08499910682439804, -0.9862715601921082, -0.5431719422340393, -1.515524983406067, -0.41854435205459595, -0.21374259889125824, -1.3219082355499268, -1.9317858219146729, -1.3223145008087158, -1.4813770055770874, -0.5937930941581726, -0.2718277871608734, -1.5027121305465698, -1.298652172088623, -1.435537576675415, 0.60542893409729, -1.2996219396591187, -1.4136101007461548, -1.442694067955017, -0.5385352969169617, -1.3981292247772217, -1.5383602380752563, 0.31948140263557434, 0.26775142550468445, 1.0701310634613037, -1.4969713687896729, 0.20768998563289642, -0.5341783165931702, -1.2682044506072998, 1.1192173957824707, -1.1624400615692139, -1.4258650541305542, -1.2600624561309814, -0.6476044058799744, -1.1603929996490479, -1.

In [None]:
!pip install transformers -q
!pip install evaluate -q
!pip install sacrebleu -q
!pip install accelerate -q
!pip install comet
!pip install unbabel-comet
!pip install datasets

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import warnings
import os
import shutil
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, T5Config
import evaluate
import numpy as np
from accelerate import Accelerator
import matplotlib.pyplot as plt
from comet import download_model, load_from_checkpoint

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Define Settings
batch_size = 10
num_epochs =  2
learning_rate = 1e-3 # Lower learning rate
model_path ="/content/drive/MyDrive/ColabNotebooks/model/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def create_folder (path_folder):
    if os.path.exists(path_folder):
        shutil.rmtree(path_folder)
    os.makedirs(path_folder)
    return None


In [None]:
# Define Data reader class
print("Using device:", device)

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.data= self.data.dropna()
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return f"translate English to French: {item['en']}", item['fr']

def collate_fn(batch):
    source_texts, target_texts = zip(*batch)
    source_encodings = tokenizer(list(source_texts), padding='max_length', truncation=True, max_length=200, return_tensors='pt')
    target_encodings = tokenizer(list(target_texts), padding='max_length', truncation=True, max_length=200, return_tensors='pt')
    return source_encodings['input_ids'].to(device), target_encodings['input_ids'].to(device)

In [None]:
class MT5WithPrompts(MT5ForConditionalGeneration):
    def __init__(self, config, prompt_length=20):
        super().__init__(config)
        self.prompt_length = prompt_length
        self.prompt_embeddings = torch.nn.Embedding(prompt_length, config.d_model)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        # Create prompt embeddings
        prompt_ids = torch.arange(self.prompt_length, device=input_ids.device).expand(input_ids.size(0), -1)
        prompt_embeddings = self.prompt_embeddings(prompt_ids)

        # Get input embeddings and concatenate prompts
        input_embeddings = self.get_input_embeddings()(input_ids)
        extended_embeddings = torch.cat([prompt_embeddings, input_embeddings], dim=1)

        # Adjust attention mask for prompt embeddings
        prompt_attention_mask = torch.ones_like(prompt_ids)
        extended_attention_mask = torch.cat([prompt_attention_mask, attention_mask], dim=1) if attention_mask is not None else None

        # Pass through the model
        outputs = super().forward(inputs_embeds=extended_embeddings, attention_mask=extended_attention_mask, **kwargs)
        return outputs

def load_model():
    model_name = "google/mt5-small"
    tokenizer = MT5Tokenizer.from_pretrained(model_name)
    config = T5Config.from_pretrained(model_name)
    model = MT5WithPrompts(config)

    # Freeze all original MT5 parameters
    for param in model.parameters():
        param.requires_grad = False

    # Only train prompt embeddings
    for param in model.prompt_embeddings.parameters():
        param.requires_grad = True

    model.to(device)
    return model, tokenizer

In [None]:
# data loader
def prepare_data(file_name='filtered-en_fr.csv', nrows=200000):
    df = pd.read_csv(file_name, nrows=nrows)  # reading only portion of data
    df_train = df.sample(frac=0.90, replace=False, random_state=1)  # 90% of data for training
    df_validation = df.loc[~df.index.isin(df_train.index)]  # Corrected to use df_train for exclusion
    print(f'Number of Training Dataset {df_train.shape[0]}, Number of Validation Dataset {df_validation.shape[0]}')
    # train
    train_dataset = TranslationDataset(df_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)  # Consider shuffle=True
    # validation
    validation_dataset = TranslationDataset(df_validation)  # Corrected to use df_validation
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_dataloader, validation_dataloader, df_validation


In [None]:
# model loader
def load_model():
    model_name = "google/mt5-small"
    tokenizer = MT5Tokenizer.from_pretrained(model_name,legacy=False)
    model = MT5ForConditionalGeneration.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).float()
    return model, tokenizer

In [None]:
# trainer function
def trainer(model,tokenizer,optimizer,num_epochs, train_dataloader, validation_dataloader,freq=100):
    metric = evaluate.load("sacrebleu")
    comet_metric = evaluate.load('comet')
    train_loss=[]
    val_loss=[]
    blue=[]
    comet=[]

    for epoch in range(num_epochs):
        print(f"Starting Epoch {epoch+1}")
        k=0
        # train step
        model.train()
        total_loss=0
        for input_ids, labels in train_dataloader:
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.detach().float()
            train_loss.append(np.squeeze(loss.detach().cpu().numpy()).item())
            if k % freq==0:
                print(f"Train Batch - Epoch {epoch+1}, Iter: {k}, Loss: {loss.item()}, Total Loss: {total_loss}")
            k+=1

        # evaluation step
        model.eval()
        eval_loss = 0
        eval_preds = []
        for input_ids, val_labels in validation_dataloader:
            with torch.no_grad():
                outputs = model(input_ids=input_ids, labels=val_labels)
            loss = outputs.loss
            val_loss.append(np.squeeze(loss.detach().cpu().numpy()).item())
            eval_loss += loss.detach().float()
            eval_preds.extend(tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True))
            val_preds = tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(val_labels, skip_special_tokens=True)
            metric.add_batch(predictions=val_preds, references=decoded_labels)

        eval_epoch_loss = eval_loss / len(validation_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
        results = metric.compute()
        blue.append(results['score'])
        comet.append(results['comet_score'])
        print(f"epoch {epoch+1}, BLEU score: {results['score']:.2f}")
        print(f"epoch {epoch+1}, COMET score: {results['comet_score']:.2f}")
        print('\n')
    return train_loss,val_loss, model, blue, comet


In [None]:
def encode_str(text, tokenizer,):
    input_ids = tokenizer.encode(
      text=text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,)
    return input_ids[0]

In [None]:
def random_model_test(model,tokenizer, df_validation,model_cache_path,n=10,verbose=True):
    random_test= df_validation.sample(n=n)
    with open(os.path.join(model_cache_path, 'random_test.txt'), 'w') as the_file:
        for i in range(len(random_test)):
            en_test_data = random_test.iloc[i].en
            fr_test_data = random_test.iloc[i].fr
            en = encode_str(en_test_data,tokenizer).unsqueeze(0).cuda()
            res  = model.generate(en)
            res_decoded= tokenizer.decode(res[0],skip_special_tokens=True)
            if verbose:
                print('English Sentence:')
                print(en_test_data)
                print('French Sentence:')
                print(fr_test_data)
                print('Model Output:')
                print(res_decoded)
                print('--------\n')
            the_file.write(f'Test Case {i+1}:' +'\n')
            the_file.write('English Sentence:' +'\n')
            the_file.write(en_test_data +'\n')
            the_file.write('French Sentence:' +'\n')
            the_file.write(fr_test_data +'\n')
            the_file.write('Model Output:' +'\n')
            the_file.write(res_decoded +'\n')
            the_file.write('-------- \n')
    return None

In [None]:
# save model and tokennizer:
def save_model(model, tokenizer,model_cache_path):
    # Save model
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(model_cache_path, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(model_cache_path)
    return None


In [None]:
# save learning rates
def save_learning_rates(train_loss,val_loss, blue,comet, nrows, num_epochs, model_cache_path):
    # save rates
    fig,ax = plt.subplots(nrows=1,ncols=1,figsize=(8,4))
    ax.plot(train_loss,color='blue',label='Training Loss')
    ax.plot(val_loss,color='red',label='Validation Loss')
    ax.set_xlabel('iterration')
    ax.set_ylabel('loss')
    ax.set_ylim([0,10])
    plt.title(f'{nrows} Rows, {num_epochs} Epochs')
    plt.legend()
    plt.savefig(os.path.join(model_cache_path,'learning_rates'))
    plt.close(fig)
    # save blue score
    blue_pd= pd.DataFrame(columns=['Epoch','Blue Score','nrows',],index=list(range(1,len(blue)+1)))
    blue_pd['Blue Score'] = blue
    blue_pd['Epoch']=list(range(1,len(blue)+1))
    blue_pd['nrows']=nrows
    blue_pd.to_csv(os.path.join(model_cache_path,'blue_score.csv'),index=None)

    # save COMET score
    comet_pd= pd.DataFrame(columns=['Epoch','COMET Score','nrows',],index=list(range(1,len(comet)+1)))
    comet_pd['COMET Score'] = comet
    comet_pd['Epoch']=list(range(1,len(blue)+1))
    comet_pd['nrows']=nrows
    comet_pd.to_csv(os.path.join(model_cache_path,'comet_score.csv'),index=None)
    return None

In [None]:
for nrows in [5000, 10000, 50000,100000, 200000]:
    print(f'===> Number of rows {nrows}')
    # load model
    model, tokenizer = load_model()
    # optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_dataloader, validation_dataloader, df_validation = prepare_data(file_name='/content/drive/MyDrive/ColabNotebooks/filtered_en-fr.csv',nrows=nrows)
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, validation_dataloader
    )
    # define save location
    model_cache_path = os.path.join(model_path, f'nrows_{nrows}')
    create_folder (model_cache_path)
    # train model
    train_loss,val_loss, model, blue = trainer(model,tokenizer,optimizer,num_epochs, train_dataloader, validation_dataloader,freq=1000)
    train_loss,val_loss, model, comet = trainer(model,tokenizer,optimizer,num_epochs, train_dataloader, validation_dataloader,freq=1000)
    # random test
    random_model_test(model,tokenizer, df_validation,model_cache_path,n=20, verbose=False)
    # save model
    save_model(model, tokenizer,model_cache_path)
    # save learning rates
    save_learning_rates(train_loss,val_loss,blue,comet, nrows, num_epochs, model_cache_path)


In [None]:
#shutil.make_archive('/mnt/code/junk/all.zip', 'zip', './model/')