In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
%%capture
!pip install --upgrade datasets

In [5]:
%%capture
!pip install evaluate
!pip install sacrebleu

In [23]:
%%capture
!pip install git+https://github.com/csebuetnlp/normalizer

In [24]:
%%capture
!pip install sentencepiece

In [6]:
import pandas as pd
import torch
import unicodedata
from datasets import Dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
import evaluate
import os

In [7]:
bleu = evaluate.load("sacrebleu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Preprocessing functions
def standardize_to_NFC(text_list):
    """Normalize the text to NFC form for consistent diacritic handling."""
    return [unicodedata.normalize('NFC', text) for text in text_list]

def filter_single_word_sentence(eng_sents, yor_sents):
    """Filter out sentences that are single words in either language."""
    eng_inds = set([i for i, sent in enumerate(eng_sents) if len(sent.split()) > 1])
    yor_inds = set([i for i, sent in enumerate(yor_sents) if len(sent.split()) > 1])
    common_inds = sorted(list(eng_inds & yor_inds))

    eng_filtered = [eng_sents[i] for i in common_inds]
    yor_filtered = [yor_sents[i] for i in common_inds]

    return eng_filtered, yor_filtered

In [13]:
def load_and_preprocess_data(input_dir):
    """Load data from CSV, normalize and filter it."""
    # Example dataset paths
    train_file = os.path.join(input_dir, 'train.tsv')  # Adjust the path to your training data
    val_file = os.path.join(input_dir, 'dev.tsv')      # Adjust the path to your validation data
    test_file = os.path.join(input_dir, 'test.tsv')

    # Load the training data
    train_dataset = pd.read_csv (train_file, sep='\t', names=['English', 'Yoruba'])
    val_dataset = pd.read_csv(val_file, sep='\t', names=['English', 'Yoruba'])
    test_dataset = pd.read_csv(test_file, sep='\t', names=['English', 'Yoruba'])

    # Normalize Yorùbá sentences to NFC
    # train_df['Yoruba'] = standardize_to_NFC(train_df['Yoruba'])
    # val_df['Yoruba'] = standardize_to_NFC(val_df['Yoruba'])

    # # Filter out single-word sentences
    # train_en, train_yo = filter_single_word_sentence(train_df['English'], train_df['Yoruba'])
    # val_en, val_yo = filter_single_word_sentence(val_df['English'], val_df['Yoruba'])

    # # Create Hugging Face dataset from pandas DataFrame
    # train_dataset = Dataset.from_pandas(pd.DataFrame({'en': train_en, 'yo': train_yo}))
    # val_dataset = Dataset.from_pandas(pd.DataFrame({'en': val_en, 'yo': val_yo}))

    return train_dataset, val_dataset, test_dataset

In [30]:
# 1. Load and preprocess the Menyo-20k_MT dataset
input_dir = '/content/drive/Shareddrives/Bootcamp/data/'  # Specify the path to your dataset directory
train_dataset, val_dataset, test_dataset = load_and_preprocess_data(input_dir)

In [31]:
train_dataset.head()

Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Unit 1: What is Creative Commons?,﻿Ìdá 1: Kín ni Creative Commons?
2,This work is licensed under a Creative Commons...,Iṣẹ́ yìí wà lábẹ́ àṣẹ Creative Commons Attribu...
3,"Creative Commons is a set of legal tools, a no...",Creative Commons jẹ́ àwọn ọ̀kan-ò-jọ̀kan ohun-...
4,Creative Commons began in response to an outda...,Creative Commons bẹ̀rẹ̀ láti wá wọ̀rọ̀kọ̀ fi ṣ...


In [32]:
# Rename the columns to match the expected format
train_dataset.rename(columns={'English': 'input_text', 'Yoruba': 'labels'}, inplace=True)
train_dataset.head()

Unnamed: 0,input_text,labels
0,English,Yoruba
1,Unit 1: What is Creative Commons?,﻿Ìdá 1: Kín ni Creative Commons?
2,This work is licensed under a Creative Commons...,Iṣẹ́ yìí wà lábẹ́ àṣẹ Creative Commons Attribu...
3,"Creative Commons is a set of legal tools, a no...",Creative Commons jẹ́ àwọn ọ̀kan-ò-jọ̀kan ohun-...
4,Creative Commons began in response to an outda...,Creative Commons bẹ̀rẹ̀ láti wá wọ̀rọ̀kọ̀ fi ṣ...


In [33]:
val_dataset.head()

Unnamed: 0,English,Yoruba
0,English,Yoruba
1,"We prepare the saddle, and the goat presents i...",A di gàárì sílẹ̀ ewúrẹ́ ń yọjú; ẹrù ìran rẹ̀ ni?
2,"You have been crowned a king, and yet you make...",A fi ọ́ jọba ò ń ṣàwúre o fẹ́ jẹ Ọlọ́run ni?
3,By dancing we take possession of Awà; through ...,"A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, ..."
4,We lift a saddle and the goat (kin) scowls; it...,A gbé gàárì ọmọ ewúrẹ́ ń rojú; kì í ṣe ẹrù àgù...


In [34]:
# Rename the columns to match the expected format
val_dataset.rename(columns={'English': 'input_text', 'Yoruba': 'labels'}, inplace=True)
val_dataset.head()

Unnamed: 0,input_text,labels
0,English,Yoruba
1,"We prepare the saddle, and the goat presents i...",A di gàárì sílẹ̀ ewúrẹ́ ń yọjú; ẹrù ìran rẹ̀ ni?
2,"You have been crowned a king, and yet you make...",A fi ọ́ jọba ò ń ṣàwúre o fẹ́ jẹ Ọlọ́run ni?
3,By dancing we take possession of Awà; through ...,"A fijó gba Awà; a fìjà gba Awà; bí a ò bá jó, ..."
4,We lift a saddle and the goat (kin) scowls; it...,A gbé gàárì ọmọ ewúrẹ́ ń rojú; kì í ṣe ẹrù àgù...


In [35]:
train_dataset[0:1]

Unnamed: 0,input_text,labels
0,English,Yoruba


In [36]:
test_dataset.head()

Unnamed: 0,English,Yoruba
0,English,Yoruba
1,Pending the time she would finally pack and go...,Títí di ìgbà tí ó máa fi kó ẹrù rẹ̀ lọ pátápát...
2,She knew how best she was going to take care o...,Ó mọ bí ó ṣe má a tọ́jú ara rẹ̀ àti Tinú.
3,Alamu Should learn to look after himself.,Kí Àlàmú kọ́ bí ó ṣe máa tọ́jú ara rẹ̀.
4,His old Mama should not come back again and be...,Kí ìyá rẹ̀ má tún padà wá láti máa jágbe kí ó ...


In [38]:
# Rename the columns to match the expected format
test_dataset.rename(columns={'English': 'input_text', 'Yoruba': 'labels'}, inplace=True)
test_dataset.head()

Unnamed: 0,input_text,labels
0,English,Yoruba
1,Pending the time she would finally pack and go...,Títí di ìgbà tí ó máa fi kó ẹrù rẹ̀ lọ pátápát...
2,She knew how best she was going to take care o...,Ó mọ bí ó ṣe má a tọ́jú ara rẹ̀ àti Tinú.
3,Alamu Should learn to look after himself.,Kí Àlàmú kọ́ bí ó ṣe máa tọ́jú ara rẹ̀.
4,His old Mama should not come back again and be...,Kí ìyá rẹ̀ má tún padà wá láti máa jágbe kí ó ...


In [None]:
!pip show transformers

Name: transformers
Version: 4.45.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [17]:
model_name = 'google/mt5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [26]:
from normalizer import normalize
from torch.utils.data import Dataset, DataLoader
class Seq2SeqDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.input_text = data['input_text'].apply(normalize).tolist()
        self.labels = data['labels'].apply(normalize).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.input_text)

    def __getitem__(self, idx):
        input_text = self.input_text[idx]
        label_text = self.labels[idx]

        # Tokenize the input text
        input_encodings = self.tokenizer(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Tokenize the label text to get its 'input_ids' and 'attention_mask'
        label_encodings = self.tokenizer(
            label_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': input_encodings['input_ids'].squeeze(),
            'attention_mask': input_encodings['attention_mask'].squeeze(),
            'labels': label_encodings['input_ids'].squeeze(),
        }

In [27]:
# Modify the data collation process to handle PyTorch tensors correctly
class MyDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    def __call__(self, features):
        batch = {}
        batch["input_ids"] = torch.stack([feature["input_ids"] for feature in features])
        batch["attention_mask"] = torch.stack([feature["attention_mask"] for feature in features])

        # Labels should be processed differently for PyTorch tensors
        if isinstance(features[0]["labels"], torch.Tensor):
            batch["labels"] = torch.stack([feature["labels"] for feature in features])
        else:
            # Convert the list of lists to a PyTorch tensor
            batch["labels"] = torch.tensor([feature["labels"] for feature in features])

        return batch

In [34]:
# Custom Trainer class to ensure tensors are contiguous during training
class CustomSeq2SeqTrainer(Trainer):
    def save_model(self, output_dir=None, **kwargs):
        """Override save_model to ensure all model tensors are contiguous before saving."""
        if output_dir is None:
            output_dir = self.args.output_dir
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.contiguous()
        super().save_model(output_dir, **kwargs)

    def training_step(self, model, inputs):
        """Override training_step to ensure tensors are contiguous during gradient updates."""
        for param in model.parameters():
            if not param.is_contiguous():
                param.data = param.contiguous()

        return super().training_step(model, inputs)

In [39]:
# Create train , test and validation datasets
train_dataset = Seq2SeqDataset(train_dataset, tokenizer)
val_dataset = Seq2SeqDataset(val_dataset, tokenizer)
test_dataset = Seq2SeqDataset(test_dataset, tokenizer)
# validation_dataset = Seq2SeqDataset(validation_data, tokenizer)

# Create train , test and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)  #batch_size=32
val_dataloader = DataLoader(val_dataset, batch_size=32) #batch_size=32
test_dataloader = DataLoader(test_dataset, batch_size=32) #batch_size=32
# validation_dataloader = DataLoader(validation_dataset, batch_size=32) #batch_size=32

In [21]:
model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [22]:
# Create a custom optimizer using torch.optim.AdamW
custom_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    eps=1e-8,
    weight_decay=0.01,
)

In [35]:
from transformers import Trainer, TrainingArguments
# Define the TrainingArguments for fine-tuning
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    save_total_limit=0,
    eval_steps=50,
    save_steps=15000,
    learning_rate=1e-3,
    do_train=True,
    do_eval=True,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=False,
    lr_scheduler_type="cosine_with_restarts",
    warmup_steps=100,
    weight_decay=0.01,
    #logging_dir='D:\\Datasets\\Thesis Data Test',
    logging_steps=50,

)



In [36]:
# Create a data collator for sequence-to-sequence tasks
data_collator = MyDataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=False,
    max_length=80,
    label_pad_token_id=tokenizer.pad_token_id,
)

In [37]:
# Create Trainer
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(custom_optimizer, None),
)

In [38]:
trainer.train()

Step,Training Loss,Validation Loss
50,1.5851,1.567281
100,1.4671,1.506593
150,1.3528,1.441184
200,1.3179,1.390289
250,1.283,1.370076


TrainOutput(global_step=251, training_loss=1.400039253956768, metrics={'train_runtime': 911.97, 'train_samples_per_second': 11.043, 'train_steps_per_second': 0.275, 'total_flos': 1327161724108800.0, 'train_loss': 1.400039253956768, 'epoch': 0.9965260545905708})

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Correct directory paths
model_output_dir = "/content/drive/Shareddrives/Bootcamp/Model"
tokenizer_output_dir = "/content/drive/Shareddrives/Bootcamp/Model"

# Save the model to the specified directory
model.save_pretrained(model_output_dir)

# Save the tokenizer to the specified directory
tokenizer.save_pretrained(tokenizer_output_dir)

print(f"Model saved to {model_output_dir}")
print(f"Tokenizer saved to {tokenizer_output_dir}")


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_output_dir)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_output_dir)

print("Model and tokenizer have been loaded successfully.")




Model and tokenizer have been loaded successfully.


In [11]:
from tqdm import tqdm
import evaluate

def evaluate_model(model, tokenizer, eval_dataloader, device):
    model.eval()  # Set model to evaluation mode
    model.to(device)

    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate translations
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=80,  # Adjust max_length according to your data
                num_beams=5,  # Beam search for better results
                early_stopping=True
            )

            # Decode predictions
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Store results
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    return predictions, references

In [None]:
predictions, references = evaluate_model(model, tokenizer, test_dataloader, device)

# Display some sample results
for i in range(5):  # Display first 5 samples
    print(f"Input: {val_dataset.input_text[i]}")
    print(f"Prediction: {predictions[i]}")
    print(f"Reference: {references[i]}")
    print("-" * 30)

# Load the BLEU metric for evaluation using the new library
bleu_metric = evaluate.load("bleu")

# Format predictions and references for BLEU metric calculation
bleu_metric.add_batch(
    predictions=[pred.split() for pred in predictions],
    references=[[ref.split()] for ref in references]
)

# Calculate BLEU score
bleu_score = bleu_metric.compute()
print(f"BLEU Score: {bleu_score['bleu'] * 100:.2f}")

Evaluating:  46%|████▌     | 95/208 [4:28:49<5:23:42, 171.88s/it]