## Device Check

In [1]:
!nvidia-smi

Sun Nov  3 17:54:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1060 ...    Off | 00000000:01:00.0  On |                  N/A |
| N/A   76C    P0              25W /  60W |    556MiB /  6144MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Libs

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import time
import math
import pickle

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from tqdm import tqdm

import os
import os.path as osp
import pytorch_lightning as pl

import random
from rouge_score import rouge_scorer
import sacrebleu
from sacrebleu import corpus_bleu

import sys

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torchinfo
from torchinfo import summary

import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import tokenizers

import warnings
warnings.filterwarnings("ignore")

In [4]:
print("torch:\t\t", torch.__version__)
print("torchinfo:\t", torchinfo.__version__)
print("transformers:\t", transformers.__version__)
print("tokenizers:\t", tokenizers.__version__)
print("Lightning:\t", pl.__version__)
print("sacrebleu:\t", sacrebleu.__version__)

torch:		 2.1.1
torchinfo:	 1.8.0
transformers:	 4.46.1
tokenizers:	 0.20.1
Lightning:	 2.4.0
sacrebleu:	 2.4.3


## Local modules

In [5]:
!ls ..

data  logs  models  notebooks  README.md  src  tests


In [6]:
sys.path.append("../src/")

In [7]:
from config import Config
from dataset import get_dataloader
# from translator import DyulaTranslator

In [8]:
os.environ["TOKENIZERS_PARALLELISM"]= "False"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## Reproducibility

In [9]:
seed = Config.SEED

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if Config.DEVICE =="cuda":
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

print(f"Seed set to {seed}")

Seed set to 2024


## Dataset Preparation & Preprocessing

In [10]:
train = pd.read_csv(osp.join(Config.DATA_DIR, "preprocessed/train.csv"))
valid = pd.read_csv(osp.join(Config.DATA_DIR, "preprocessed/valid.csv"))
test = pd.read_csv(osp.join(Config.DATA_DIR, "preprocessed/test.csv"))

In [11]:
train.head()

Unnamed: 0,dyu,fr,dyu_len,fr_len
0,a bi ji min na,il boit de l’eau,14,16
1,a le dalakolontɛ lon bɛ,il se plaint toujours,23,21
2,mun fɛn dɔ,quoi quelque chose,10,18
3,o bɛ bi bɔra fo gubeta,tous sortent excepté gubetta,22,28
4,a ale lo bi da bugɔ la,ah c’est lui… il sonne…,22,23


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8065 entries, 0 to 8064
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dyu      8065 non-null   object
 1   fr       8065 non-null   object
 2   dyu_len  8065 non-null   int64 
 3   fr_len   8065 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 252.2+ KB


In [13]:
valid.head()

Unnamed: 0,dyu,fr,dyu_len,fr_len
0,i tɔgɔ bi cogodɔ,tu portes un nom de fantaisie,16,29
1,puɛn saba fɔlɔ,trois points d’avance,14,21
2,tile bena,le soleil s’est couché,9,22
3,cogoya kelen,mêmes mouvements,12,16
4,n ma daraka dun ban,je n’ai pas encore déjeuné,19,26


In [14]:
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1471 entries, 0 to 1470
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dyu      1471 non-null   object
 1   fr       1471 non-null   object
 2   dyu_len  1471 non-null   int64 
 3   fr_len   1471 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 46.1+ KB


In [15]:
# Initialize tokenizer and dataloaders
tokenizer = AutoTokenizer.from_pretrained(Config.BACKBONE_MODEL_NAME)

## Dataloaders

In [16]:
train_dataloader = get_dataloader(train, tokenizer, is_train=True)
val_dataloader = get_dataloader(valid, tokenizer, is_train=True)

In [17]:
for i, batch in enumerate(train_dataloader):
    inp, mask, tgt = batch.values()
    print(inp.shape)
    print(mask.shape)
    print(tgt.shape)
    break
    

torch.Size([2, 32])
torch.Size([2, 32])
torch.Size([2, 32])


## Modeling

In [18]:
class DyulaTranslator(pl.LightningModule):
    def __init__(self, tokenizer, model_name:str, learning_rate:float):
        super().__init__()
        self.translator = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

        self.validation_step_outputs = []
        self.validation_step_targets = []

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.translator(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits

    def training_step(self, batch, batch_idx):
        loss, _ = self.forward(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        # Forward pass and loss calculation
        loss, _ = self.forward(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log("val_loss", loss, prog_bar=True, on_step=True, on_epoch=True)

        # Generate predictions
        outputs = self.translator.generate(
            batch['input_ids'], 
            attention_mask=batch['attention_mask'],
            forced_bos_token_id=self.tokenizer.get_lang_id(Config.OUTPUT_LANG)
        )
        preds = [self.tokenizer.decode(g, skip_special_tokens=True) for g in outputs]
        targets = [self.tokenizer.decode(t, skip_special_tokens=True) for t in batch['labels']]

        # Collect predictions and targets for BLEU and ROUGE calculation at epoch end
        outputs = {'preds': preds, 'targets': targets}
        self.validation_step_outputs.append(preds)
        self.validation_step_targets.append(targets)
        
        return {'val_loss': loss, "outputs":outputs}

    def on_validation_epoch_end(self):
        # Aggregate all predictions and targets
        all_preds = sum(self.validation_step_outputs, [])
        all_targets = sum(self.validation_step_targets, [])

        # Calculate BLEU score
        bleu_score = corpus_bleu(all_preds, [all_targets]).score
        self.log("val_bleu", bleu_score, prog_bar=True)

        # Calculate ROUGE scores
        rouge1, rouge2, rougeL = self.compute_rouge_scores(all_preds, all_targets)

        self.log("val_rouge1", rouge1, prog_bar=True)
        self.log("val_rouge2", rouge2, prog_bar=True)
        self.log("val_rougeL", rougeL, prog_bar=True)

        self.validation_step_outputs.clear()  # free memory
        self.validation_step_targets.clear()  # free memory

    def compute_rouge_scores(self, all_preds, all_targets):
        # Calculate ROUGE scores
        rouge1, rouge2, rougeL = 0, 0, 0
        for pred, target in zip(all_preds, all_targets):
            scores = self.rouge_scorer.score(target, pred)
            rouge1 += scores["rouge1"].fmeasure
            rouge2 += scores["rouge2"].fmeasure
            rougeL += scores["rougeL"].fmeasure

        # Average ROUGE scores across the dataset
        rouge1 /= len(all_preds)
        rouge2 /= len(all_preds)
        rougeL /= len(all_preds)

        return rouge1, rouge2, rougeL
        
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [19]:
# Initialize model
model = DyulaTranslator(
    model_name=Config.BACKBONE_MODEL_NAME, 
    learning_rate=Config.LR, 
    tokenizer=tokenizer
)

INFO:absl:Using default tokenizer.


In [20]:
type(model)

__main__.DyulaTranslator

In [21]:
summary(model)

Layer (type:depth-idx)                                            Param #
DyulaTranslator                                                   --
├─M2M100ForConditionalGeneration: 1-1                             --
│    └─M2M100Model: 2-1                                           --
│    │    └─M2M100ScaledWordEmbedding: 3-1                        131,186,688
│    │    └─M2M100Encoder: 3-2                                    282,343,424
│    │    └─M2M100Decoder: 3-3                                    332,748,800
│    └─Linear: 2-2                                                131,186,688
Total params: 877,465,600
Trainable params: 877,465,600
Non-trainable params: 0

In [22]:
def translate_text(
    model, 
    tokenizer, 
    dyula_text, 
    max_length=Config.MAX_LENGTH,
    tgt_out:str=Config.OUTPUT_LANG
):
    model.eval()
    # Tokenize input text
    inputs = tokenizer(dyula_text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    with torch.no_grad():
        # Generate translation
        outputs = model.translator.generate(
            inputs["input_ids"], 
            max_length=max_length, 
            forced_bos_token_id=tokenizer.get_lang_id(tgt_out)
        )

    # Decode and clean up output
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text


In [23]:
# dyu_txt, fr_txt = train.sample(n=1).values[0, :2]
# dyu_txt, fr_txt

In [24]:
# %%time
# translate_text(model, tokenizer, dyu_txt)

## Experiment

In [25]:
# Training configuration
trainer = pl.Trainer(
    max_epochs=Config.EPOCHS, 
    accelerator="gpu",
    precision=16
)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [26]:
trainer.fit(model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type                           | Params | Mode
---------------------------------------------------------------------
0 | translator | M2M100ForConditionalGeneration | 483 M  | eval
---------------------------------------------------------------------
483 M     Trainable params
0         Non-trainable params
483 M     Total params
1,935.622 Total estimated model params size (MB)
0         Modules in train mode
350       Modules in eval mode


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

In [None]:
gdr

## Evaluation

In [None]:
trainer.validate(model=model, dataloaders=val_dataloader)

## Post-Processing

In [None]:
# dyula_sentence = "Your Dyula sentence here."
# english_translation = translate_text(model, tokenizer, dyula_sentence)
# print("Translation:", english_translation)
