In [31]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

from pathlib import Path

import torchmetrics
from torch.utils.tensorboard import SummaryWriter
from torchviz import make_dot
from tqdm import tqdm

In [2]:
import sys
sys.path.append('../scripts')

from model import build_transformer
from utils import count_parameters
from config import get_config, get_weights_file_path, latest_weights_file_path

In [3]:
def get_all_sentences(ds, lang):
    for item in ds:
        yield item['translation'][lang]

In [4]:
# Example dataset
dataset = [
    {'id': 1, 'translation': {'en': 'Hello', 'fr': 'Bonjour'}},
    {'id': 2, 'translation': {'en': 'Goodbye', 'fr': 'Au revoir'}}
]

# Define the generator function
def get_all_sentences(ds, lang):
    for item in ds:
        yield item['translation'][lang]

# Usage
lang = 'fr'
for sentence in get_all_sentences(dataset, lang):
    print(sentence)

Bonjour
Au revoir


In [5]:
def get_or_build_tokenizer(config, ds, lang):
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    if not Path.exists(tokenizer_path):
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"])
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer


In [6]:
# Example config
config = {
    'tokenizer_file': 'tokenizer_{}.json'
}

# Example dataset
dataset = [
    {'id': 1, 'translation': {'en': 'Hello world', 'fr': 'Bonjour tout le monde'}},
    {'id': 2, 'translation': {'en': 'How are you?', 'fr': 'Comment ça va ?'}}
]

# Example usage
lang = 'fr'
tokenizer = get_or_build_tokenizer(config, dataset, lang)
print(tokenizer.encode("Bonjour tout le monde yash").tokens)
print(tokenizer.encode("Bonjour tout le monde yash").ids)

['Bonjour', 'tout', 'le', 'monde', '[UNK]']
[5, 9, 7, 8, 0]


In [7]:
class BilingualDataset(Dataset):
    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # since all these token for both lang remain same we can use tokenizer_tgt or tokenizer_src
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    @staticmethod
    def causal_mask(size):
        mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
        return mask == 0

    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, index):
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]
        
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2 # SOS and EOS
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # no EOS or SOS

        if enc_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')
        
        # add SOS and EOS to the source text
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype = torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype = torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype = torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64)
            ]   
        )

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input, #seq_len
            "decoder_input": decoder_input, #seq_len
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & BilingualDataset.causal_mask(decoder_input.size(0)), # (1,seq_len) & (1, seq_len, seq_len)
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text
        }

In [8]:
# Example dataset
dataset = [
    {'id': 1, 'translation': {'en': 'Hello world', 'fr': 'Bonjour tout le monde'}},
    {'id': 2, 'translation': {'en': 'How are you?', 'fr': 'Comment ça va ?'}}
]

# Example config
config = {
    'tokenizer_file': 'tokenizer_{0}.json'
}

tokenizer_src = get_or_build_tokenizer(config, dataset, "en")
tokenizer_tgt = get_or_build_tokenizer(config, dataset, "fr")

# Example usage
lang = 'fr'
bilingual_dataset = BilingualDataset(dataset, tokenizer_src, tokenizer_tgt, 'en', lang, 10)

# Create a DataLoader for batching the data
dataloader = DataLoader(bilingual_dataset, batch_size=1, shuffle=True)

# Iterate over the DataLoader to print batches
for batch in dataloader:
    print("Encoder Input:", batch["encoder_input"].shape)
    print("Decoder Input:", batch["decoder_input"].shape)
    print("Encoder Mask:", batch["encoder_mask"].shape)
    print("Decoder Mask:", batch["decoder_mask"].shape)
    print("Label:", batch["label"].shape)
    print("Source Text:", batch["src_text"])
    print("Target Text:", batch["tgt_text"])
    print("---")

Encoder Input: torch.Size([1, 10])
Decoder Input: torch.Size([1, 10])
Encoder Mask: torch.Size([1, 1, 10])
Decoder Mask: torch.Size([1, 1, 10, 10])
Label: torch.Size([1, 10])
Source Text: ['Hello world']
Target Text: ['Bonjour tout le monde']
---
Encoder Input: torch.Size([1, 10])
Decoder Input: torch.Size([1, 10])
Encoder Mask: torch.Size([1, 1, 10])
Decoder Mask: torch.Size([1, 1, 10, 10])
Label: torch.Size([1, 10])
Source Text: ['How are you?']
Target Text: ['Comment ça va ?']
---


In [9]:
def get_ds(config):
    ds_raw = load_dataset('opus_books', f'{config["lang_src"]}-{config["lang_tgt"]}', split='train')
    # build tokenizers
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])
    # keep 90% for training and 10% for validation
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(
        train_ds_raw, 
        tokenizer_src, 
        tokenizer_tgt, 
        config['lang_src'], 
        config['lang_tgt'], 
        config['seq_len']
    )
    val_ds = BilingualDataset(
        val_ds_raw, 
        tokenizer_src, 
        tokenizer_tgt, 
        config['lang_src'], 
        config['lang_tgt'], 
        config['seq_len']
    )

    max_len_src = 0
    max_len_tgt = 0
    
    for item in ds_raw:
        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_src']]).ids
        
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))

    print(f"Max length for source senstences: {max_len_src}")
    print(f"Max length for target senstences: {max_len_tgt}")

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
    

In [10]:
config = {
    'lang_src': 'en',
    'lang_tgt': 'it',
    'seq_len': 500,
    'batch_size': 32,
    'tokenizer_file': 'tokenizer_{0}.json'
}

train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

# Print some information about the datasets and tokenizers
print(f"Train Dataset Size: {len(train_dataloader.dataset)}")
print(f"Validation Dataset Size: {len(val_dataloader.dataset)}")
print(f"Source Language Tokenizer Vocabulary Size: {len(tokenizer_src.get_vocab())}")
print(f"Target Language Tokenizer Vocabulary Size: {len(tokenizer_tgt.get_vocab())}")

# Iterate over a few batches from the dataloaders to check the data
for batch in train_dataloader:
    print("Encoder Input Shape:", batch["encoder_input"].shape)
    print("Decoder Input Shape:", batch["decoder_input"].shape)
    print("Encoder Mask Shape:", batch["encoder_mask"].shape)
    print("Decoder Mask Shape:", batch["decoder_mask"].shape)
    print("Label Shape:", batch["label"].shape)
    print("---")
    break  # Stop after printing the first batch

Max length for source senstences: 309
Max length for target senstences: 309
Train Dataset Size: 29098
Validation Dataset Size: 3234
Source Language Tokenizer Vocabulary Size: 25138
Target Language Tokenizer Vocabulary Size: 30000
Encoder Input Shape: torch.Size([32, 500])
Decoder Input Shape: torch.Size([32, 500])
Encoder Mask Shape: torch.Size([32, 1, 500])
Decoder Mask Shape: torch.Size([32, 1, 500, 500])
Label Shape: torch.Size([32, 500])
---


In [11]:
def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
    return model

In [12]:
config  = {
    'seq_len': 500,
    'd_model': 512,
}
vocab_src_len = 512
vocab_tgt_len = 512

model = get_model(config, vocab_src_len, vocab_tgt_len)


In [16]:
# def train_model(config):
# Define the Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using Device:", device)

# Make sure the weights folder exists
Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

model = get_model(
    config, 
    tokenizer_src.get_vocab_size(), 
    tokenizer_tgt.get_vocab_size()
).to(device)

# Tensorboard
writer = SummaryWriter(config['experiment_name'])

optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

    # If the user specified a model to preload before training, load it
initial_epoch = 0
global_step = 0
preload = config['preload']
model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None

print(model_filename)


Using Device: cuda
Max length for source senstences: 309
Max length for target senstences: 309
None


In [17]:
if model_filename:
    print(f'Preloading model {model_filename}')
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
else:
    print('No model to preload, starting from scratch')

No model to preload, starting from scratch


In [18]:
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

In [27]:
# Example tensors
logits = torch.tensor([[2.0, 0.5, -0.5], [0.1, 1.0, -1.0], [0.2, 0.3, 0.8]], dtype=torch.float32)
targets = torch.tensor([0, 1, 2], dtype=torch.int64)

# Calculate the loss
loss = loss_fn(logits, targets)

In [33]:
epoch = 0
batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

Processing Epoch 00:   0%|          | 0/3638 [00:00<?, ?it/s]

In [45]:
total_batches = 0
for batch in tqdm(batch_iterator):
    print(batch.keys())
    print(f"encoder_input_{batch['encoder_input'].shape}")
    print(f"decoder_input_{batch['decoder_input'].shape}")
    print(f"encoder_mask_{batch['encoder_mask'].shape}")
    print(f"decoder_mask{batch['decoder_mask'].shape}")
    print(f"label{batch['label'].shape}")
    print(f"src_text_{len(batch['src_text'])}")
    print(f"tgt_text_{len(batch['tgt_text'])}")
    break

  0%|          | 0/3638 [00:00<?, ?it/s]

dict_keys(['encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask', 'label', 'src_text', 'tgt_text'])
encoder_input_torch.Size([8, 350])
decoder_input_torch.Size([8, 350])
encoder_mask_torch.Size([8, 1, 350])
decoder_masktorch.Size([8, 1, 350, 350])
labeltorch.Size([8, 350])
src_text_8
tgt_text_8





In [46]:
# for epoch in range(initial_epoch, config['num_epochs']):
    
#     torch.cuda.empty_cache()
#     model.train()
#     batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    
#     for batch in batch_iterator:

#         encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
#         decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
#         encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
#         decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

#         # Run the tensors through the encoder, decoder and the projection layer
#         encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
#         decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
#         proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

#         # Compare the output with the label
#         label = batch['label'].to(device) # (B, seq_len)

#         # Compute the loss using a simple cross entropy
#         loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
#         batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

#         # Log the loss
#         writer.add_scalar('train loss', loss.item(), global_step)
#         writer.flush()

#         # Backpropagate the loss
#         loss.backward()

#         # Update the weights
#         optimizer.step()
#         optimizer.zero_grad(set_to_none=True)

#         global_step += 1

Processing Epoch 00:   0%|          | 0/3638 [00:06<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
