# Training Model From Pre-trained Model

## Dataset

Dataset was cleaned and uploaded to huggingface at `thisisfrantz/haitian-creole-english-train` for the train set and `thisisfrantz/haitian-creole-english-test` for the test set. 

"koman _" -> "koman ou ye"

In [1]:
from datasets import load_dataset

dataset = load_dataset("thisisfrantz/haitian-creole-english-train")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(dataset)
print(dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'lang1', 'lang2'],
        num_rows: 10813
    })
})
{'id': 3042, 'lang1': 'Lidè Kiben an te di, ata John Kennedy dwe cheche fason pou kontoune anbago a.', 'lang2': 'Even John F. Kennedy had to find a way around the embargo, the Cuban leader said.'}


## Load Pretrained Tokenizer

I wanted to create a custom tokenizer but don't have enough data :( .

In [3]:
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ht-en")



In [4]:

example = dataset['train'][0]

source_text = example['lang1']
target_text = example['lang2']
print("Source text:", source_text)
print("Target text:", target_text)

source_tokens = tokenizer(source_text)
source_ids = tokenizer.convert_tokens_to_ids(source_tokens)

# Tokenize target (as target tokenizer)
with tokenizer.as_target_tokenizer():
    target_tokens = tokenizer.tokenize(target_text)
    target_ids = tokenizer.convert_tokens_to_ids(target_tokens)

print("\nSource Tokens:", source_tokens)
print("Source Token IDs:", source_ids)

print("\nTarget Tokens:", target_tokens)
print("Target Token IDs:", target_ids)

Source text: Lidè Kiben an te di, ata John Kennedy dwe cheche fason pou kontoune anbago a.
Target text: Even John F. Kennedy had to find a way around the embargo, the Cuban leader said.

Source Tokens: {'input_ids': [116, 16401, 61, 14693, 32, 7, 48, 2, 8611, 424, 29389, 9732, 108, 20212, 113, 14, 300, 2556, 4424, 276, 5887, 8, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Source Token IDs: [1, 1]

Target Tokens: ['▁Even', '▁John', '▁F', '.', '▁Kenne', 'dy', '▁had', '▁to', '▁find', '▁a', '▁way', '▁around', '▁the', '▁emb', 'ar', 'go', ',', '▁the', '▁Cuba', 'n', '▁leader', '▁said', '.']
Target Token IDs: [871, 424, 1316, 3, 29389, 9732, 129, 10, 504, 8, 222, 1293, 6, 30292, 4423, 5887, 2, 6, 22471, 430, 5026, 260, 3]




In [5]:
# Tokenize the whole dataset
def tokenize_function(example):
    inputs = tokenizer(example['lang1'], truncation=True, padding='max_length', max_length=128)
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(example['lang2'], truncation=True, padding='max_length', max_length=128)
    inputs['labels'] = targets['input_ids']
    return inputs

# preprocess the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 10813/10813 [00:05<00:00, 2015.91 examples/s]


## DataLoader

In [6]:
from torch.utils.data import DataLoader

# PyTorch Format
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create DataLoader
train_loader = DataLoader(tokenized_dataset['train'], batch_size=8, shuffle=True)

## Full Training Loop

In [11]:

def create_dataloaders(dataset_name, batch_size=8):
    train_dataset = load_dataset(dataset_name + '-train', split='train')
    test_dataset = load_dataset(dataset_name + '-test', split='test')

    # Tokenize the datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Set the format for PyTorch
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, test_dataloader

In [12]:
from argparse import Namespace

config = {
    'learning_rate': 5e-5,
    'max_train_steps': 1000,
    'num_warmup_steps': 100,
    'num_train_epochs': 3,
}

args = Namespace(**config)

In [14]:
from torch.optim import AdamW
from transformers import MarianMTModel, MarianTokenizer, get_scheduler

# Load tokenizer & model
model_name = "Helsinki-NLP/opus-mt-ht-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# dataset and autoloader
train_dataloader, test_dataloader = create_dataloaders("thisisfrantz/haitian-creole-english")

# Number of training steps
num_training_steps = len(train_dataloader) * args.num_train_epochs

# Optimizer
optimizer = AdamW(model.parameters(), lr=args.learning_rate)

# Scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=num_training_steps,
)

Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 10813/10813 [00:06<00:00, 1791.79 examples/s]
Map: 100%|██████████| 2704/2704 [00:01<00:00, 2004.78 examples/s]


## Training Loop

In [15]:
from tqdm import tqdm
import torch

# Send model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(args.num_train_epochs):
    model.train()
    total_train_loss = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Avg Training Loss: {avg_train_loss:.4f}")

Epoch 1: 100%|██████████| 1352/1352 [44:36<00:00,  1.98s/it, loss=0.58] 


Epoch 1 - Avg Training Loss: 1.1024


Epoch 2: 100%|██████████| 1352/1352 [18:20:50<00:00, 48.85s/it, loss=0.707]       


Epoch 2 - Avg Training Loss: 0.8171


Epoch 3: 100%|██████████| 1352/1352 [43:48<00:00,  1.94s/it, loss=0.365]

Epoch 3 - Avg Training Loss: 0.7351





## Evaluation

In [16]:
import math
model.eval()
total_val_loss = 0

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        total_val_loss += outputs.loss.item()

avg_val_loss = total_val_loss / len(test_dataloader)
perplexity = math.exp(avg_val_loss)

print(f"Epoch {epoch+1} - Val Loss: {avg_val_loss:.4f} | Perplexity: {perplexity:.2f}")

Epoch 3 - Val Loss: 0.8019 | Perplexity: 2.23
