# Training Model From Pre-trained Model

## Dataset

The dataset was cleaned and uploaded to the Hugging Face Hub as `thisisfrantz/haitian-creole-english-train` (train set) and `thisisfrantz/haitian-creole-english-test` (test set).

"koman _" -> "koman ou ye"

In [None]:
from datasets import load_dataset

# Fetch the cleaned training corpus by its repository name.
dataset = load_dataset(path="thisisfrantz/haitian-creole-english-train")

In [None]:
# Show the dataset summary, then the first example of the train split.
print(dataset, dataset['train'][0], sep='\n')

## Load Pretrained Tokenizer

I wanted to create a custom tokenizer but didn't have enough data :( so the pretrained tokenizer is used instead.

In [None]:
from transformers import MarianTokenizer

# Reuse the tokenizer published with the pretrained checkpoint.
tokenizer = MarianTokenizer.from_pretrained(
    pretrained_model_name_or_path="Helsinki-NLP/opus-mt-ht-en",
)

In [None]:

# Inspect how a single sentence pair from the train split is tokenized.
example = dataset['train'][0]

source_text = example['lang1']
target_text = example['lang2']
print("Source text:", source_text)
print("Target text:", target_text)

# Split the source sentence into subword tokens first: convert_tokens_to_ids
# expects a list of token strings, not the BatchEncoding (a dict of
# input_ids / attention_mask) that calling tokenizer(...) directly returns.
source_tokens = tokenizer.tokenize(source_text)
source_ids = tokenizer.convert_tokens_to_ids(source_tokens)

# Tokenize target (as target tokenizer)
with tokenizer.as_target_tokenizer():
    target_tokens = tokenizer.tokenize(target_text)
    target_ids = tokenizer.convert_tokens_to_ids(target_tokens)

print("\nSource Tokens:", source_tokens)
print("Source Token IDs:", source_ids)

print("\nTarget Tokens:", target_tokens)
print("Target Token IDs:", target_ids)

In [None]:
# Tokenize the whole dataset
def tokenize_function(example):
    """Tokenize source/target text into model inputs and loss labels.

    Args:
        example: A mapping with ``'lang1'`` (source text) and ``'lang2'``
            (target text). Each value may be a single string or, when used
            with ``Dataset.map(..., batched=True)``, a list of strings.

    Returns:
        The tokenizer output for the source text (``input_ids`` and
        ``attention_mask``) with an added ``labels`` entry holding the
        target token IDs. Both sides are truncated/padded to 128 tokens.
        Padding positions in ``labels`` are set to -100, the label value
        that Hugging Face seq2seq models exclude from the loss.
    """
    # ``text_target`` encodes the target side in the same call and stores its
    # IDs under 'labels'; it replaces the deprecated as_target_tokenizer().
    model_inputs = tokenizer(
        example['lang1'],
        text_target=example['lang2'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )

    pad_id = tokenizer.pad_token_id
    labels = model_inputs['labels']
    if labels and isinstance(labels[0], int):
        # Single (un-batched) example: labels is a flat list of token IDs.
        model_inputs['labels'] = [
            -100 if token_id == pad_id else token_id for token_id in labels
        ]
    else:
        # Batched input: labels is a list of per-example token ID lists.
        # Without this masking the loss would also be computed over padding.
        model_inputs['labels'] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in labels
        ]
    return model_inputs

# Run tokenize_function over the dataset, a batch of examples at a time.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

## DataLoader

In [None]:
from torch.utils.data import DataLoader

# Return only the model input columns, formatted as torch tensors.
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Shuffled batches of 8 examples drawn from the train split.
train_loader = DataLoader(dataset=tokenized_dataset['train'], shuffle=True, batch_size=8)

## Full Training Loop

In [None]:

def create_dataloaders(dataset_name, batch_size=8):
    """Build the train and test DataLoaders for a pair of datasets.

    The train data is loaded from ``dataset_name + '-train'`` (split
    ``'train'``) and the test data from ``dataset_name + '-test'`` (split
    ``'test'``). Each is tokenized with ``tokenize_function`` and formatted
    as torch tensors restricted to the model input columns.

    Args:
        dataset_name: Common prefix of the two dataset names.
        batch_size: Number of examples per batch for both loaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. Only the train
        loader shuffles its data.
    """
    tensor_columns = ['input_ids', 'attention_mask', 'labels']
    loaders = []

    # Each entry is (split name, whether that loader shuffles).
    for split_name, shuffle in (('train', True), ('test', False)):
        split_data = load_dataset(dataset_name + '-' + split_name, split=split_name)
        split_data = split_data.map(tokenize_function, batched=True)
        split_data.set_format(type='torch', columns=tensor_columns)
        loaders.append(DataLoader(split_data, batch_size=batch_size, shuffle=shuffle))

    train_dataloader, test_dataloader = loaders
    return train_dataloader, test_dataloader

In [None]:
from argparse import Namespace

# Training hyperparameters, kept as a plain dict and mirrored below as
# attribute-style ``args`` for convenient access.
config = dict(
    learning_rate=5e-5,
    max_train_steps=1000,
    num_warmup_steps=100,
    num_train_epochs=3,
)

args = Namespace(**config)

In [None]:
from torch.optim import AdamW
from transformers import MarianMTModel, MarianTokenizer, get_scheduler

# Pretrained checkpoint that supplies both the tokenizer and the model
model_name = "Helsinki-NLP/opus-mt-ht-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Train and test dataloaders
train_dataloader, test_dataloader = create_dataloaders(
    dataset_name="thisisfrantz/haitian-creole-english",
)

# Total optimizer steps: one per batch, for every epoch
num_training_steps = args.num_train_epochs * len(train_dataloader)

# Optimizer over all model parameters
optimizer = AdamW(params=model.parameters(), lr=args.learning_rate)

# Linear learning-rate schedule with warmup
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=num_training_steps,
)

## Training Loop

In [None]:
from tqdm import tqdm
import torch

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(args.num_train_epochs):
    model.train()
    total_train_loss = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        # Every tensor in the batch has to live on the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the model returns the loss because the batch has labels
        outputs = model(**batch)
        loss = outputs.loss

        # Backpropagate, update weights and learning rate, then clear gradients
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running total and the bar
        step_loss = loss.item()
        total_train_loss += step_loss
        progress_bar.set_postfix(loss=step_loss)

    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Avg Training Loss: {avg_train_loss:.4f}")

## Evaluation

In [None]:
import math
# Put the model in evaluation mode and accumulate the loss over the test set
model.eval()
total_val_loss = 0

# No gradients are needed while evaluating
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        batch_loss = outputs.loss.item()
        total_val_loss += batch_loss

# Mean per-batch loss, and the perplexity derived from it
avg_val_loss = total_val_loss / len(test_dataloader)
perplexity = math.exp(avg_val_loss)

# ``epoch`` still holds the index of the last training epoch
print(f"Epoch {epoch+1} - Val Loss: {avg_val_loss:.4f} | Perplexity: {perplexity:.2f}")

# Analysis

In [None]:
text = "koman ou ye?"

# Tokenize the input text and move the tensors to the model's device.
# The model was moved to the GPU (when available) for training, and
# generate() fails if its inputs sit on a different device.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Translation
translation = model.generate(**inputs)

# Decode the generated tokens
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)
print(f"Input Text: {text}")
print(f"Translated Text: {translated_text}")

In [None]:
text = "Mwen kontan?"

# Tokenize the input text and move the tensors to the model's device.
# The model was moved to the GPU (when available) for training, and
# generate() fails if its inputs sit on a different device.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Translation for top 5 translations using beam search
translated = model.generate(
    **inputs,
    num_beams=5,
    num_return_sequences=5,
    early_stopping=True
)

# Decode all returned sequences at once and print them, separated by a rule
translations = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(('\n'+'='*80 + '\n').join(translations))