In [60]:
import numpy as np
import multiprocessing

import evaluate
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset
from accelerate import Accelerator

In [2]:
# Number of worker processes for dataset preprocessing: every core but one,
# and never fewer than one worker.
num_cores_avail = max(multiprocessing.cpu_count() - 1, 1)

# Dataset

In [3]:
dataset_checkpoint = "kde4"
dataset_commit_id = "12cd06d961fae220f6ef1ab533321b8e9ddc3533"

In [4]:
# Load the English/French language pair of the dataset, pinned to a fixed
# revision so that re-running the notebook uses the same data.
raw_datasets = load_dataset(dataset_checkpoint, lang1="en", lang2="fr", revision=dataset_commit_id)

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

## Split dataset

In [6]:
# Keep 90% of the rows for training and hold out the rest; the seed is fixed
# so the split is the same on every run.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
# The held-out part is used as a validation set, so rename the "test" key.
split_datasets["validation"] = split_datasets.pop("test")

In [7]:
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

# Model

In [8]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
model_commit_id = "0b11e64c9efac19014daf61fb333354e09000f00"

In [9]:
# Translation pipeline built from the pinned pretrained checkpoint; used in
# the following cells to inspect translations before any fine-tuning.
translator = pipeline("translation", model=model_checkpoint, revision=model_commit_id)

## Translation differences
The pre-trained model tends to produce less formal translations than the dataset references (e.g., it keeps certain English terms such as "threads" and "plugin" in the French output).

In [10]:
split_datasets["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [11]:
translator(split_datasets["train"][1]["translation"]["en"])

[{'translation_text': 'Par défaut pour les threads élargis'}]

In [12]:
split_datasets["train"][172]["translation"]

{'en': 'Unable to import %1 using the OFX importer plugin. This file is not the correct format.',
 'fr': "Impossible d'importer %1 en utilisant le module d'extension d'importation OFX. Ce fichier n'a pas un format correct."}

In [13]:
translator(split_datasets["train"][172]["translation"]["en"])

[{'translation_text': "Impossible d'importer %1 en utilisant le plugin d'importateur OFX. Ce fichier n'est pas le bon format."}]

# Tokenization

In [14]:
# Load the tokenizer that matches the pinned model checkpoint.
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; the tokenizer keeps returning
# plain Python lists unless `return_tensors` is given when it is called.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, revision=model_commit_id)

In [15]:
# Tokenize one English/French pair: the English text becomes the model
# input and the French text (passed as `text_target`) becomes the labels.
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]
inputs = tokenizer(en_sentence, text_target=fr_sentence)

In [16]:
inputs

{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}

In [17]:
# Tokenize the French sentence as if it were an input (i.e. without
# `text_target`) and compare the resulting tokens with the labels produced
# above, to show why targets must go through `text_target`.
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))

['▁Par', '▁défaut', ',', '▁développer', '▁les', '▁fils', '▁de', '▁discussion', '</s>']
['▁Par', '▁dé', 'f', 'aut', ',', '▁dé', 've', 'lop', 'per', '▁les', '▁fil', 's', '▁de', '▁discussion', '</s>']


In [18]:
max_length = 128

In [19]:
def preprocess_function(examples):
    """Tokenize a batch of English/French translation pairs.

    Args:
        examples: Batch whose ``"translation"`` entry is a sequence of
            mappings with ``"en"`` and ``"fr"`` keys.

    Returns:
        The tokenizer output for the batch: English sentences as inputs and
        French sentences as targets, truncated to ``max_length``.
    """
    pairs = examples["translation"]
    source_texts = [pair["en"] for pair in pairs]
    target_texts = [pair["fr"] for pair in pairs]
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

In [20]:
# Tokenize every split in batches, using several worker processes.
# The original columns are dropped so that only the tokenizer outputs remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
    num_proc=num_cores_avail
)

# Model

In [21]:
# Load the pinned pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, revision=model_commit_id)
# Collator that pads each batch dynamically. It is given the model as well as
# the tokenizer; the collated batches inspected below also contain
# `decoder_input_ids`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

In [23]:
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [24]:
# Check for appropriate padding of labels
batch["labels"]

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [25]:
# These should be right-shifted versions of the labels
batch["decoder_input_ids"]

tensor([[59513,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
         59513, 59513, 59513, 59513, 59513, 59513],
        [59513,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [26]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]
[1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]


# Metrics

In [27]:
metric = evaluate.load("sacrebleu")

In [28]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

In [29]:
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [30]:
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

In [31]:
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [32]:
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]

In [33]:
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

In [34]:
def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for generated predictions.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the SacreBLEU score of
        the decoded predictions against the decoded labels.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. It is always present in the labels and can also show up in
    # the predictions when batches of different lengths are gathered, so it
    # is replaced by the pad token in both arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metric expects a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Model fine-tuning

In [35]:
model_checkpoint

'Helsinki-NLP/opus-mt-en-fr'

In [48]:
# Training configuration for the Trainer-based fine-tuning run.
# Evaluation is not scheduled during training ("no"); it is triggered
# explicitly with `trainer.evaluate(...)` before and after training.
# A checkpoint is saved after every epoch.
# `predict_with_generate=True` is needed so that evaluation produces
# generated token ids that `compute_metrics` can decode.
# NOTE(review): `fp16=True` presumably requires a GPU runtime — confirm
# before running on CPU.
args = Seq2SeqTrainingArguments(
    "../temp/07/marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

## Train and evaluate with subsample
Only a subsample is used here; training and evaluating on the full dataset would take too long.

In [42]:
# Sizes of the 10% subsamples of the training and validation splits.
n_train_samp, n_eval_samp = (
    int(0.1 * tokenized_datasets[split_name].num_rows)
    for split_name in ("train", "validation")
)

## Evaluate at start

In [49]:
# Trainer working on fixed subsamples of both splits: each split is shuffled
# with a fixed seed and the first `n_train_samp` / `n_eval_samp` rows are kept.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_samp)),
    eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_samp)),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [46]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.7501263618469238,
 'eval_bleu': 38.24386544642335,
 'eval_runtime': 336.2797,
 'eval_samples_per_second': 6.248,
 'eval_steps_per_second': 0.098}

## Train (fine-tune)

In [50]:
trainer.train()



Step,Training Loss
500,1.0449
1000,0.929
1500,0.8802


TrainOutput(global_step=1776, training_loss=0.9326582985955316, metrics={'train_runtime': 677.6316, 'train_samples_per_second': 83.74, 'train_steps_per_second': 2.621, 'total_flos': 1149328300179456.0, 'train_loss': 0.9326582985955316, 'epoch': 3.0})

In [51]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 1.119015097618103,
 'eval_bleu': 43.30220407438073,
 'eval_runtime': 264.751,
 'eval_samples_per_second': 7.936,
 'eval_steps_per_second': 0.125,
 'epoch': 3.0}

# Train and evaluate model with Accelerate & custom loop

## Dataloaders and model

In [53]:
# Switch the datasets to the "torch" format for the custom training loop.
# Note that this changes `tokenized_datasets` in place.
tokenized_datasets.set_format("torch")

# Same seeded subsample selection as in the Trainer section; the DataLoader
# additionally reshuffles the training subsample.
train_dataloader = DataLoader(
    tokenized_datasets["train"].shuffle(seed=42).select(range(n_train_samp)),
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8
)

# Evaluation subsample, iterated in a fixed order (no DataLoader shuffling).
eval_dataloader = DataLoader(
    tokenized_datasets["validation"].shuffle(seed=42).select(range(n_eval_samp)),
    collate_fn=data_collator,
    batch_size=8
)

### Load initial pretrained model

In [54]:
# Reload the original pretrained weights so the custom loop does not start
# from the model fine-tuned above. The revision is pinned to the same commit
# as every other load of this checkpoint in the notebook, so both training
# approaches start from identical weights.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, revision=model_commit_id)

### Set up optimizers and accelerator

In [59]:
# Optimizer for the custom loop, using the same learning rate (2e-5) as the
# Trainer-based run.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [61]:
# Hand the model, optimizer and both dataloaders to Accelerate; the returned
# objects replace the originals and are the ones to use in the training loop.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)