In [25]:
!pip install transformers[sentencepiece]==4.28.0 datasets sacrebleu evaluate

[0m

In [26]:
# Log in to the Hugging Face Hub with a token read from Kaggle's secret store, so the
# token itself never appears in the notebook. Later cells push the model to the Hub.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value = user_secrets.get_secret("HF_KEY")  # the secret must be stored under the label "HF_KEY"
login(secret_value)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [27]:
import transformers

# Show which transformers release the kernel actually loaded (the install cell pins 4.28.0).
print(transformers.__version__)

4.28.0


In [28]:
# `datasets.load_metric` is deprecated in favour of the separate `evaluate` package, which
# the install cell already provides. The returned object keeps the same
# `compute(predictions=..., references=...)` interface used by compute_metrics.
import evaluate
metric = evaluate.load("sacrebleu")

In [29]:
# Imported in this cell because no earlier cell brings `load_dataset` into scope;
# without it the call below raises NameError on a freshly started kernel.
from datasets import load_dataset

# Load the "emea" dataset for the English/French language pair.
training_data = load_dataset("emea", lang1="en", lang2="fr")

  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
# Hold out 20% of the loaded train split for evaluation. The fixed seed makes the split
# identical on every run, so evaluation scores stay comparable across kernel restarts.
# NOTE: this rebinds `training_data`; re-run the loading cell before re-running this one,
# otherwise the already reduced train split is split a second time.
training_data = training_data['train'].train_test_split(test_size=0.2, seed=42)

In [31]:
from transformers import AutoTokenizer
# Hub checkpoint name; the same string is reused when building the data collator and loading the model.
checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [32]:
source_lang = "en"
target_lang = "fr"
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of translation pairs.

    Args:
        examples: Batch dict whose "translation" entry is a list of dicts keyed
            by language code; the `source_lang` and `target_lang` keys are read.
        max_length: Token limit handed to the tokenizer (with truncation enabled).
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch, with the
        source texts as inputs and the target texts passed as `text_target`.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

In [33]:
# Apply preprocess_function to `training_data` in batches.
# NOTE: despite its name, `tokenized_train` is indexed with both "train" and "test"
# by the trainer cell, i.e. it holds the tokenized version of every split.
tokenized_train = training_data.map(preprocess_function, batched=True)

  0%|          | 0/875 [00:00<?, ?ba/s]

  0%|          | 0/219 [00:00<?, ?ba/s]

KeyboardInterrupt: 

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator handed to the trainer to assemble batches from the tokenized examples.
# NOTE(review): `model` receives `checkpoint`, which is the checkpoint name string, not the
# model object that a later cell loads — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Trim leading/trailing whitespace from decoded texts.

    Returns a pair: the stripped predictions as a flat list, and the stripped
    labels with each one wrapped in its own single-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and mean prediction length for one evaluation pass.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. If `preds` is a
            tuple, its first element is used.

    Returns:
        Dict with "bleu" (the "score" entry returned by `metric.compute`) and
        "gen_len" (mean number of non-pad token ids per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker rather than a vocabulary id. Replace it with the pad token id
    # in the labels AND in the predictions: a -100 left in the predictions would be handed to
    # the decoder and would be counted as a real token in gen_len below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Convert token ids back into text.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip surrounding whitespace; each label is wrapped in its own list of references.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Length of a prediction = number of ids that are not the pad token.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model from the same checkpoint name the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
import torch

training_args = Seq2SeqTrainingArguments( #Collects hyperparameters
    output_dir="marian_prelim_emea_enfr",
    evaluation_strategy="epoch", #Evaluates at the end of each epoch
    learning_rate=2e-5, #Initial learning rate for AdamW
    per_device_train_batch_size=16, #Minibatch size per device during training
    per_device_eval_batch_size=16, #Batch size per device during evaluation
    weight_decay=0.01, #Weight decay coefficient handed to the optimizer
    save_total_limit=3, #Maximum number of checkpoints kept in output_dir
    num_train_epochs=2,
    predict_with_generate=True, #Evaluate on generated sequences, which compute_metrics decodes for BLEU
    fp16=torch.cuda.is_available(), #Mixed precision only when CUDA is present, so the cell also runs on CPU-only runtimes
    push_to_hub=True,
)

# Bundle the model, hyperparameters, tokenized splits, collator and metric function into one trainer.
trainer = Seq2SeqTrainer( #Saves us from writing our own training loops
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_train["test"], #The held-out split created by train_test_split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, #Called on evaluation outputs to report bleu and gen_len
)

In [None]:
# Run fine-tuning with the arguments configured above (2 epochs, evaluation after each epoch).
trainer.train()

In [None]:
# Upload the trainer's results to the Hugging Face Hub; relies on the Hub login performed earlier in the notebook.
trainer.push_to_hub()