# Install Libraries

In [None]:
# ! pip install transformers datasets evaluate
# ! pip install sacrebleu
# ! pip install googletrans==3.1.0a0

In [1]:
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk
import evaluate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import pipeline
from googletrans import Translator

  from .autonotebook import tqdm as notebook_tqdm


# Loading Dataset

In [2]:
# Read the dataset that is stored on local disk under ./dataset/aslg_pc12.
dataset = load_from_disk("./dataset/aslg_pc12")

# Training Model

Initialize the pretrained tokenizer that belongs to the pretrained T5 model. This tokenizer is built on SentencePiece.

SentencePiece is an unsupervised text tokenizer and detokenizer, mainly for neural-network-based text generation systems where the vocabulary size is fixed before the neural model is trained. SentencePiece implements subword units (e.g., byte-pair encoding (BPE) [Sennrich et al.] and the unigram language model [Kudo]) with the extension of direct training from raw sentences.

In [None]:
# Tokenizer matching the "t5-small" checkpoint. AutoTokenizer is already
# imported in the notebook's import cell, so it is not re-imported here.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Specify the source and target languages; in this problem we translate from gloss language to English.

Then tokenize the dataset with the `map` method: each instance in the dataset is tokenized and the result is stored in `tokenized_data`. To speed up `map`, I set `batched=True` so that multiple elements of the dataset are processed at once.

In [None]:
# Dataset columns holding the source (gloss) and target (English) strings.
source_lang = "gloss"
target_lang = "text"

# Maximum number of tokens kept per sequence; truncation is enabled below.
MAX_TOKEN_LENGTH = 128


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: Mapping that holds the source strings under ``source_lang``
            and the reference strings under ``target_lang``.

    Returns:
        Whatever ``tokenizer`` returns when called with the sources as inputs
        and the references as ``text_target``.
    """
    inputs = examples[source_lang]
    targets = examples[target_lang]
    # No debug printing here: with ``dataset.map(..., batched=True)`` this
    # function runs once per batch, so a print would flood the cell output.
    return tokenizer(
        inputs,
        text_target=targets,
        max_length=MAX_TOKEN_LENGTH,
        truncation=True,
    )

In [None]:
# Run preprocess_function over the dataset in batches rather than one example at a time.
tokenized_data = dataset.map(function=preprocess_function, batched=True)

Set up the evaluation metric, which is `SacreBLEU`.

At each epoch of the training process, both the model's predicted output and the expected output are decoded. The decoded predictions and references are then passed on to calculate the SacreBLEU score.

In [None]:
# SacreBLEU scorer; compute_metrics calls its .compute() method.
sacrebleu = evaluate.load(path="sacrebleu")

def postprocess_text(preds, labels):
    """Trim whitespace and shape the references for SacreBLEU.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(predictions, references)``: predictions is a flat list of
        stripped strings; references is a list of one-element lists, each
        holding the stripped reference for the prediction at that position.
    """
    stripped_preds = [text.strip() for text in preds]
    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_refs = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a filler value, not a vocabulary id, so it cannot be decoded.
    # It can show up in the generated predictions as well as in the labels,
    # so both arrays get it replaced by the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Generated length counts every token that is not padding; because the
    # filler was replaced above, it no longer inflates this number.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Load T5 with AutoModelForSeq2SeqLM and create a batch of examples using DataCollatorForSeq2Seq

In [None]:
# Pretrained "t5-small" sequence-to-sequence model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Collator that turns tokenized examples into batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Training proceeds in three steps:

1. Define the training hyperparameters in Seq2SeqTrainingArguments, including the output_dir that specifies where to save the model, and other parameters such as the number of training epochs, batch size, learning rate, etc. At the end of each epoch, the Trainer will evaluate the SacreBLEU metric and save the training checkpoint.
2. Pass the training arguments to Seq2SeqTrainer along with the model, dataset, tokenizer, data collator, and compute_metrics function.
3. Call train() to finetune the model.

In [None]:
# Seq2SeqTrainingArguments and Seq2SeqTrainer come from `transformers` and
# torch is imported alongside them in the notebook's import cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="transformer_model",  # where checkpoints are written
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=3,
    # Evaluate on generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; without one, train in full
    # precision instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    # NOTE(review): pushing to the Hub presumably requires being logged in
    # to Hugging Face before this cell runs; confirm.
    push_to_hub=True,
)

# Bundle the model, data splits, tokenizer, collator and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

Push model to Hugging Face Hub

In [None]:
# trainer.push_to_hub()