# Finetuning FLAN-T5

---

I referenced and adapted Hugging Face's [Translation Task Guide](https://huggingface.co/docs/transformers/tasks/translation).

## Install and import packages

In [None]:
!pip install -qU transformers[torch]
!pip install -qU evaluate
!pip install -qU sacrebleu
!pip install -q --upgrade accelerate
!pip install -qU huggingface_hub
!pip install -qU datasets
!pip show accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
import huggingface_hub
import pandas as pd
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate
import torch
from datasets import Dataset

In [None]:
# Read the Hugging Face write token from the HF_TOKEN environment variable
# rather than pasting the secret into the notebook, where it is easily
# committed or shared by accident. When HF_TOKEN is unset, `token` is None
# and `login` prompts for the token interactively instead.
import os

huggingface_hub.login(token=os.environ.get("HF_TOKEN"),
                      add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Tokenize the finetuning dataset

---

First, we need to prepare the data to be tokenized. This includes adding a translation instruction prefix and splitting our dataset into train and test splits.

In [None]:
# Import English-Hawaiian Pidgin parallel corpus, drop rows with missing
# values, and reshuffle the rows. The fixed random_state makes the shuffle
# reproducible across notebook runs.
eng_hwp_df = pd.read_csv("./kjv_hwp.csv")
eng_hwp_df = eng_hwp_df.dropna().sample(frac=1, random_state=42).reset_index(drop=True)

# Prepend the task instruction to an English sentence
def add_prefix(text):
    """Return `text` with the English-to-Hawaiian-Pidgin instruction in front."""
    return "translate English to Hawaiian Pidgin: " + text

# Prepend the translation instruction to every English source sentence
eng_hwp_df["eng"] = eng_hwp_df["eng"].apply(add_prefix)

# Convert from dataframe to Hugging Face dataset
eng_hwp_ds = Dataset.from_pandas(eng_hwp_df)
# 15% of the dataset will be used to test. The fixed seed keeps the
# train/test membership identical across runs so scores stay comparable.
eng_hwp_ds = eng_hwp_ds.train_test_split(test_size=0.15, seed=42)

Now we import the tokenizer and see how the texts are tokenized:

In [None]:
# Name of the pretrained checkpoint to finetune; reused further down to load the model.
checkpoint = "google/flan-t5-base"
# Load the tokenizer published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

(…)-base/resolve/main/tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…)flan-t5-base/resolve/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

(…)ase/resolve/main/special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Tokenize one training pair and print the sub-word pieces of each side
# (English source first, then the Hawaiian Pidgin target).
sample = eng_hwp_ds["train"][1]
eng = tokenizer(sample["eng"])
hwp = tokenizer(sample["hwp"])

for encoding in (eng, hwp):
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))

['▁translate', '▁English', '▁to', '▁Hawaiian', '▁Pi', 'd', 'g', 'in', ':', '▁And', '▁now', '▁I', '▁stand', '▁and', '▁am', '▁judge', 'd', '▁for', '▁the', '▁hope', '▁of', '▁the', '▁promise', '▁made', '▁of', '▁God', ',', '▁un', 'to', '▁our', '▁father', 's', ':', '</s>']
['▁An', '▁now', '▁I', '▁stay', '▁standing', '▁', 'ova', '▁', 'he', 'a', '▁in', '▁front', '▁you', '▁cu', 'z', '▁I', '▁', 'tru', 's', '▁an', '▁wait', '▁fo', '▁da', '▁stuff', '▁God', '▁we', 'n', '▁promise', '▁our', '▁', 'ance', 'sta', '▁guys', ',', '▁', 'a', 'z', '▁why', '.', '</s>']


I used an English tokenizer for both the English and Hawaiian Pidgin datasets. Overall it does not seem too bad, but it does split Pidgin words such as "wen" and "hea" into multiple sub-word tokens.

Now we move on to actually tokenizing the entire dataset:

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    The "eng" column becomes the model inputs and the token ids of the "hwp"
    column are attached under "labels". Both sides are truncated to at most
    128 tokens.
    """
    encoded = tokenizer(examples["eng"], max_length=128, truncation=True)
    target_encoding = tokenizer(examples["hwp"], max_length=128, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize both splits in batches and drop the raw "eng"/"hwp" text columns.
tokenized_dataset = eng_hwp_ds.map(preprocess_function, batched=True, remove_columns=["eng", "hwp"])

Map:   0%|          | 0/6715 [00:00<?, ? examples/s]

Map:   0%|          | 0/1185 [00:00<?, ? examples/s]

To dynamically pad the sentences in each batch to the length of the longest sentence in that batch, set up a data collator:

In [None]:
# Collator that pads inputs and labels batch by batch.
# NOTE(review): `model` receives `checkpoint`, which is the checkpoint *name*
# (a str), not a model instance -- confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

To measure how well our model performs at the end of each epoch, we can call an evaluator.

In [None]:
# Load the SacreBLEU metric; compute_metrics below reads it through this global.
metric = evaluate.load("sacrebleu")

# Strip surrounding whitespace before scoring
def postprocess_text(preds, labels):
    """Return whitespace-stripped predictions and references.

    Each reference is wrapped in its own single-element list, so the second
    item of the returned pair is a list of lists.
    """
    return (
        [hypothesis.strip() for hypothesis in preds],
        [[reference.strip()] for reference in labels],
    )

# Calculate bleu score and length
def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels, and predictions gathered
    # across batches can be padded with it too. It is not a valid token id,
    # so replace it with the pad token id before decoding either array.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

I used sacrebleu for this version of the model, but I also experimented with rouge. This is the code for rouge:

In [None]:
# Alternative to the sacrebleu cell: install rouge_score and load ROUGE instead.
!pip install -qU rouge_score

# NOTE: this rebinds the global `metric` that the sacrebleu cell also assigns,
# so whichever of the two cells ran last decides what training evaluates with.
metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Return whitespace-stripped predictions and references.

    Each reference is wrapped in its own single-element list, so the second
    item of the returned pair is a list of lists.
    """
    return (
        [hypothesis.strip() for hypothesis in preds],
        [[reference.strip()] for reference in labels],
    )

def compute_metrics(eval_preds):
    """Score generated translations against the references with ROUGE-2.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict with ``"rouge2"`` (the aggregated ROUGE-2 score) and
        ``"gen_len"`` (mean number of non-padding tokens per prediction),
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels, and predictions gathered
    # across batches can be padded with it too. It is not a valid token id,
    # so replace it with the pad token id before decoding either array.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_aggregator=True)
    result = {"rouge2": result["rouge2"]}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Now we can train the model!

In [None]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training configuration for the finetuning run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the install cell upgrades transformers, so confirm this
# argument is still accepted by the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-base-eng-hwp-kjv",
    evaluation_strategy="epoch",     # evaluate once per epoch
    learning_rate=3e-4,     # Higher learning rate is recommended with AdamW Optimizer
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    predict_with_generate=True,      # compute_metrics decodes the predictions as token ids
    push_to_hub=True,
    hub_private_repo=True
)

# Wire the model, tokenized splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],    # the 15% held-out split
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the finetuning loop.
trainer.train()

(…)le/flan-t5-base/resolve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

(…)base/resolve/main/generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.607687,3.732,18.8506
2,2.131400,1.457236,4.2893,18.8557
3,1.507900,1.397816,4.6504,18.8599
4,1.294500,1.378834,4.8595,18.8641
5,1.138700,1.384133,4.907,18.8819
6,1.014200,1.377558,5.0933,18.8743
7,1.014200,1.391183,5.1246,18.8726
8,0.902400,1.415824,5.1468,18.8692
9,0.822700,1.440291,5.1846,18.865
10,0.749000,1.468469,5.0892,18.8844




TrainOutput(global_step=6300, training_loss=0.9866964358375186, metrics={'train_runtime': 5834.5908, 'train_samples_per_second': 17.263, 'train_steps_per_second': 1.08, 'total_flos': 9018647532251136.0, 'train_loss': 0.9866964358375186, 'epoch': 15.0})

In [None]:
# Upload the trained model to the Hugging Face Hub
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

'https://huggingface.co/claudiatang/flan-t5-base-eng-hwp-kjv/tree/main/'