In [27]:
import nltk
import evaluate
import numpy as np
from nltk.tokenize import sent_tokenize
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [28]:
nltk.download("punkt", quiet=True)

True

In [29]:
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [62]:
from datasets import load_dataset
from datasets import Dataset

In [31]:
squad_dataset = load_dataset("squad")

In [32]:
print(squad_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [26]:
# squad_dataset['train']['answer_start'] = squad_dataset['train']['answers']['answer_start']
# squad_dataset['train']['answer_end'] = squad_dataset['train']['answers']['answer_start'] + len(squad_dataset['train']['answers']['answer'])
# squad_dataset['train']['answer'] = squad_dataset['train']['answers']['text']

In [50]:
def tokenize_function(examples, padding="max_length"):
    prompt = "Please answer the following question by reasoning step by step: "
    inputs = [prompt + item for item in examples['question']]
    model_inputs = tokenizer(inputs, max_length=512, padding=padding, truncation=True)
    text_target = [answer["text"][0] for answer in examples["answers"]]
    labels = tokenizer(text_target=text_target, max_length=512, padding=padding, truncation=True)

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else - 100) for l in label] for label in labels['input_ids']
        ]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [51]:
tokenized_dataset = squad_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 87599/87599 [03:22<00:00, 433.15 examples/s]
Map: 100%|██████████| 10570/10570 [00:21<00:00, 486.73 examples/s]


In [52]:
rouge_metric = evaluate.load("rouge")

def postprocess_text(preds,labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds,labels

In [53]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds,tuple):
        preds = preds[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds,decoded_labels = postprocess_text(decoded_preds,decoded_labels)

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


In [56]:

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8)

In [57]:
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 2
NUM_EPOCHS = 3

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="model_folder",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    predict_with_generate=True,
    fp16=False, 
    learning_rate=L_RATE,
    num_train_epochs=NUM_EPOCHS,
    # logging & evaluation strategies
    logging_dir=f"model_folder/answers",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=SAVE_TOTAL_LIM,
    load_best_model_at_end=True,
    report_to="tensorboard",
    push_to_hub=False,
)


In [58]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

In [60]:
trainer.train()

  0%|          | 0/32850 [02:08<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 6.71 GB, other allocations: 83.57 MB, max allowed: 6.77 GB). Tried to allocate 16.00 KB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).