In [1]:
pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting huggingface_hub
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from datasets import Dataset, DatasetDict

# Toy question/answer corpus, written as (question, answer) pairs so that each
# example can be read on a single line.
qa_pairs = [
    ("What is your name?", "My name is John."),
    ("How old are you?", "I am 30 years old."),
    ("Where are you from?", "I am from New York."),
]
questions, answers = zip(*qa_pairs)

# Column-oriented view of the same data: one list per feature.
data_dict = {"question": list(questions), "answer": list(answers)}
dataset = Dataset.from_dict(data_dict)

# Fixed split: the first two rows are used for training, the last one for testing.
train_dataset = dataset.select([0, 1])
test_dataset = dataset.select([2])

# Bundle both splits under the "train" / "test" keys.
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})


In [8]:
# Display the splits to check their feature names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 2
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1
    })
})

In [9]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
base = "t5-base"

# The tokenizer and the model are loaded independently from the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(base)
model = T5ForConditionalGeneration.from_pretrained(base)

# The seq2seq collator is constructed with both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 246kB/s]
Downloading model.safetensors: 100%|██████████| 892M/892M [01:17<00:00, 11.5MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 24.7kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.85MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 9.35MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legac

In [23]:
# Task prefixes: preprocess_function prepends prefix1 to every question
# (the model input) and prefix2 to every answer (the label text).
prefix1 = "rationale + "
prefix2 = "label + "

# define preprocessing function
def preprocess_function(examples):
  """Tokenize a batch of question/answer pairs for seq2seq training.

  Every question is prefixed with ``prefix1`` and tokenized (truncated to at
  most 128 tokens) to form the model inputs. Every answer is prefixed with
  ``prefix2`` and tokenized (truncated to at most 512 tokens); its token ids
  are stored under the ``"labels"`` key of the returned encoding.

  Args:
    examples: batch with a ``"question"`` and an ``"answer"`` column, each an
      iterable of strings.

  Returns:
    The tokenizer output for the prefixed questions, extended with ``"labels"``.
  """
  # Source side: the prefixed questions.
  source_texts = [prefix1 + question for question in examples["question"]]
  # Target side: the prefixed answers.
  target_texts = [prefix2 + answer for answer in examples["answer"]]

  encoded = tokenizer(source_texts, max_length=128, truncation=True)
  # Only the token ids of the targets are kept; they become the training labels.
  encoded["labels"] = tokenizer(target_texts, max_length=512, truncation=True)["input_ids"]
  return encoded

# Map the preprocessing function across our datasets. With batched=True the
# function is called on batches of examples, which is why it iterates over
# examples["question"] / examples["answer"] as lists.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████| 2/2 [00:00<00:00, 477.90 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 258.30 examples/s]


In [25]:
# Display the tokenized splits to check the columns added by preprocessing.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

In [12]:
# Use the ROUGE score to evaluate the training process.
# The "punkt" download is fetched for nltk.sent_tokenize, which compute_metrics calls.
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
  """Decode predicted and reference token ids and score them with ROUGE.

  Args:
    eval_preds: pair ``(preds, labels)`` of token-id arrays supplied by the
      trainer. ``preds`` may also be a tuple, in which case only its first
      element (the token ids) is used.

  Returns:
    The result of ``metric.compute`` on the decoded predictions and references.
  """
  preds, labels = eval_preds
  # Some models hand back a tuple; only the first element holds the ids to decode.
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is an ignore/padding marker, not a vocabulary id, and decoding it
  # raises. Labels carry it, and predictions gathered across evaluation batches
  # can be padded with it too, so replace it with the pad token in BOTH arrays.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  # decode preds and labels
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # rougeLSum expects newline after each sentence
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  return result

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 3.86MB/s]


In [13]:
# Training configuration (author's note: sized for a GPU with 16GB VRAM).
training_args = Seq2SeqTrainingArguments(
    # Output and checkpoint handling
    output_dir="./results",
    save_total_limit=3,
    push_to_hub=False,
    # Optimisation
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    # Evaluation: run once per epoch and score generated sequences
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
)

In [26]:
# Assemble the trainer from the configuration, the model and tokenizer,
# the batching/metric helpers and the two tokenized splits.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [27]:
# Start fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

                                             
 50%|█████     | 1/2 [00:02<00:01,  1.84s/it]

{'eval_loss': 1.3975639343261719, 'eval_rouge1': 0.1818181818181818, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1818181818181818, 'eval_rougeLsum': 0.1818181818181818, 'eval_runtime': 0.5976, 'eval_samples_per_second': 1.673, 'eval_steps_per_second': 1.673, 'epoch': 1.0}


                                             
100%|██████████| 2/2 [00:04<00:00,  2.32s/it]

{'eval_loss': 1.0151854753494263, 'eval_rouge1': 0.1818181818181818, 'eval_rouge2': 0.0, 'eval_rougeL': 0.1818181818181818, 'eval_rougeLsum': 0.1818181818181818, 'eval_runtime': 0.5788, 'eval_samples_per_second': 1.728, 'eval_steps_per_second': 1.728, 'epoch': 2.0}
{'train_runtime': 4.6405, 'train_samples_per_second': 0.862, 'train_steps_per_second': 0.431, 'train_loss': 1.6592155694961548, 'epoch': 2.0}





TrainOutput(global_step=2, training_loss=1.6592155694961548, metrics={'train_runtime': 4.6405, 'train_samples_per_second': 0.862, 'train_steps_per_second': 0.431, 'train_loss': 1.6592155694961548, 'epoch': 2.0})