In [2]:
templates = [
        ("Please answer this question: {question}", "{answer}"),
        ("{question}", "{answer}"),
        ("Write the answer: {question}", "{answer}"),
        ("What is the answer: {question}", "{answer}"),
        ("Answer this question.\n\n{question}", "{answer}"),
        ("Answer the following question. {question}", "{answer}"),
        ("Question: {question}\nAnswer:", "{answer}"),
        ("{question}???", "{answer}"),
        ("Trivia question: {question}\nAnd the answer is?", "{answer}"),
        ("{question}\nWhat is the answer?", "{answer}"),
    ]

In [12]:
for idx, template in enumerate(templates):
    print(f"\n# Instruction #{idx+1}")
    print('\n'.join(template)) 


# Instruction #1
Please answer this question: {question}
{answer}

# Instruction #2
{question}
{answer}

# Instruction #3
Write the answer: {question}
{answer}

# Instruction #4
What is the answer: {question}
{answer}

# Instruction #5
Answer this question.

{question}
{answer}

# Instruction #6
Answer the following question. {question}
{answer}

# Instruction #7
Question: {question}
Answer:
{answer}

# Instruction #8
{question}???
{answer}

# Instruction #9
Trivia question: {question}
And the answer is?
{answer}

# Instruction #10
{question}
What is the answer?
{answer}


In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the custom dataset
dataset = load_dataset("yahoo_answers_qa")

prompt_template = """
Please answer this question: 

{question}

Answer:

{answer}
"""

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 87362
    })
})

In [15]:
def _row_to_instruction(row):
    row['instruction'] = prompt_template.format(question=row["question"], answer=row["answer"])
    return row
instruction_dataset = dataset.map(_row_to_instruction)

Map: 100%|██████████| 87362/87362 [00:08<00:00, 10144.59 examples/s]


In [20]:
print(instruction_dataset['train'][1]['instruction'])


Please answer this question: 

How to get rid of a beehive?

Answer:

Call an area apiarist.  They should be able to help you and would most likely remove them at no charge in exchange for the hive.  The bees have value and they now belong to you.



In [31]:
PIPELINE_TEMPLATE_FILE = ! dirname $(gcloud artifacts files list \
  --repository=google-cloud-registry \
  --project=ml-pipeline \
  --location=us \
  --package=t5-finetuning \
  --sort-by=~UPDATE_TIME \
  --format="value(FILE)" \
  --limit=1)
PIPELINE_TEMPLATE_FILE = PIPELINE_TEMPLATE_FILE[0]
PIPELINE_TEMPLATE_URI = f"https://us-kfp.pkg.dev/ml-pipeline/google-cloud-registry/{PIPELINE_TEMPLATE_FILE}"
PIPELINE_TEMPLATE_URI

'https://us-kfp.pkg.dev/ml-pipeline/google-cloud-registry/t5-finetuning/sha256:4bce8d9de89913a5d97ab803e8f0941c916a054a50539a749b00f3f0b0179a83'

https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pipeline_templates_t5x.ipynb

- Preparing dataset
    - [ ] Download dataset
    - [ ] Add instructions
    - [ ] Split to train and test
    - [ ] Convert dataset to TFRecords

- Tuning
    - [ ] 1. Model Garden & Pipeline Run
    - [ ] 2. Vertex AI Pipeline SDK
    - [ ] 3. Vertex Training Custom Job with HuggingFace 
    
- Evaluation
    - [ ] Prepare evaluation dataset
    - [ ] Define metrics
    - [ ] Run evaluation
    - [ ] Visualize results

## Bring your own tuning script

In [None]:
%%bash
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub
pip install evaluate[evaluator]

In [None]:
import nltk
import evaluate
import numpy as np

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

In [None]:
from datasets import load_dataset
from transformers import (T5ForConditionalGeneration, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer,
                          T5Tokenizer, DataCollatorForSeq2Seq)

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Acquire the training data from Hugging Face
DATASET_NAME = "yahoo_answers_qa"
dataset = load_dataset(DATASET_NAME)
dataset = yahoo_answers_qa["train"].train_test_split(test_size=0.3)

# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Tokenize text to tokens
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
prompt_template = """Please answer this question: \n\n{question}\n\nAnswer:"""

def preprocess(rows):
    """Add instructions via prompt template, tokenize the text, and set the labels"""
    inputs = [prompt_template.format(question=q) for q in rows["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # The "labels" are the tokenized outputs:
    labels = tokenizer(text_target=rows["answer"], 
                       max_length=512,
                       truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Define data collator to pad inputs and labels
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
)

trainer.train()

In [None]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Acquire the training data from Hugging Face
DATA_NAME = "yahoo_answers_qa"
yahoo_answers_qa = load_dataset(DATA_NAME)
yahoo_answers_qa = yahoo_answers_qa["train"].train_test_split(test_size=0.3)

In [None]:
# Map the preprocessing function across our dataset
tokenized_dataset = yahoo_answers_qa.map(preprocess_function, batched=True)

In [None]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

In [None]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [None]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
# Configure Vertex AI custom job with container image spec
job = aiplatform.CustomContainerTrainingJob(display_name=JOB_NAME,
                                            container_uri=TRAIN_IMAGE_URI)

# Submit the custom job to Vertex AI training service
model = job.run(replica_count=1,
                machine_type="n1-standard-8",
                accelerator_type="NVIDIA_TESLA_V100",
                accelerator_count=1,
                sync=True)



### Evaluation

In [1]:
from datasets import load_dataset

DATA_NAME = "yahoo_answers_qa"
dataset = load_dataset(DATA_NAME)
dataset = dataset["train"].train_test_split(test_size=0.3)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _row_to_instruction(row):
    row['context'] = prompt_template.format(question=row["question"])
    return row
prompt_template = """Please answer this question: \n\n{question}\n\nAnswer:"""
eval_dataset = dataset["test"].map(_row_to_instruction)
eval_dataset = eval_dataset.remove_columns(["question", "main_category", "nbestanswers"])

Map: 100%|██████████| 26209/26209 [00:03<00:00, 7276.40 examples/s]


In [3]:
eval_dataset

Dataset({
    features: ['id', 'answer', 'context'],
    num_rows: 26209
})

In [4]:
from evaluate import evaluator
task_evaluator = evaluator("question-answering")

In [7]:
task_evaluator.compute?

[0;31mSignature:[0m
[0mtask_evaluator[0m[0;34m.[0m[0mcompute[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel_or_pipeline[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'Pipeline'[0m[0;34m)[0m[0;34m,[0m [0mCallable[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'PreTrainedModel'[0m[0;34m)[0m[0;34m,[0m [0mForwardRef[0m[0;34m([0m[0;34m'TFPreTrainedModel'[0m[0;34m)[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mdatasets[0m[0;34m.[0m[0marrow_dataset[0m[0;34m.[0m[0mDataset[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msubset[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline

MODEL_NAME = "google/flan-t5-base"
tokenizer=T5Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.cls_token = tokenizer.pad_token

eval_results = task_evaluator.compute(
    model_or_pipeline=MODEL_NAME,
    tokenizer=tokenizer,
    data=eval_dataset,
    id_column='id',
    question_column='context',
    label_column='answer',
    squad_v2_format=False
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
eval_results