# 0.0 Install Required Packages

In [9]:
!pip install --upgrade --quiet datasets transformers accelerate fsspec gcsfs

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


# 1.0 Import Required Packages

In [12]:
from tqdm import tqdm
import re

from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)

from datasets import load_dataset

import evaluate

# 2.0 Dataset Preparation

## 2.1 Load SQuAD Dataset
The Stanford Question Answering Dataset (SQuAD) is a popular benchmark dataset in the field of natural language processing (NLP) and machine reading comprehension. It was developed by researchers at Stanford University. SQuAD consists of a large collection of real questions posed by crowdworkers on a set of Wikipedia articles, where each question is paired with a corresponding passage from the article, and the answer to each question is a segment of text from the corresponding passage.

The goal of SQuAD is to train and evaluate machine learning models to understand and answer questions posed in natural language. It has been widely used as a benchmark for evaluating the performance of various question answering systems and models, including both rule-based systems and deep learning-based approaches such as neural network models.

In [13]:
squad = load_dataset('squad')
squad



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

## 2.2 A Look at the Dataset

In [4]:
squad['train'].features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

In [5]:
example = squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.3 Highlight Answers in `context`
I followed Chan and Fan (2019) by introducing highlight token `<h>` to take into account an answer `a` within context `c` as below:

$x = [c_1, ..., <h>, a_1, ..., a_a, <h>, ..., c_c]$


In [6]:
def highlight_answer(example):
    """
    Highlight the answer in the context of the given example.

    Parameters:
        - example (dict): A dictionary containing 'context' and 'answers' keys.

    Returns:
        - dict: A dictionary with a single key 'answer_highlighted_context',
        where the value is the context with the answer highlighted by '<h>' tags.

    Example:
    >>> example = {'context': 'The quick brown fox jumps over the lazy dog.',
    ...            'answers': {'text': ['fox']}}
    >>> highlight_answer(example)
    {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over the lazy dog.'}
    """

    context = example['context']
    answer = example['answers']['text'][0]
    context_splits = context.split(answer)

    text = ""

    for split in context_splits:
        text += split
        text += ' <h> '
        text += answer
        text += ' <h> '
        text += split

    return {'answer_highlighted_context': text}

In [7]:
answer_highlighted_squad = squad.map(highlight_answer)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
example = answer_highlighted_squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.4 Create Instruction Promot

In [15]:
def prepare_instruction_dataset(example):
    """
    Prepare an instruction dataset for a given example.

    Parameters:
        - example (dict): A dictionary containing an 'answer_highlighted_context' key.

    Returns:
        - dict: A dictionary with a single key 'instruction_prompt', where the value
        is the instruction prompt string.

    Example:
    >>> example = {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over
    ...            the lazy dog.'}
    >>> prepare_instruction_dataset(example)
    {'instruction_prompt': 'Generate a question whose answer is highlighted by <h> from\
    the context delimited by the triple backticks.\n    context:\n    ```\n    The quick\
    brown <h> fox <h> jumps over the lazy dog.\n    ```\n    '}
    """

    answer_highlighted_context = example['answer_highlighted_context']

    instruction_prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {answer_highlighted_context}
    ```
    """

    return {'instruction_prompt': instruction_prompt}

In [16]:
instruction_squad = answer_highlighted_squad.map(prepare_instruction_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [17]:
example = instruction_squad['train'][900]

for k, v in example.items():
    print(f'{k}: {v}\n')

id: 56becc903aeaaa14008c94a0

title: Beyoncé

context: Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. "Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards. Beyoncé 

## 2.5 Tokenize the Dataset

In [18]:
model_ckpt = 't5-small'
tokenizer = T5TokenizerFast.from_pretrained(model_ckpt)
model = T5ForConditionalGeneration.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [19]:
def tokenize_dataset(batch):
    """
    Tokenize a batch of data for a model.

    Parameters:
        - batch (dict): A dictionary containing 'instruction_prompt' and 'question' keys.

    Returns:
        - dict: A dictionary containing the tokenized inputs and labels, ready to be used as input to a model.
    """

    model_inputs = tokenizer(batch['instruction_prompt'], max_length=512, truncation=True, padding=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch['question'], max_length=128, truncation=True, padding=True)

    # Make sure that the labels have the same shape as the inputs
    labels['input_ids'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']
    ]
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [20]:
tokenized_squad = instruction_squad.map(tokenize_dataset, batched=True, remove_columns=squad['train'].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['answer_highlighted_context', 'instruction_prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['answer_highlighted_context', 'instruction_prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10570
    })
})

# 3.0 Model Training

## 3.1 Configure HyperParameters

In [24]:
training_args = TrainingArguments(
    output_dir='t5-small-squad-qg-v2',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    eval_strategy='steps',
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    warmup_steps=1000,  # Increased warmup steps to help learning rate increase more smoothly
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False
)

## 3.2 Define `DataCollatorForSeq2Seq`

In [25]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## 3.3 Define the `Trainer` API

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad['train'],
    eval_dataset=tokenized_squad['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


## 3.4 Start Fine-Tuning

In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkvchambit[0m ([33mkvchambit-universidad-nacional-de-san-agustin-de-arequipa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,2.7444,2.017537
1000,2.1363,1.84559
1500,1.9863,1.782036
2000,1.9272,1.751432
2500,1.877,1.72736
3000,1.8486,1.713918
3500,1.8222,1.699327
4000,1.8081,1.691688
4500,1.7756,1.68357
5000,1.7684,1.678957


## 3.5 Pushing the Model to the Hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
trainer.push_to_hub('Commit Successfully!')

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mohammedaly2222002/t5-small-squad-qg-v2/commit/5170a2eb523f239bab80f62eb542713d4b3ba4fe', commit_message='Commit Successfully!', commit_description='', oid='5170a2eb523f239bab80f62eb542713d4b3ba4fe', pr_url=None, pr_revision=None, pr_num=None)

# 4.0 Using the Model in Inference

In [None]:
def highlight_answer(context, answer):
    """
    Highlight the answer in the given context.

    Parameters:
        - context (str): The context in which the answer is found.
        - answer (str): The answer to be highlighted.

    Returns:
        - str: The context with the answer highlighted by '<h>' tags.

    Example:
    >>> context = 'The quick brown fox jumps over the lazy dog.'
    >>> answer = 'fox'
    >>> highlight_answer(context, answer)
    'The quick brown <h> fox <h> jumps over the lazy dog.'
    """

    context_splits = context.split(answer)

    text = ""
    for split in context_splits:
        text += split
        text += ' <h> '
        text += answer
        text += ' <h> '
        text += split

    return text


def prepare_instruction(answer_highlighted_context):
    """
    Prepare an instruction prompt for generating a question.

    Parameters:
        - answer_highlighted_context (str): The context with the answer highlighted by '<h>' tags.

    Returns:
        - str: The instruction prompt string.

    Example:
    >>> answer_highlighted_context = 'The quick brown <h> fox <h> jumps over the lazy dog.'
    >>> prepare_instruction(answer_highlighted_context)
    'Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.\\n    context:\\n    ```\\n    The quick brown <h> fox <h> jumps over the lazy dog.\\n    ```\\n    '
    """

    instruction_prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {answer_highlighted_context}
    ```
    """

    return instruction_prompt

In [None]:
from transformers import pipeline

pipe = pipeline('text2text-generation', model='mohammedaly2222002/t5-small-squad-qg-v2', device_map='auto')

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [None]:
context = """During the 2011–12 season, he set the La Liga and European records\
for most goals scored in a single season, while establishing himself as Barcelona's\
all-time top scorer. The following two seasons, Messi finished second for the Ballon\
d'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his best\
form during the 2014–15 campaign, becoming the all-time top scorer in La Liga and \
leading Barcelona to a historic second treble, after which he was awarded a fifth \
Ballon d'Or in 2015. Messi assumed captaincy of Barcelona in 2018, and won a record \
sixth Ballon d'Or in 2019. Out of contract, he signed for French club Paris Saint-Germain\
in August 2021, spending two seasons at the club and winning Ligue 1 twice. Messi \
joined American club Inter Miami in July 2023, winning the Leagues Cup in August of that year.
"""

answer_highlighted_context = highlight_answer(context=context, answer='2015')
prompt = prepare_instruction(answer_highlighted_context)

print(prompt)

Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    During the 2011–12 season, he set the La Liga and European recordsfor most goals scored in a single season, while establishing himself as Barcelona'sall-time top scorer. The following two seasons, Messi finished second for the Ballond'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his bestform during the 2014–15 campaign, becoming the all-time top scorer in La Liga and leading Barcelona to a historic second treble, after which he was awarded a fifth Ballon d'Or in  <h> 2015 <h> During the 2011–12 season, he set the La Liga and European recordsfor most goals scored in a single season, while establishing himself as Barcelona'sall-time top scorer. The following two seasons, Messi finished second for the Ballond'Or behind Cristiano Ronaldo (his perceived career rival), before regaining his bestform during the 2014–15 campaign, becom

In [None]:
outputs = pipe(prompt, num_return_sequences=3, num_beams=5, num_beam_groups=5, diversity_penalty=1.0)
for output in outputs:
    print(output['generated_text'])



In what year was Messi awarded a fifth Ballon d'Or?
When was he awarded a fifth Ballon d'Or?
What year was Messi awarded a fifth Ballon d'Or?
