In [None]:
!pip install datasets evaluate rouge_score bert_score

In [35]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import evaluate

In [4]:
squad = load_dataset('squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [5]:
def highlight_answer(example):
    """
    Highlight the answer in the context of the given example.

    Parameters:
        - example (dict): A dictionary containing 'context' and 'answers' keys.

    Returns:
        - dict: A dictionary with a single key 'answer_highlighted_context',
        where the value is the context with the answer highlighted by '<h>' tags.

    Example:
    >>> example = {'context': 'The quick brown fox jumps over the lazy dog.', 
    ...            'answers': {'text': ['fox']}}
    >>> highlight_answer(example)
    {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over the lazy dog.'}
    """
    
    context = example['context']
    answer = example['answers']['text'][0]
    context_splits = context.split(answer)

    text = ""
    
    for split in context_splits:
        text += split
        text += ' <h> '
        text += answer
        text += ' <h> '
        text += split
    
    return {'answer_highlighted_context': text}

answer_highlighted_squad = squad.map(highlight_answer)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
def prepare_instruction_dataset(example):
    """
    Prepare an instruction dataset for a given example.

    Parameters:
        - example (dict): A dictionary containing an 'answer_highlighted_context' key.

    Returns:
        - dict: A dictionary with a single key 'instruction_prompt', where the value 
        is the instruction prompt string.

    Example:
    >>> example = {'answer_highlighted_context': 'The quick brown <h> fox <h> jumps over 
    ...            the lazy dog.'}
    >>> prepare_instruction_dataset(example)
    {'instruction_prompt': 'Generate a question whose answer is highlighted by <h> from\
    the context delimited by the triple backticks.\n    context:\n    ```\n    The quick\
    brown <h> fox <h> jumps over the lazy dog.\n    ```\n    '}
    """
    
    answer_highlighted_context = example['answer_highlighted_context']
    
    instruction_prompt = f"""Generate a question whose answer is highlighted by <h> from the context delimited by the triple backticks.
    context:
    ```
    {answer_highlighted_context}
    ```
    """
    
    return {'instruction_prompt': instruction_prompt}

instruction_squad = answer_highlighted_squad.map(prepare_instruction_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [15]:
tokenizer = AutoTokenizer.from_pretrained("mohammedaly2222002/t5-small-squad-qg")
model = AutoModelForSeq2SeqLM.from_pretrained("mohammedaly2222002/t5-small-squad-qg").to("cuda")

In [25]:
def generate_question(example, model):
    inputs = tokenizer(example['instruction_prompt'], return_tensors='pt', padding=True, truncation=True, max_length=512)

    outputs = model.generate(inputs['input_ids'].to('cuda'), max_length=128, num_beams=4)

    question = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {'generated_question': question}

In [30]:
instruction_squad['validation'] = instruction_squad['validation'].map(generate_question, fn_kwargs={"model": model})

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [54]:
target_questions = squad['validation']['question']
generated_questions = instruction_squad['validation']['generated_question']

In [55]:
bleu = evaluate.load('bleu')
results = bleu.compute(predictions=generated_questions, references=target_questions)
print(results)

{'bleu': 0.16076186842709275, 'precisions': [0.46310251036696837, 0.20434204696499778, 0.12242430902796857, 0.07661796705004663], 'brevity_penalty': 0.9313753827852624, 'length_ratio': 0.9336258502834972, 'translation_length': 112135, 'reference_length': 120107}


In [56]:
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=generated_questions, references=target_questions)
print(results)

{'rouge1': 0.43145437393472863, 'rouge2': 0.22139439969184885, 'rougeL': 0.40099028960168104, 'rougeLsum': 0.4010703797959889}


In [57]:
meteor = evaluate.load('meteor')
results = meteor.compute(predictions=generated_questions, references=target_questions)
print(results)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.4024388534902158}


In [58]:
bertscore = evaluate.load('bertscore')
results = bertscore.compute(predictions=generated_questions, references=target_questions, lang="en")
print(results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8790370225906372, 0.8840280771255493, 0.8998529314994812, 0.891246497631073, 0.9014112949371338, 0.941583514213562, 0.8864713907241821, 0.8870112895965576, 0.941583514213562, 0.8708786964416504, 0.8938981890678406, 0.9051672220230103, 0.9011501669883728, 0.8906182050704956, 0.9172150492668152, 0.9116411209106445, 0.9105241298675537, 0.8906182050704956, 0.9003161191940308, 0.8728349804878235, 0.875133216381073, 0.9155318737030029, 0.8773268461227417, 0.8856765627861023, 0.9051672220230103, 0.9160614013671875, 0.8696074485778809, 0.9507763385772705, 0.8612164855003357, 0.8809319138526917, 0.9486746788024902, 0.9026265144348145, 0.8873003125190735, 0.9200379252433777, 0.8861453533172607, 0.9878382086753845, 0.8728570342063904, 0.8601921796798706, 0.9821103811264038, 0.9487829208374023, 0.8953713178634644, 0.9183803796768188, 0.9012290239334106, 0.8760125041007996, 0.9266268014907837, 0.8989025950431824, 0.9679564237594604, 0.9239066243171692, 0.8596397638320923, 0.9874985

In [59]:
print(sum(results['precision']) / len(results['precision']))

0.9122637625110521
