In [88]:
!pip install -q transformers datasets nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[31mERROR: Could not find a version that satisfies the requirement bleurt (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bleurt[0m[31m
[0m

In [89]:
!pip install -q git+https://github.com/google-research/bleurt.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[31mERROR: Could not find a version that satisfies the requirement tensorflow (from bleurt) (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow[0m[31m
[0m

In [42]:
!pip install -q rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [58]:
from typing import Tuple, Mapping
import pandas as pd
import torch
from transformers import AutoTokenizer
import datasets
from tqdm import tqdm
import nltk

class QGDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes (text, question) rows for seq2seq training.

    Every row of ``data`` must provide a ``text`` field (the model input) and a
    ``question`` field (the target). Both are tokenized to exactly ``max_length``
    ids; in the target, ids equal to the tokenizer's pad id are replaced by
    ``pad_mask_id``.
    """

    def __init__(
        self,
        data: datasets.Dataset,
        max_length: int,
        pad_mask_id: int,
        tokenizer: AutoTokenizer
    ) -> None:
        """Store the rows as a DataFrame together with the tokenization settings.

        Args:
            data: rows with ``text`` and ``question`` fields.
            max_length: length every encoded sequence is padded/truncated to.
            pad_mask_id: value written over padding positions in the labels.
            tokenizer: callable tokenizer exposing ``pad_token_id``.
        """
        self.data = pd.DataFrame(data)
        self.max_length = max_length
        self.pad_mask_id = pad_mask_id
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """Number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, index: int) -> Mapping[str, torch.Tensor]:
        """Return ``input_ids``, ``attention_mask`` and masked ``labels`` for one row."""
        # Positional lookup: a dataset index is a position, not a DataFrame label.
        row = self.data.iloc[index]
        input_ids, attention_mask = self._encode_text(row["text"])
        labels, _ = self._encode_text(row["question"])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": self._mask_label_padding(labels)
        }

    def _encode_text(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Tokenize ``text`` and return ``(input_ids, attention_mask)`` without the batch axis."""
        encoded_text = self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis. A bare squeeze() would also collapse
        # the sequence axis when max_length == 1 and yield a 0-d tensor.
        return (
            encoded_text["input_ids"].squeeze(0),
            encoded_text["attention_mask"].squeeze(0)
        )

    def _mask_label_padding(self, labels: torch.Tensor) -> torch.Tensor:
        """Return a copy of ``labels`` with pad-token ids replaced by ``pad_mask_id``."""
        # masked_fill builds a new tensor, so the caller's tensor is left untouched.
        return labels.masked_fill(
            labels == self.tokenizer.pad_token_id, self.pad_mask_id
        )

In [90]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
# (On Colab a GPU is enabled via Runtime > Change runtime type > Hardware Accelerator: GPU.)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the dataset and keep handles on the two splits used for evaluation.
dataset = datasets.load_dataset("ehsanul007/IAmA-question-generator")
test_data, val_data = dataset["test"], dataset["validation"]

Found cached dataset csv (/Users/ehsanulkabir/.cache/huggingface/datasets/ehsanul007___csv/ehsanul007--IAmA-question-generator-bf87111693de3cd9/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 3/3 [00:00<00:00, 1248.92it/s]


In [29]:
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)
from typing import Any, List, Mapping, Tuple


class QuestionGenerator:
    """Generates questions from a context string with a pretrained seq2seq model.

    Candidates produced by the model can optionally be ranked by a
    ``QuestionRanker`` so that only the best ``num_questions`` are returned.
    """

    def __init__(self) -> None:
        """Load the tokenizer, the generation model (in eval mode) and the ranker."""
        QG_PRETRAINED = "ehsanul007/IAmA-question-generator"
        self.TOPIC_TOKEN = "<topic>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qg_tokenizer = AutoTokenizer.from_pretrained(
            QG_PRETRAINED, use_fast=False)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)
        self.qg_model.eval()

        self.question_ranker = QuestionRanker()

    def generate(
        self,
        qg_context: str,
        use_evaluator: bool = True,
        num_questions: int = 1
    ) -> List[Mapping[str, str]]:
        """Generate questions for ``qg_context`` and return them as dicts.

        Args:
            qg_context: the full model input string.
            use_evaluator: when True, candidates are scored by the question
                ranker and only the best ones are returned; when False every
                generated question is returned unranked.
            num_questions: how many ranked questions to return. A falsy value
                (``0``/``None``) is treated as 1.

        Returns:
            A list of ``{"question": <text>}`` dicts.
        """
        generated_questions = self.generate_questions(qg_context)

        if not use_evaluator:
            return self._get_all_questions(generated_questions)

        print("Evaluating Generated Questions...\n")
        encode_qc_pairs = self.question_ranker.encode_qc_pairs(
            generated_questions, qg_context
        )
        scores = self.question_ranker.get_scores(encode_qc_pairs)

        # A falsy num_questions falls back to the single best question.
        top_k = num_questions if num_questions else 1
        return self._get_top_k_questions(
            generated_questions, qg_context, scores, top_k
        )

    @torch.no_grad()
    def generate_questions(self, qg_input: str) -> List[str]:
        """Run the generation model on ``qg_input`` and decode its outputs.

        Returns:
            One decoded string (special tokens stripped) per sequence returned
            by ``qg_model.generate``.
        """
        encoded_input = self._encode_qg_input(qg_input)
        # The input is padded to SEQ_LENGTH, so pass the attention mask
        # explicitly to tell the model which positions are padding.
        outputs = self.qg_model.generate(
            input_ids=encoded_input["input_ids"],
            attention_mask=encoded_input["attention_mask"],
            max_length=50
        )

        return [
            self.qg_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

    def _encode_qg_input(self, qg_input: str) -> Mapping[str, torch.Tensor]:
        """Tokenize ``qg_input`` (padded/truncated to ``SEQ_LENGTH``) and move it to the device.

        The result is the tokenizer's encoding and is indexed by ``"input_ids"``
        and ``"attention_mask"`` in ``generate_questions``.
        """
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _get_top_k_questions(
        self, generated_questions: List[str], qg_context: str, scores: List[int], num_questions: int = 1
    ) -> List[Mapping[str, str]]:
        """Return the ``num_questions`` best questions, best first.

        ``scores`` holds indices into ``generated_questions`` ordered from best
        to worst. When fewer candidates than ``num_questions`` exist, all of
        them are returned instead of raising ``IndexError``. ``qg_context`` is
        accepted for signature compatibility and is not used.
        """
        # max(..., 0) keeps a negative request empty rather than slicing from the end.
        best_indices = scores[:max(num_questions, 0)]
        return [{"question": generated_questions[index]} for index in best_indices]

    def _get_all_questions(self, generated_questions: List[str]) -> List[Mapping[str, str]]:
        """Wrap every generated question in a ``{"question": ...}`` dict, without ranking."""
        return [{"question": question} for question in generated_questions]


class QuestionRanker:
    """Wrapper for a sequence-classification model that scores question/context pairs.

    Each pair is encoded with the ranker's tokenizer and scored by the model;
    the scores are used to order the candidate questions from best to worst.
    """

    def __init__(self) -> None:
        """Load the ranking tokenizer and model and put the model in eval mode."""
        # QR_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
        QR_PRETRAINED = "ehsanul007/IAmA-question-ranker"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.question_ranking_tokenizer = AutoTokenizer.from_pretrained(QR_PRETRAINED)
        self.question_ranking_model = AutoModelForSequenceClassification.from_pretrained(
            QR_PRETRAINED
        )
        self.question_ranking_model.to(self.device)
        self.question_ranking_model.eval()

    def encode_qc_pairs(self, questions: List[str], context: str) -> List[Mapping[str, torch.Tensor]]:
        """Encode every question together with the shared ``context``.

        Returns:
            One tokenizer encoding per question, moved to ``self.device``.
        """
        return [
            self._encode_qc(question, context).to(self.device)
            for question in questions
        ]

    def get_scores(self, encode_qc_pairs: List[Mapping[str, torch.Tensor]]) -> List[int]:
        """Rank encoded pairs by model score.

        Returns:
            The indices of ``encode_qc_pairs`` ordered from highest to lowest
            score (not the scores themselves). Ties keep their input order.
        """
        scores = [self._evaluate_question(pair) for pair in encode_qc_pairs]
        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    def _encode_qc(self, question: str, answer: str) -> Mapping[str, torch.Tensor]:
        """Tokenize ``question`` and ``answer`` as a text pair, padded/truncated to ``SEQ_LENGTH``.

        The encoding is returned as produced by the tokenizer; it is not moved
        to a device here.
        """
        return self.question_ranking_tokenizer(
            text=question,
            text_pair=answer,
            padding="max_length",
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )

    @torch.no_grad()
    def _evaluate_question(self, encode_qc_pair: Mapping[str, torch.Tensor]) -> float:
        """Score one encoded pair and return the score as a Python float.

        The score is element ``[0][1]`` of the model's first output, i.e. the
        value at index 1 for the single encoded pair.
        NOTE(review): presumably the logit of the "good question" class - confirm
        against the ranker's label mapping.
        """
        output = self.question_ranking_model(**encode_qc_pair)
        # .item() yields a plain float, so scores do not stay as device tensors.
        return output[0][0][1].item()



In [30]:
# Build the generator once; its constructor loads the pretrained tokenizer and
# model and creates a QuestionRanker, so the cells below reuse this `qg` instance.
qg = QuestionGenerator()

In [61]:
# Peek at one raw model input string.
# NOTE(review): `input` is only assigned in the generation cells further down (and
# shadows the Python builtin), so this cell works only after one of them has run.
input[218]

'<topic>  Writing Process <context> I am Richard Ayoade, and Richard Ellef Ayoade ( EYE-oh-AH-dee; born 23 May 1977) is a British comedian, actor, broadcaster and filmmaker. He played the role of socially awkward IT technician Maurice Moss in Channel 4 sitcom The IT Crowd (20062013), for which he won the 2014 BAFTA for Best Male Comedy Performance.'

In [None]:
# Generate questions for every test-set input.
input = test_data["text"]
ground_truth = test_data["question"]

generated_questions = []
for text in tqdm(input):
    try:
        generated_questions.append(qg.generate_questions(text))
    except Exception as exc:
        # Catch Exception only, so Ctrl-C / kernel interrupt still stops the loop.
        # The "" placeholder keeps generated_questions aligned with `input`.
        generated_questions.append("")
        print(f"Error {text}")
        print(f"  cause: {type(exc).__name__}: {exc}")

In [110]:
# testing on validation set where the model is familiar with the context but not the topic
input = val_data["text"][:1000]
ground_truth = val_data["question"][:1000]

generated_questions = []
for text in tqdm(input):
    try:
        generated_questions.append(qg.generate_questions(text))
    except Exception as exc:
        # Catch Exception only, so Ctrl-C / kernel interrupt still stops the loop.
        # The "" placeholder keeps generated_questions aligned with `input`.
        generated_questions.append("")
        print(f"Error {text}")
        print(f"  cause: {type(exc).__name__}: {exc}")

100%|██████████| 1000/1000 [11:43<00:00,  1.42it/s]


In [111]:
# An example is unusable when its input is missing or when generation failed
# (the generation loop stores "" as a placeholder for a failed example).
error_indices = [
    i for i, (text, generated) in enumerate(zip(input, generated_questions))
    if text is None or generated == ""
]

# leave out the error indices from the input, the ground truth AND the generated
# questions, so the three lists stay aligned position by position
skip = set(error_indices)
input = [text for i, text in enumerate(input) if i not in skip]
ground_truth = [question for i, question in enumerate(ground_truth) if i not in skip]
generated_questions = [generated for i, generated in enumerate(generated_questions) if i not in skip]

In [112]:
# Each successful generation is a list of decoded strings; keep its first entry.
# Entries that are already plain strings (e.g. when this cell is re-run) are kept
# as they are instead of being cut down to their first character.
# "" placeholders left by failed generations are dropped.
generated_questions = [
    question if isinstance(question, str) else question[0]
    for question in generated_questions
    if question != ""
]

In [113]:
# create a dataframe to store input, ground truth and generated questions
# Building from a dict makes pandas raise if the three lists differ in length,
# instead of silently truncating to the shortest one as zip() would.
df = pd.DataFrame({
    'input': input,
    'ground_truth': ground_truth,
    'generated_questions': generated_questions,
})

In [114]:
# Persist the results so the metric cells can reload them without re-running generation.
# Swap the commented line in to write the test-set results instead of the validation ones.
# df.to_csv('generated_questions_on_test_set.csv', index=False)
df.to_csv('generated_questions_on_val_set.csv', index=False)

In [115]:
# df = pd.read_csv('generated_questions_on_test_set.csv', dtype=str, keep_default_na=False)
# dtype=str + keep_default_na=False: every cell is read back as a string, so an empty
# question (or one spelled "NA", "null", ...) does not turn into a float NaN that
# would break the `.lower()` calls in the metric functions.
df = pd.read_csv('generated_questions_on_val_set.csv', dtype=str, keep_default_na=False)

# reload ground truth and generated questions
ground_truth = df['ground_truth'].tolist()
generated_questions = df['generated_questions'].tolist()

In [116]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

def bleu_score(ref_text, hyp_text, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1):
    """Sentence-level BLEU of ``hyp_text`` against the single reference ``ref_text``.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``.
    ``weights`` and ``smoothing_function`` are forwarded to ``sentence_bleu``,
    which is called with ``auto_reweigh=True``.
    """
    # Reference first, hypothesis second.
    reference, hypothesis = (
        nltk.word_tokenize(sentence.lower()) for sentence in (ref_text, hyp_text)
    )
    return sentence_bleu(
        [reference],
        hypothesis,
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=True,
    )


In [83]:
import nltk
# Fetch the WordNet corpus before the METEOR helper is defined.
# NOTE(review): presumably required by meteor_score - confirm.
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

def meteor(ref_text, hyp_text):
    """METEOR score of ``hyp_text`` against the single reference ``ref_text``.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``
    before being handed to ``meteor_score``.
    """
    tokenize = nltk.word_tokenize
    references = [tokenize(ref_text.lower())]
    hypothesis = tokenize(hyp_text.lower())
    return meteor_score(references, hypothesis)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ehsanulkabir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [84]:
from rouge import Rouge

def rouge(ref_text, hyp_text):
    """ROUGE-1 F1 score of ``hyp_text`` against the reference ``ref_text``.

    Returns 0.0 when either argument is not a string or is blank: there is
    nothing to overlap, and the scorer is never called on such input.
    """
    for text in (ref_text, hyp_text):
        # Non-string values (e.g. NaN read back from a CSV) and blank strings
        # cannot be scored.
        if not isinstance(text, str) or not text.strip():
            return 0.0

    # Named `scorer` so the local does not shadow this function's own name.
    scorer = Rouge()

    # get_scores takes the hypothesis first, then the reference.
    scores = scorer.get_scores(hyp_text, ref_text, avg=True)

    # keep only the unigram-overlap F1
    return scores['rouge-1']['f']

In [117]:
# Mean sentence-level BLEU-1 ... BLEU-4: for BLEU-i the first i n-gram orders
# are weighted uniformly (1/i each).
for i in range(1, 5):
    weights = (1.0 / i,) * i
    bleu = np.mean([
        bleu_score(reference, hypothesis, weights=weights)
        for reference, hypothesis in zip(ground_truth, generated_questions)
    ])
    print(f'BLEU-{i}: {bleu:.4f}')

BLEU-1: 0.1647
BLEU-2: 0.0626
BLEU-3: 0.0382
BLEU-4: 0.0282


In [118]:
# Mean METEOR over every (ground truth, generated question) pair.
meteor_scores = [
    meteor(reference, generated_questions[position])
    for position, reference in enumerate(ground_truth)
]
avg_meteor_score = sum(meteor_scores) / len(meteor_scores)
print("Average METEOR score is {:.4f}".format(avg_meteor_score))

Average METEOR score is 0.1568


In [119]:
# Mean ROUGE-1 F1 over every (ground truth, generated question) pair.
rouge_scores = [
    rouge(reference, generated_questions[position])
    for position, reference in enumerate(ground_truth)
]
avg_rouge_score = sum(rouge_scores) / len(rouge_scores)
print("Average ROUGE score is {:.4f}".format(avg_rouge_score))

Average ROUGE score is 0.1357
