# Generate Data for Human Evaluation

This notebook serves as a guide for saving the detokenized output of the various models (including the initial attempt at fine-tuning `google/flan-t5`).

_Additional requirements: please see the TODOs._

In [None]:
# TODO: Set this to the local path of the 'preprocessed_sciq-qg-256-new' directory, obtained by unzipping
# 'preprocessed_sciq.zip' (hosted on the repo at "data/archive/preprocessed_sciq.zip").
# The value is used below as the dataset location for the configurations that load their data with `load_from_disk`.
SCIQ_PREPROCESSED_FLAN_T5 = None

In [None]:
## Import dependencies.
# OS tools.
import os
from pathlib import Path
# Data loaders.
from torch.utils.data import DataLoader
from datasets import load_from_disk, load_dataset
# Model loaders.
from transformers import AutoModelForSeq2SeqLM

## Helper class `EvaluationConfig`

In [None]:
# Import dependencies.
import copy
import pandas as pd
from datasets import load_from_disk
from transformers import AutoTokenizer
import torch

### Class Definitions

In [None]:
# Contains the model names and arguments required for their evaluation
class EvaluationConfig:
    """Settings needed to evaluate one model on its evaluation split.

    Attributes:
        model_name: Identifier of the model to load.
        tokenizer_name: Identifier of the tokenizer to load.
        input_length: Maximum number of input tokens.
        output_length: Maximum number of output tokens.
        data_loader: Callable used to load the dataset (`load_from_disk` or `load_dataset`).
        dataset_name: Argument handed to `data_loader`.
        split: Name of the dataset split to evaluate on.
        remove_columns: Columns to drop while mapping, or None.
        preprocess: Preprocessing function for this model.
        version: "alpha" when `data_loader` is `load_from_disk` and `remove_columns` is given,
            "beta" when `data_loader` is `load_dataset`.

    Raises:
        ValueError: If the loader / `remove_columns` combination matches neither version.
    """

    def __init__(self, model_name, tokenizer_name, input_length, output_length, data_loader, dataset_name, split, remove_columns, preprocess):
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.input_length = input_length
        self.output_length = output_length
        self.data_loader = data_loader
        self.dataset_name = dataset_name
        self.split = split
        self.remove_columns = remove_columns
        self.preprocess = preprocess
        # Always assigned (or an error is raised), so `version` can never be a missing attribute later on.
        self.version = self._infer_version()

    def _infer_version(self):
        """Return the pipeline version implied by the data loader and column removals."""
        if self.data_loader is load_from_disk and self.remove_columns is not None:
            return "alpha"
        if self.data_loader is load_dataset:
            return "beta"
        raise ValueError(
            "Unsupported configuration for model {!r}: expected data_loader=load_from_disk together with "
            "remove_columns, or data_loader=load_dataset.".format(self.model_name)
        )

    def __str__(self):
        """Return one 'Attribute name: repr(value)' line per attribute, with aligned names."""
        attributes = vars(self)
        # default=0 keeps str() usable on an instance that has no attributes yet.
        width = max((len(attr_name) for attr_name in attributes), default=0)
        return '\n'.join(
            f"{attr_name.replace('_', ' ').capitalize().ljust(width)}: {value!r}"
            for attr_name, value in attributes.items()
        )

### Function Definitions

In [None]:
# Returns a list of EvaluationConfig objects generated from the passed lists, element-wise.
def create_evaluation_configs(model_names, tokenizer_names, input_lengths, output_lengths, data_loaders, dataset_names, splits, columnar_removals, preprocessing_functions):
    """Build one EvaluationConfig per position of the parallel argument lists.

    The i-th element of every list describes the i-th model; the lists are passed to
    EvaluationConfig in the order of this function's parameters.

    Returns:
        A list of EvaluationConfig objects, one per entry of `model_names`.

    Raises:
        ValueError: If the lists do not all have the same length.
    """
    columns = (model_names, tokenizer_names, input_lengths, output_lengths, data_loaders,
               dataset_names, splits, columnar_removals, preprocessing_functions)

    # Fail early with a clear message instead of an IndexError (a list is too short)
    # or silently ignored entries (a list is too long).
    lengths = [len(column) for column in columns]
    if any(length != lengths[0] for length in lengths):
        raise ValueError(f"All configuration lists must have the same length; got lengths {lengths}.")

    return [EvaluationConfig(*settings) for settings in zip(*columns)]

In [None]:
# Adapted from Rizki's notebook "QuestionGeneration.ipynb" -- current version hosted on the repo at "code/modular_approach/archive/fine_tuning_flan_t5/question_answer.ipynb".
def preprocess_q_gen_v_alpha(data_point, tokenizer, max_length_src, max_length_target):
    """Tokenize one record for question generation.

    The model input is "<correct_answer><sep><context>" and the target is the question.

    Args:
        data_point: Mapping with the keys 'correct_answer', 'context' and 'question'.
        tokenizer: Callable tokenizer; called with `return_tensors="pt"`, so it is expected to
            return 'input_ids' and 'attention_mask' tensors with a leading batch dimension of 1.
        max_length_src: Length the input is padded / truncated to.
        max_length_target: Length the target is padded / truncated to.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels'; padding positions in
        'labels' are replaced by -100.
    """
    source_text = "{}<sep>{}".format(data_point['correct_answer'], data_point['context'])
    question = data_point['question']

    encoding_options = dict(padding='max_length', truncation=True, return_tensors="pt")
    tokenized_inputs = tokenizer(source_text, max_length=max_length_src, **encoding_options)
    tokenized_targets = tokenizer(question, max_length=max_length_target, **encoding_options)

    # squeeze(0) drops only the batch dimension, so a sequence of length 1 stays one-dimensional.
    input_ids = tokenized_inputs['input_ids'].squeeze(0)
    input_attention = tokenized_inputs['attention_mask'].squeeze(0)

    # Use the tokenizer's own padding id; fall back to 0 when it does not define one.
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is None:
        pad_token_id = 0

    # Work on a copy so the tokenizer output itself is not modified.
    labels = tokenized_targets['input_ids'].squeeze(0).clone()
    labels[labels == pad_token_id] = -100

    return {
        'input_ids': input_ids,
        'attention_mask': input_attention,
        'labels': labels,
    }

In [None]:
# Adapted from Rizki's notebook "DistractorGeneration.ipynb" -- current version hosted on the repo at "code/modular_approach/archive/fine_tuning_flan_t5/distractors.ipynb".
def preprocess_d_gen_v_alpha(data_point, tokenizer, max_length_src, max_length_target):
    """Tokenize one record for distractor generation.

    The model input is a prompt built from the question, correct answer and context;
    the target lists the three distractors.

    Args:
        data_point: Mapping with the keys 'question', 'correct_answer', 'context',
            'distractor1', 'distractor2' and 'distractor3'.
        tokenizer: Callable tokenizer; called with `return_tensors="pt"`, so it is expected to
            return 'input_ids' and 'attention_mask' tensors with a leading batch dimension of 1.
        max_length_src: Length the input is padded / truncated to.
        max_length_target: Length the target is padded / truncated to.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels'. The labels are the raw
        target token ids; padding positions are left as they are.
    """
    prompt = "Generate distractors for question: {}, answer: {}, context: {}".format(
        data_point['question'], data_point['correct_answer'], data_point['context'])
    target_text = "Distractors: {}, {}, {}".format(
        data_point['distractor1'], data_point['distractor2'], data_point['distractor3'])

    encoding_options = dict(padding='max_length', truncation=True, return_tensors="pt")
    tokenized_inputs = tokenizer(prompt, max_length=max_length_src, **encoding_options)
    tokenized_targets = tokenizer(target_text, max_length=max_length_target, **encoding_options)

    # squeeze(0) drops only the batch dimension, so a sequence of length 1 stays one-dimensional.
    return {
        'input_ids': tokenized_inputs['input_ids'].squeeze(0),
        'attention_mask': tokenized_inputs['attention_mask'].squeeze(0),
        'labels': tokenized_targets['input_ids'].squeeze(0),
    }

In [None]:
# Adapted from Rutger's "Single_QA_model_Rutger.py" -- current version hosted on the repo at "code/unified_approach/preprocessing_and_training/".
def preprocess_version_beta(dataset, tokenizer, max_length, batch_size):
    """Merge, filter and tokenize a dataset for the unified question/answer/distractor models.

    Args:
        dataset: Dataset providing the columns 'support', 'question', 'correct_answer',
            'distractor1', 'distractor2' and 'distractor3'.
        tokenizer: Callable tokenizer applied to batches of merged strings.
        max_length: Length both the input and the target are padded / truncated to.
        batch_size: Batch size used for the tokenizing `map` call.

    Returns:
        The filtered dataset after mapping the tokenizing function over it
        (adding 'input_ids' and 'labels').
    """

    def tokenize_func(examples, max_length=max_length):
        shared_options = dict(return_tensors="pt", max_length=max_length, truncation=True, padding='max_length')
        encoded_inputs = tokenizer(examples['merged_column_input'], **shared_options)
        encoded_targets = tokenizer(examples['merged_column_output'], **shared_options)

        # No attention mask is returned; only the token ids of the input and of the target (the labels).
        return {
            'input_ids': encoded_inputs['input_ids'],
            'labels': encoded_targets['input_ids'],
        }

    def has_support(example):
        # Keep only examples with non-empty supporting evidence.
        return not (example['support'] is None or example['support'] == "")

    def is_concise(example):
        # Keep only questions shorter than 171 characters.
        return len(example['question']) < 171

    def has_no_blank(example):
        # Drop 'fill-in-the-blank' questions: any run of three or more underscores contains '___'.
        return '___' not in example['question']

    # Remember every row's original position before anything is filtered out.
    dataset = dataset.add_column('index', range(len(dataset)))

    # Each value is prefixed with its field label; joining with ' ' adds a second space after the label.
    merged_column_input = [' '.join(("Support: ", support)) for support in dataset["support"]]

    field_rows = zip(dataset["question"], dataset["correct_answer"],
                     dataset["distractor1"], dataset["distractor2"], dataset["distractor3"])
    merged_column_output = [
        ' '.join(("Question: ", question, "Answer: ", answer,
                  "Distractor1: ", first, "Distractor2: ", second, "Distractor3: ", third))
        for question, answer, first, second, third in field_rows
    ]

    dataset = dataset.add_column('merged_column_input', merged_column_input)
    dataset = dataset.add_column('merged_column_output', merged_column_output)

    filtered_dataset = dataset.filter(has_support)
    filtered_dataset = filtered_dataset.filter(is_concise)
    filtered_dataset = filtered_dataset.filter(has_no_blank)

    # Report how many examples survived the filters.
    print(f"Number of examples with supporting evidence in {dataset}:", len(filtered_dataset))

    return filtered_dataset.map(tokenize_func, batched=True, batch_size=batch_size)

In [None]:
# The lists below are parallel: the i-th entry of each list describes the i-th model.

# HuggingFace model names.
model_names = [
    "rizkiduwinanto/question-generation",
    "rizkiduwinanto/distractor-generation",
    "rizkiduwinanto/final-bart-question-generation",
    "rizkiduwinanto/final-bart-distractor-generation",
    "b-b-brouwer/CL_base",
    "b-b-brouwer/CL_large",
]

# Tokenizers corresponding to items in model_names.
tokenizer_names = [
    "google/flan-t5-small",
    "google/flan-t5-small",
    "facebook/bart-base",
    "facebook/bart-base",
    "facebook/bart-base",
    "facebook/bart-large",
]

# Corresponding input length (number of tokens).
input_lengths = [
    256,
    512,
    600,
    600,
    600,
    600,
]

# Corresponding output length (number of tokens).
output_lengths = [
    256,
    32,
    1024,
    1024,
    1024,
    1024,
]

# Corresponding data loader generators.
data_loaders = [
    load_from_disk,
    load_from_disk,
    load_dataset,
    load_dataset,
    load_dataset,
    load_dataset,
]

# Corresponding input argument for data loaders.
dataset_names = [
    SCIQ_PREPROCESSED_FLAN_T5,
    SCIQ_PREPROCESSED_FLAN_T5,
    "allenai/sciq",
    "allenai/sciq",
    "allenai/sciq",
    "allenai/sciq",
]

# Corresponding splits used for evaluation.
# (the BART-based models were validated on the test split, so we use the validation split for evaluation.)
splits = [
    "test",
    "test",
    "validation",
    "validation",
    "validation",
    "validation",
]

# Columns to remove in dataset.map(...)
columnar_removals = [
    ['question', 'distractor1', 'distractor2', 'distractor3', 'correct_answer', 'support', 'context'],
    ['question', 'distractor1', 'distractor2', 'distractor3', 'correct_answer', 'support', 'context'],
    # 'None' indicates that the preprocessing function itself handles this filtering.
    None,
    None,
    None,
    None,
]

# Corresponding preprocessing functions.
preprocessing_functions = [
    preprocess_q_gen_v_alpha,
    preprocess_d_gen_v_alpha,
    preprocess_version_beta,
    preprocess_version_beta,
    preprocess_version_beta,
    preprocess_version_beta,
]

## Inference parameters

In [None]:
# Combine the parallel configuration lists into one EvaluationConfig per model.
eval_configs = create_evaluation_configs(
    model_names=model_names,
    tokenizer_names=tokenizer_names,
    input_lengths=input_lengths,
    output_lengths=output_lengths,
    data_loaders=data_loaders,
    dataset_names=dataset_names,
    splits=splits,
    columnar_removals=columnar_removals,
    preprocessing_functions=preprocessing_functions,
)

In [None]:
# Print every configuration, each followed by a separator line.
for config in eval_configs:
    print(config, "---------", sep="\n")

## Inference and pickling

In [None]:
# TODO: Adjust batch size according to your system capacity.
# Set this to a positive integer before running the cells below: it is the default batch size
# of `generate_data_for_human_evaluation`, which uses it as the step of its batching loop.
BATCH_SIZE = None

### Function Definitions

In [None]:
from tqdm import tqdm

# Decode the model inferences on their respective evaluation sets.
def generate_data_for_human_evaluation(c, batch_size=None):
    """Run one model over its evaluation split and return the decoded inputs and outputs.

    Args:
        c: EvaluationConfig describing the model, tokenizer, dataset and preprocessing.
        batch_size: Number of examples per generation batch. When None, the module-level
            BATCH_SIZE is looked up at call time.

    Returns:
        Dict with the keys 'sciq_index' (original row indices when the tokenized dataset
        has an 'index' column, otherwise None), '<model_name>_input' and '<model_name>_output'
        (lists of decoded strings).

    Raises:
        ValueError: If no batch size is available, or if `c.version` is neither "alpha" nor "beta".
    """
    # Resolve the default at call time so a BATCH_SIZE changed after this definition is honoured.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size is None:
        raise ValueError("No batch size given: set BATCH_SIZE or pass batch_size explicitly.")

    # Validate before the (expensive) model download / load.
    version = getattr(c, "version", None)
    if version not in ("alpha", "beta"):
        raise ValueError(f"Unsupported configuration version {version!r} for model {c.model_name!r}.")

    model = AutoModelForSeq2SeqLM.from_pretrained(c.model_name)
    if torch.cuda.is_available():
        model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(c.tokenizer_name)

    if version == "alpha":
        # Per-example preprocessing; the raw text columns are removed while mapping.
        def preprocess(data_point):
            return c.preprocess(data_point, tokenizer, c.input_length, c.output_length)

        dataset = c.data_loader(c.dataset_name)[c.split]
        dataset = dataset.map(preprocess, remove_columns=c.remove_columns)
    else:
        # The beta preprocessing function filters and tokenizes the whole dataset itself.
        dataset = c.data_loader(c.dataset_name, split=c.split)
        dataset = c.preprocess(dataset, tokenizer, c.input_length, batch_size=batch_size)

    # original_index is only applicable to the data preprocessed from the huggingface dataset directly.
    # It is read before switching to the torch format so that it is not returned as a tensor.
    original_index = list(dataset['index']) if 'index' in dataset.column_names else None

    dataset.set_format('torch')

    # Wraps tokenizer.decode.
    def decode(sequences):
        return [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in sequences]

    # Decoded inputs and outputs, in dataset order.
    inputs, outputs = [], []

    # Iterate over the dataset in batches.
    for start in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[start:start + batch_size]
        input_ids = batch['input_ids'].to(model.device)
        # The mask must live on the same device as the input ids; only the alpha data provides one.
        attention_mask = batch['attention_mask'].to(model.device) if version == "alpha" else None

        # Generate predictions in batches.
        output_sequences = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=c.output_length)

        # Decode the model inputs and the generated sequences.
        inputs.extend(decode(input_ids))
        outputs.extend(decode(output_sequences))

    print('\n\n')

    return {
        'sciq_index': original_index,
        f"{c.model_name}_input": inputs,
        f"{c.model_name}_output": outputs,
    }

In [None]:
# Make pandas DataFrames for each model, and pickle them.
def frame_results(eval_configs=eval_configs):
    """Generate the evaluation data for every configuration and pickle it as a DataFrame.

    One file named '<model name with "/" replaced by "_">_evaluation.pkl' is written per
    configuration into the directory "jars" (created if missing) under the current working directory.
    """
    output_dir = Path("jars")
    output_dir.mkdir(exist_ok=True)
    separator = '-' * 100

    for config in eval_configs:
        print(config)
        print(separator)

        generated = generate_data_for_human_evaluation(config)
        print(generated)
        frame = pd.DataFrame(generated)

        # File name derived from the model name; "/" cannot be part of a file name.
        safe_model_name = config.model_name.replace("/", "_")
        frame.to_pickle(output_dir / (safe_model_name + "_evaluation.pkl"))

## Main code

In [None]:
# Run this cell to save the model output DataFrames for evaluation as .pkl files in the (new) directory "jars", relative to the current working directory.
frame_results()