In [1]:
# Install the packages this notebook needs via the %pip magic (-q reduces pip's output).
# NOTE(review): versions are unpinned, so a fresh run may pick up different releases — consider pinning.
%pip install -q accelerate datasets evaluate numpy transformers torch sentencepiece wandb

## Imports

In [2]:
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
import numpy as np
import random
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5Tokenizer,
    T5ForConditionalGeneration
)
from typing import Any, Dict
import wandb

## Load Dataset

In [3]:
# Global constants
SEED = 42 # Random seed for reproducibility
# Seed Python's `random` module; the answer sampling in the extraction step
# uses `random.choice`, so this makes the sampled answers repeatable.
random.seed(SEED)

# Load the Qasper dataset under the identifier "allenai/qasper".
dataset = load_dataset("allenai/qasper")



  0%|          | 0/3 [00:00<?, ?it/s]

## Preprocessing Step 1 - Extraction of relevant texts

In [4]:
def _answer_text(annotation):
    """Return the answer text of a single annotation, or '' if it carries none.

    Preference order: free-form answer, then the extractive spans joined by a
    space, then a yes/no verdict rendered as "Yes"/"No".
    """
    if annotation['free_form_answer']:
        return annotation['free_form_answer']
    if annotation['extractive_spans']:
        return ' '.join(annotation['extractive_spans'])

    # Assumes yes/no answers are stored in a boolean `yes_no` field (None when
    # not applicable) — TODO confirm against the dataset card. `.get` keeps
    # this safe if the field is absent.
    yes_no = annotation.get('yes_no')
    if yes_no is not None:
        return 'Yes' if yes_no else 'No'
    return ''


def get_formatted_dataset_from_split(dataset_split):
    """Flatten one Qasper split into (abstract, question, answer) rows.

    For every question of every article, one answer is sampled at random from
    the annotators' answers. A question is skipped when any annotator marked it
    unanswerable or when no annotation yields a non-empty answer text, so the
    model is never trained on empty targets.

    Args:
        dataset_split: Iterable of article records, each with an 'abstract'
            string and a 'qas' dict holding parallel 'question' and 'answers'
            lists.

    Returns:
        A `Dataset` with string columns 'abstract', 'question' and 'answer',
        formatted for torch.
    """
    abstracts = []
    questions = []
    answers = []

    # Iterate over all articles
    for article in dataset_split:
        qa = article['qas']

        # Iterate over all questions and their annotation groups
        for question, annotations in zip(qa['question'], qa['answers']):
            annotation_list = annotations['answer']

            # Skip unanswerable questions so they do not pollute the training data
            if any(annotation['unanswerable'] for annotation in annotation_list):
                continue

            # Collect all usable answer candidates; empty texts are dropped so
            # they can never be sampled as the target.
            answer_candidates = [
                text
                for text in (_answer_text(annotation) for annotation in annotation_list)
                if text.strip()
            ]
            if not answer_candidates:
                continue

            abstracts.append(article['abstract'])
            questions.append(question)
            answers.append(random.choice(answer_candidates)) # Randomly sample from available answers

    # Sanity check: the three columns must stay aligned
    assert len(abstracts) == len(questions) == len(answers)

    return Dataset.from_dict({
        'abstract': abstracts,
        'question': questions,
        'answer': answers
    }).with_format("torch")

In [5]:
# Task 4 - Initial Preprocessing of qasper dataset
# Build the flat (abstract, question, answer) datasets. The generator is
# consumed in order, so 'train' is processed before 'test' and the random
# answer sampling sees the splits in the same sequence as before.
train_dataset, test_dataset = (
    get_formatted_dataset_from_split(dataset[split_name])
    for split_name in ('train', 'test')
)

# Print a summary (features and row count) of each formatted split
for formatted_split in (train_dataset, test_dataset):
    print(formatted_split)

Dataset({
    features: ['abstract', 'question', 'answer'],
    num_rows: 2315
})
Dataset({
    features: ['abstract', 'question', 'answer'],
    num_rows: 1208
})


In [6]:
# Task 5 - Get validation set from train
# Hold out 10% of the training rows as validation data; the split is seeded
# so it is the same on every run.
train_val_dataset = train_dataset.train_test_split(seed=SEED, test_size=0.1)

# NOTE: this rebinds `dataset` (previously the raw Qasper download) to the
# processed splits, so the extraction cell above cannot be re-run afterwards
# without reloading the raw data first.
dataset = DatasetDict(
    train=train_val_dataset['train'],
    test=test_dataset,
    val=train_val_dataset['test'],
)

dataset

DatasetDict({
    train: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 2083
    })
    test: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 1208
    })
    val: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 232
    })
})

## Preprocessing Step 2 - Tokenization

In [7]:
# Task 6 - Preprocessing Function
tokenizer = T5Tokenizer.from_pretrained("google/t5-efficient-tiny")

def preprocess_function(
    sample: Dict[str, Any],
    max_input_length: int = 16,
    max_target_length: int = 8,
) -> Dict[str, Any]:
    """Tokenize one (question, abstract, answer) sample for seq2seq training.

    The model input is the prompt "question: <question> context: <abstract>";
    the target is the answer text. Both are truncated and padded to a fixed
    length.

    Args:
        sample: Record with string fields 'question', 'abstract' and 'answer'.
        max_input_length: Token budget for the prompt (default 16, as before).
        max_target_length: Token budget for the answer (default 8, as before).

    Returns:
        Dict with 'feature' (the tokenizer output for the prompt) and 'labels'
        (the token ids of the answer).
    """
    # Plain single braces: the previous triple braces left literal '{' and '}'
    # characters around the question and the abstract in the model input.
    combined_qa = f"question: {sample['question']} context: {sample['abstract']}"

    # NOTE(review): the default budgets are very small for a question plus an
    # abstract, so most of the context is truncated — consider raising them.
    return {'feature': tokenizer(
                            combined_qa,
                            truncation=True,
                            padding="max_length",
                            max_length=max_input_length
                        ),
            'labels':   tokenizer(
                            sample['answer'],
                            truncation=True,
                            padding="max_length",
                            max_length=max_target_length).input_ids
            }

In [8]:
# Task 7 - Apply preprocessing function using map
# One pipeline: tokenize every split while dropping the raw text columns,
# flatten so the nested tokenizer output ('feature.*') moves to the first
# level, then rename those columns so the trainer knows what to do with them.
encoded_ds = (
    dataset
    .map(preprocess_function, remove_columns=['abstract', 'question', 'answer'])
    .flatten()
    .rename_columns({
        'feature.attention_mask': 'attention_mask',
        'feature.input_ids': 'input_ids',
    })
)

Map:   0%|          | 0/2083 [00:00<?, ? examples/s]

Map:   0%|          | 0/1208 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

## Load Model

In [9]:
# Task 8 - load google/t5-efficient-tiny model with pre-trained weights
# No explicit `.to("cuda")`: the hard-coded device raised on machines without a
# GPU, and the trainer moves the model to its configured device itself.
model = T5ForConditionalGeneration.from_pretrained("google/t5-efficient-tiny")

## Define TrainingArguments

In [10]:
# Task 9 - Define Seq2SeqTrainingArguments with learning rate scheduling and weight decay
num_epochs = 5
batch_size = 1
lr = 3e-4
weight_decay = 0.01
gradient_accumulation_steps = 2
lr_scheduler_type = "linear" # ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup', 'inverse_sqrt', 'reduce_lr_on_plateau']

# Define Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir = './checkpoints/',
    do_train=True,
    do_eval=True,
    seed=SEED,  # reuse the notebook-wide seed for the training run

    # NOTE(review): T5 checkpoints are reported to be numerically unstable in
    # fp16 — confirm the training loss stays finite, otherwise disable this.
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    lr_scheduler_type=lr_scheduler_type,
    weight_decay=weight_decay,

    # Evaluate, log and checkpoint once per epoch; save and evaluation
    # strategies match, as load_best_model_at_end requires.
    save_strategy="epoch",
    logging_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,

    # Generate token sequences during evaluation so that the metric function
    # receives token ids it can decode instead of raw logits.
    predict_with_generate=True,

    report_to="wandb"
)


## Evaluation Metric - BLEU

In [11]:
# BLEU score as evaluation metric
bleu = evaluate.load('bleu')

def compute_metrics(eval_pred):
    """Compute the corpus-level BLEU score of the predictions against the labels.

    Args:
        eval_pred: Pair `(predictions, labels)` as handed over by the trainer.
            `predictions` may be generated token ids, raw logits (vocabulary on
            the last axis), or a tuple whose first element is one of those;
            `labels` are the reference token ids.

    Returns:
        Dict `{'bleu': score}`; the trainer requires a dict of named metrics.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs next to the predictions; keep the first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits (batch, sequence, vocabulary) -> greedy token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks positions to ignore and is not a valid token id; map it to the
    # pad token so decoding works and the padding is stripped again below.
    pad_token_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_predictions = [
        tokenizer.decode(token_ids, skip_special_tokens=True).strip()
        for token_ids in predictions
    ]
    # One single-element reference list per prediction.
    decoded_references = [
        [tokenizer.decode(token_ids, skip_special_tokens=True).strip()]
        for token_ids in labels
    ]

    if not decoded_predictions:
        return {'bleu': 0.0}

    try:
        result = bleu.compute(
            predictions=decoded_predictions,
            references=decoded_references,
        )
    except ZeroDivisionError:
        # Presumably raised when every prediction or every reference is empty
        # after decoding — report the worst score instead of aborting evaluation.
        return {'bleu': 0.0}

    # `bleu.compute` returns a dict of statistics; only the score is reported.
    return {'bleu': result['bleu']}


## Define Trainer

In [12]:
# Task 10 - Trainer
# Evaluate on the held-out validation split during training. Using the test
# split here would let checkpoint selection (load_best_model_at_end) peek at
# the test data; the test split stays reserved for the final evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_ds['train'],
    eval_dataset=encoded_ds['val'],
    compute_metrics=compute_metrics
)

## Train Model

In [None]:
# Register the experiment at Weights & Biases  https://wandb.ai/site
wandb_project_name = "nlp-assignment5-task2"
run = wandb.init(project=wandb_project_name)

# Always close the W&B run, even if training raises or is interrupted;
# otherwise the run stays open in the kernel and leaks into later cells.
try:
    trainer.train()
finally:
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33myjojo17[0m ([33mnlp-assign5[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


## Inference

In [None]:
# Task 11

# TODO: Choose a sample of the test set