In [14]:
%pip install -q accelerate datasets evaluate transformers torch sentencepiece

You should consider upgrading via the '/home/nano/projects/nlp_assignment_5/task2/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [15]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    T5Tokenizer, 
    T5ForConditionalGeneration
)
import random
import torch
from typing import Any, Dict

In [16]:
# Global constants
SEED = 42  # Random seed for reproducibility

# Seed every RNG this notebook relies on, not just Python's: `random` drives the
# answer sampling during preprocessing, torch is seeded as well so that any
# torch-side randomness is repeatable across kernel restarts.
random.seed(SEED)
torch.manual_seed(SEED)

In [17]:
# Load the QASPER dataset ("allenai/qasper"); a locally cached copy is reused when present.
dataset = load_dataset("allenai/qasper")

Found cached dataset qasper (/home/nano/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
def get_formatted_dataset_from_split(dataset_split):
    """Flatten one QASPER split into (abstract, question, answer) rows.

    Every answerable question of every article yields exactly one row. When
    several annotators answered the same question, one of their answers is
    picked at random via the global ``random`` module (seed it beforehand for
    reproducible output).

    Args:
        dataset_split: Iterable of article records. Each record must provide
            ``'abstract'`` and ``'qas'``, where ``qas['question']`` and
            ``qas['answers']`` are parallel lists and every entry of
            ``qas['answers']`` holds its annotations under ``'answer'``.

    Returns:
        A ``Dataset`` with the string columns ``'abstract'``, ``'question'``
        and ``'answer'``, formatted for torch.
    """
    abstracts = []
    questions = []
    answers = []

    # Iterate over all articles
    for article in dataset_split:
        qa = article['qas']

        # Iterate over all questions and their annotated answers
        for question, annotations in zip(qa['question'], qa['answers']):
            unanswerable = False
            # Generate all answer candidates, from which we then randomly sample
            answer_candidates = []
            for annotation in annotations['answer']:
                # A single "unanswerable" vote discards the whole question,
                # so there is no point in looking at the remaining annotations.
                if annotation['unanswerable']:
                    unanswerable = True
                    break

                if annotation['free_form_answer']:
                    candidate = annotation['free_form_answer']
                elif annotation['extractive_spans']:
                    candidate = ' '.join(annotation['extractive_spans'])
                elif annotation.get('yes_no') is not None:
                    # Boolean answers carry neither free-form text nor spans;
                    # without this branch they would become empty-string targets.
                    candidate = 'Yes' if annotation['yes_no'] else 'No'
                else:
                    # No usable answer text at all - never train on an empty target.
                    continue
                answer_candidates.append(candidate)

            # If a question is unanswerable, skip it as to not pollute the training data set
            if unanswerable or not answer_candidates:
                continue

            # Finally add relevant objects to training data
            abstracts.append(article['abstract'])
            questions.append(question)
            answers.append(random.choice(answer_candidates))  # Randomly sample from available answers

    # DEBUG: Sanity check
    assert len(abstracts) == len(questions) == len(answers)

    return Dataset.from_dict({
        'abstract': abstracts,
        'question': questions,
        'answer': answers
    }).with_format("torch")

In [19]:
# Task 4 - Initial Preprocessing of qasper dataset
# The train split is formatted before the test split; the order matters because
# the answer sampling inside the helper consumes the global random state.
train_dataset, test_dataset = map(
    get_formatted_dataset_from_split,
    (dataset['train'], dataset['test']),
)

print(train_dataset, test_dataset, sep="\n")

Dataset({
    features: ['abstract', 'question', 'answer'],
    num_rows: 2315
})
Dataset({
    features: ['abstract', 'question', 'answer'],
    num_rows: 1208
})


In [20]:
# Task 5 - Get validation set from train
# Hold out 10% of the formatted training rows; the split is seeded so it is repeatable.
train_val_dataset = train_dataset.train_test_split(seed=SEED, test_size=0.1)

# Bundle the three splits; the held-out part of the train split becomes 'val'.
dataset = DatasetDict(
    train=train_val_dataset['train'],
    test=test_dataset,
    val=train_val_dataset['test'],
)

dataset

DatasetDict({
    train: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 2083
    })
    test: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 1208
    })
    val: Dataset({
        features: ['abstract', 'question', 'answer'],
        num_rows: 232
    })
})

In [21]:
# Task 6 - Preprocessing Function
# Tokenizer belonging to the "google/t5-efficient-tiny" checkpoint; used by the preprocessing function below.
tokenizer = T5Tokenizer.from_pretrained("google/t5-efficient-tiny")


def preprocess_function(sample: Dict[str, Any],
                        max_input_length: int = 128,
                        max_target_length: int = 32) -> Dict[str, Any]:
    """Tokenize one (question, abstract, answer) row for seq2seq training.

    Args:
        sample: Row with the string fields ``'question'``, ``'abstract'`` and
            ``'answer'``.
        max_input_length: Length the encoder input is truncated/padded to.
        max_target_length: Length the target sequence is truncated/padded to.

    Returns:
        A dict with
        ``'feature'``: the tokenizer output for the prompt
        ``"question: <question> context: <abstract>"`` and
        ``'labels'``: the target token ids, where padding positions are
        replaced by ``-100``.
    """
    # Plain single-brace interpolation: triple braces would wrap the question
    # and the abstract in literal '{' / '}' characters inside the prompt.
    combined_qa = f"question: {sample['question']} context: {sample['abstract']}"

    feature = tokenizer(combined_qa,
                        truncation=True,
                        padding="max_length",
                        max_length=max_input_length)
    label_ids = tokenizer(sample['answer'],
                          truncation=True,
                          padding="max_length",
                          max_length=max_target_length).input_ids

    # -100 is the index the loss ignores; without this the model would also be
    # trained to predict the padding that fills up short answers.
    pad_id = tokenizer.pad_token_id
    labels = [-100 if token_id == pad_id else token_id for token_id in label_ids]

    return {'feature': feature, 'labels': labels}

In [22]:
# Task 7 - Apply preprocessing function using map
# The raw text columns are dropped; flatten() lifts the nested 'feature' struct so that
# input_ids and attention_mask become first-level columns ('feature.input_ids', ...).
# One flatten() call is enough - a second call finds nothing left to flatten.
encoded_ds = dataset.map(preprocess_function,
                         remove_columns=['abstract', 'question', 'answer']).flatten()
# Rename columns such that trainer knows what to do
encoded_ds = encoded_ds.rename_columns({
    'feature.attention_mask': 'attention_mask',
    'feature.input_ids':      'input_ids'
})

Map:   0%|          | 0/2083 [00:00<?, ? examples/s]

Map:   0%|          | 0/1208 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

In [23]:
# Task 8 - load google/t5-efficient-tiny model with pre-trained weights
# Same checkpoint name as the tokenizer, so model and tokenizer stay in sync.
model = T5ForConditionalGeneration.from_pretrained("google/t5-efficient-tiny")

In [24]:
# Task 9 - Define Seq2SeqTrainerArguments with learning rate scheduling and weight decay

# Hyperparameters shared by the training arguments, the optimizer and the scheduler
NUM_TRAIN_EPOCHS = 5
TRAIN_BATCH_SIZE = 1
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# Define Seq2SeqTrainerArguments
training_args = Seq2SeqTrainingArguments(output_dir='./checkpoints/',
                                         do_train=True,
                                         do_eval=True,
                                         per_device_train_batch_size=TRAIN_BATCH_SIZE,
                                         per_device_eval_batch_size=1,
                                         learning_rate=LEARNING_RATE,
                                         evaluation_strategy="epoch",
                                         num_train_epochs=NUM_TRAIN_EPOCHS,
                                         load_best_model_at_end=True,
                                         save_strategy="epoch")

# Define Optimizer - this takes care of weight decay.
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=LEARNING_RATE,
                              weight_decay=WEIGHT_DECAY)

# Define Learning Rate Scheduler - linear decay to zero over the WHOLE training run.
# The step count must be the number of optimizer updates the trainer will perform:
# (batches per epoch on the actual training split) * (number of epochs).
# A shorter schedule would drive the learning rate to zero long before training ends.
batches_per_epoch = -(-len(encoded_ds['train']) // training_args.train_batch_size)  # ceil division
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_train_steps = updates_per_epoch * NUM_TRAIN_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_train_steps)




In [25]:
# Task 10 - Trainer
# TODO: Register experiment at Weights & Biases  https://wandb.ai/site and make screenshots

# Evaluate on the validation split during training: the per-epoch evaluation also
# selects the checkpoint restored by load_best_model_at_end, so the test split
# has to stay untouched until the final evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_ds['train'],
    eval_dataset=encoded_ds['val'],
    optimizers=(optimizer, scheduler)
)

In [26]:
# Start fine-tuning with the trainer configured above (long-running cell).
trainer.train()

  0%|          | 0/10415 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Task 11