Andrew Nakamoto and Edward Yeung \\
CSE 447: NLP \\
02/10/2024

This project is a heavily modified version of the Hugging Face multiple-choice task guide: https://huggingface.co/docs/transformers/en/tasks/multiple_choice

# Setup and Data

In [1]:
!pip install accelerate transformers datasets evaluate

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB

Load the OpenBookQA splits from the mounted Google Drive. If you are running this notebook yourself, replace the file paths below with the locations of the train, dev, and test files downloaded from https://leaderboard.allenai.org/open_book_qa/submissions/get-started

In [2]:
from datasets import load_dataset
from google.colab import drive

# Mount Google Drive so the dataset files are reachable from the Colab runtime.
drive.mount('/content/drive/', force_remount=True)

# Single place to edit when the OpenBookQA "Main" data folder lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/NLP_FINAL_PROJECT/OpenBookQA-V1-Sep2018/Data/Main"

path_to_train = f"{DATA_DIR}/train.jsonl"
path_to_dev = f"{DATA_DIR}/dev.jsonl"
path_to_test = f"{DATA_DIR}/test.jsonl"
# Keys become the split names used throughout the rest of the notebook.
data_files = {"train": path_to_train, "validation": path_to_dev, "test": path_to_test}

data = load_dataset("json", data_files=data_files)

Mounted at /content/drive/


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
# Sanity check: display the first raw example of the test split.
data["test"][0]

{'id': '8-343',
 'question': {'stem': 'A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to',
  'choices': [{'text': 'make more phone calls', 'label': 'A'},
   {'text': 'quit eating lunch out', 'label': 'B'},
   {'text': 'buy less with monopoly money', 'label': 'C'},
   {'text': 'have lunch with friends', 'label': 'D'}]},
 'answerKey': 'B'}

# Train model

Configure the model and the tokenizer

In [7]:
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# One checkpoint name drives both the tokenizer and the model so they always match.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The multiple-choice classifier head is not part of the checkpoint and starts out
# freshly initialized (see the warning printed by this cell), so it must be trained.
model = AutoModelForMultipleChoice.from_pretrained(model_name)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize the data

In [8]:
# Maps the dataset's letter answer keys to the integer class ids the model expects.
answer_key_to_numeric = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

def preprocess_function(examples):
    """Tokenize a batch of questions as (stem, choice) sentence pairs.

    Args:
        examples: a batch in column form, as passed by ``Dataset.map(batched=True)``.
            ``examples['question']`` is a list of dicts with a ``'stem'`` string and a
            ``'choices'`` list of ``{'text', 'label'}`` dicts; ``examples['answerKey']``
            is the list of correct answer letters.

    Returns:
        A dict holding every field produced by the tokenizer, regrouped so that each
        entry is the list of per-choice encodings of one question, plus a ``'label'``
        list with the numeric index of the correct choice.

    Raises:
        KeyError: if an answer key is not one of the letters in ``answer_key_to_numeric``.
    """
    questions = examples['question']
    # Number of answer options per question; derived from the data rather than assumed to be 4.
    choice_counts = [len(question['choices']) for question in questions]

    # Flatten to one (stem, choice) pair per answer option, repeating the stem for each option.
    first_sentences = [
        question['stem']
        for question, count in zip(questions, choice_counts)
        for _ in range(count)
    ]
    second_sentences = [
        f"{item['text']}" for question in questions for item in question['choices']
    ]

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Slice boundaries that map the flat encodings back to their question.
    bounds = [0]
    for count in choice_counts:
        bounds.append(bounds[-1] + count)

    # Regroup the flat tokenizer output into one list of choices per question.
    d = {
        k: [v[start:end] for start, end in zip(bounds, bounds[1:])]
        for k, v in tokenized_examples.items()
    }
    # Add numeric labels to each of the items - needed for the model.
    d['label'] = [answer_key_to_numeric[c] for c in examples['answerKey']]
    return d

In [9]:
# Tokenize every split; batched=True hands preprocess_function a dict of column lists.
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# Sanity check: the first test example now also carries the tokenizer outputs.
tokenized_data["test"][0]

{'id': '8-343',
 'question': {'stem': 'A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to',
  'choices': [{'text': 'make more phone calls', 'label': 'A'},
   {'text': 'quit eating lunch out', 'label': 'B'},
   {'text': 'buy less with monopoly money', 'label': 'C'},
   {'text': 'have lunch with friends', 'label': 'D'}]},
 'answerKey': 'B',
 'input_ids': [[101,
   1037,
   2711,
   4122,
   2000,
   2707,
   7494,
   2769,
   2061,
   2008,
   2027,
   2064,
   8984,
   1037,
   3835,
   10885,
   2012,
   1996,
   2203,
   1997,
   1996,
   2095,
   1012,
   2044,
   2559,
   2058,
   2037,
   5166,
   1998,
   11727,
   1010,
   2027,
   5630,
   1996,
   2190,
   2126,
   2000,
   3828,
   2769,
   2003,
   2000,
   102,
   2191,
   2062,
   3042,
   4455,
   102],
  [101,
   1037,
   2711,
   4122,
   2000,
   2707,
   7494,
   2769,
   2061,
  

Adapt the DataCollatorWithPadding to create a batch of examples. It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [11]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

In [12]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs to a common length.

    Each incoming feature holds, for every tokenizer field, one sequence per answer
    choice, plus an integer label stored under "label" or "labels". The collator
    flattens all choices of all features, pads them together with the tokenizer,
    and reshapes every field back to (batch_size, num_choices, padded_length).
    """

    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The remaining three fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of feature dicts into a dict of batched tensors.

        Args:
            features: non-empty list of dicts, all with the same number of choices
                under "input_ids". The dicts are left unmodified.

        Returns:
            Dict of tensors shaped (batch_size, num_choices, padded_length) plus an
            int64 "labels" tensor with one entry per feature.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them so the caller's dicts stay intact
        # and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (feature, choice) pair, in feature-major order; the label is
        # left out because it is not a per-choice sequence.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Undo the flattening: (batch_size * num_choices, L) -> (batch_size, num_choices, L).
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

Set up evaluation

In [13]:
import evaluate

# Accuracy metric from the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    """Turn per-choice scores into predicted choices and score them for accuracy.

    Args:
        eval_pred: a pair ``(predictions, labels)``; ``predictions`` holds one row of
            per-choice scores per example, ``labels`` the index of the correct choice.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted and reference indices.
    """
    scores, gold = eval_pred
    # The predicted answer is the highest-scoring choice in each row.
    picked = np.argmax(scores, axis=1)
    return accuracy.compute(references=gold, predictions=picked)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Set up the trainer. Pushing the model to the Hugging Face Hub requires write permission, so log in with an access token that has write access.

In [14]:
from huggingface_hub import notebook_login

# Opens the Hugging Face login widget; the token is needed to push the model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
training_args = TrainingArguments(
    # Hub model ids may carry a namespace ("org/name"); keep only the last component
    # so the output directory (which also names the pushed Hub repo) is a single segment.
    output_dir=f"openbookqa_{model_name.split('/')[-1]}",
    # NOTE(review): newer transformers releases rename this to `eval_strategy` —
    # confirm against the installed version before changing it.
    evaluation_strategy="epoch",
    # Must match the evaluation strategy for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the best checkpoint by the metric compute_metrics reports; without this
    # the selection falls back to validation loss.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
    optim="adamw_torch",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    # Pads each batch dynamically and restores the (batch, choices, length) layout.
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Train and evaluate the model

In [None]:
# Training continues from whatever weights `model` currently holds. To start again
# from the pretrained checkpoint, re-run the model setup cell and rebuild the Trainer first.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9722,1.043231,0.59
2,0.5587,1.175769,0.574
3,0.2498,1.442598,0.58
4,0.1171,1.739584,0.58
5,0.0657,1.909452,0.584


TrainOutput(global_step=775, training_loss=0.3926958502492597, metrics={'train_runtime': 696.7907, 'train_samples_per_second': 35.57, 'train_steps_per_second': 1.112, 'total_flos': 2073584275909920.0, 'train_loss': 0.3926958502492597, 'epoch': 5.0})

Save the model

In [None]:
# Keep a copy of the trained model on Google Drive, in a folder named after the output directory.
path_to_save = f"/content/drive/MyDrive/NLP_FINAL_PROJECT/models/{trainer.args.output_dir}"
trainer.save_model(path_to_save)

events.out.tfevents.1708136485.8980104acd3d.251.1:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/asn1814/openbookqa_bert-base-uncased/commit/9ad252160844066f77c7de535f3827b3d234fd7e', commit_message='End of training', commit_description='', oid='9ad252160844066f77c7de535f3827b3d234fd7e', pr_url=None, pr_revision=None, pr_num=None)