In [2]:
! pip install datasets transformers



In [4]:
import transformers

print(transformers.__version__)
#At least 4.11.0 required!

4.36.2


In [5]:
#Set up model

model_checkpoint = "bert-base-uncased"
batch_size = 16

We must now load the dataset from our local JSON files. 

For refernece, see https://huggingface.co/docs/datasets/en/loading

In [8]:
from datasets import load_dataset, load_metric

#Load the dataset as train, validation, and test.
#We use the dev data as validation.

datasets = load_dataset("json", data_files={'train':'QQA Data/QQA_train.json', 
                                           'validation':'QQA Data/QQA_dev.json', 
                                           'test':'QQA Data/QQA_test.json'})

#Printing the dataset contents
print('Q: ' + dataset['train'][0]['question'])
print('O1: ' + dataset['train'][0]['Option1'])
print('O2: ' + dataset['train'][0]['Option2'])
print('A: ' + dataset['train'][0]['answer'])

#Load evaluation metric
accuracy_metric = load("squad")

Q: The ranger and the rustler both were riding horses that galloped at the same speed.  The rustler left at 01:00 where as the ranger left at 0500 hours. Who has traveled further?? 
O1: the ranger
O2: the rustler
A: Option 2


In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 564
    })
    validation: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 81
    })
    test: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 162
    })
})

The imported data is of the form:

{"question": "Jame's mother has a photo of Jane standing at a height of 14 inches, whereas a mountain appears to have height of 26 cm. It looks that way because? ", "Option1": "the mountain was farther away", "Option2": "Jane was farther away", "answer": "Option 2", "type": "Type_3", "question_sci_10E": "Jame's mother has a photo of Jane standing at a height of 1.4000000000E+01 inches, whereas a mountain appears to have height of 2.6000000000E+01 cm. It looks that way because? ", "question_char": "Jame's mother has a photo of Jane standing at a height of 1 4 inches, whereas a mountain appears to have height of 2 6 cm. It looks that way because? ", "question_sci_10E_char": "Jame's mother has a photo of Jane standing at a height of 1 . 4 0 0 0 0 0 0 0 0 0 E + 0 1 inches, whereas a mountain appears to have height of 2 . 6 0 0 0 0 0 0 0 0 0 E + 0 1 cm. It looks that way because? ", "question_mask": "Jame's mother has a photo of Jane standing at a height of [Num] inches, whereas a mountain appears to have height of [Num] cm. It looks that way because? "

For reference, see:
https://huggingface.co/docs/transformers/en/tasks/multiple_choice


In [14]:
#Let's get rid of the other columns
datasets = datasets.remove_columns(['question_char', 'question_sci_10E',
                         'question_sci_10E_char',
                         'question_mask', 'type',])

#We now only have a question, answer, and 2 options.

DatasetDict({
    train: Dataset({
        features: ['question', 'Option2', 'answer', 'Option1'],
        num_rows: 564
    })
    validation: Dataset({
        features: ['question', 'Option2', 'answer', 'Option1'],
        num_rows: 81
    })
    test: Dataset({
        features: ['question', 'Option2', 'answer', 'Option1'],
        num_rows: 162
    })
})

For evaluation, see https://huggingface.co/spaces/evaluate-metric/accuracy

For fine-tuning, see: https://huggingface.co/docs/transformers/en/training#train-with-pytorch-trainer

For handing multiple choice, see https://huggingface.co/docs/transformers/en/tasks/multiple_choice


In [16]:
datasets["train"][30]

{'question': 'A race car and a pickup both drove on the highway at the same speed. The driver of the race car got tired and parked after 29 mins, while the driver of the pickup ran for 43 mins. Which vehicle ultimately went the greater distance?? ',
 'Option2': 'pickup',
 'answer': 'Option 2',
 'Option1': 'race car'}

In [19]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer("AMONG US!", "The impostor is SUS.")

{'input_ids': [101, 2426, 2149, 999, 102, 1996, 17727, 14122, 2953, 2003, 10514, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

max_seq_length = tokenizer.model_max_length

choice_list = ["Option1", "Option2"]

def preprocess_function(examples):
    # replicating first sentences 4 times
    first_sentences = []
    for q in examples['question']:
      for i in range(2):
        first_sentences.append(q)

    # putting all choices in a list
    second_sentences = []
    for choice in choice_list:
        second_sentences.append(choice)


    # Tokenize
    print(first_sentences)
    
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length)
    # Un-flatten
    dic = {'input_ids':[], 'token_type_ids':[], 'attention_mask':[]}

    for k, v in tokenized_examples.items():
        for i in range(0, len(v), 2):
            dic[k].append(v[i:i+2])

    return dic

In [48]:
ending_names = ["Option1", "Option2"]

def preprocess_function(examples):
    # Repeat each first sentence two times to go with the two possibilities of second sentences.
    first_sentences = [[context] * 2 for context in examples["question"]]
    # Grab all second sentences possible for each context.
    question_headers = examples["question"]
    second_sentences = [[f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)]
    
    # Flatten everything
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten
    return {k: [v[i:i+2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [49]:
examples = datasets["train"][:5]
features = preprocess_function(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

#EVERYTHING UP TO HERE WORKS FOR REAL. EVERYTHING BEYOND IS... NOT SO GREAT. BE WARNED.

5 2 [89, 90]


In [52]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        #labels = [int(feature.pop(label_name)) for feature in features]
        labels = [1, 2]
        #Error checking
        print(labels)
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [53]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model_name = 'deepset/roberta-base-squad2'

model = AutoModelForMultipleChoice.from_pretrained(model_name)

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


[1, 2]


IndexError: string index out of range

In [None]:
import numpy as np

import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    print(predictions)
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [None]:
trainer = Trainer(
    model=model,
    #args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
train_result = trainer.train()
trainer.save_model() # Saves the tokenizer too for easy upload

metrics = train_result.metrics


max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )

metrics["train_samples"] = min(max_train_samples, len(train_dataset))

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()