In [1]:
#Run this if any import errors occur, since you need some updated stuff.
#! pip install datasets transformers

In [2]:
import transformers

print(transformers.__version__)
#At least 4.11.0 required!

4.36.2


In [3]:
#Set up model

model_checkpoint = "bert-base-uncased"
batch_size = 16

We must now load the dataset from our local JSON files. 

For refernece, see https://huggingface.co/docs/datasets/en/loading

In [4]:
from datasets import load_dataset, load_metric

#Load the dataset as train, validation, and test.
#We use the dev data as validation.

datasets = load_dataset("json", data_files={'train':'QQA Data/QQA_train.json', 
                                           'validation':'QQA Data/QQA_dev.json', 
                                           'test':'QQA Data/QQA_test.json'})

#Printing the dataset contents
print('Q: ' + datasets['train'][0]['question'])
print('O1: ' + datasets['train'][0]['Option1'])
print('O2: ' + datasets['train'][0]['Option2'])
print('A: ' + datasets['train'][0]['answer'])

Q: The ranger and the rustler both were riding horses that galloped at the same speed.  The rustler left at 01:00 where as the ranger left at 0500 hours. Who has traveled further?? 
O1: the ranger
O2: the rustler
A: Option 2


In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 564
    })
    validation: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 81
    })
    test: Dataset({
        features: ['question_sci_10E', 'question', 'Option2', 'question_char', 'answer', 'Option1', 'question_mask', 'type', 'question_sci_10E_char'],
        num_rows: 162
    })
})

The imported data is of the form:

{"question": "Jame's mother has a photo of Jane standing at a height of 14 inches, whereas a mountain appears to have height of 26 cm. It looks that way because? ", "Option1": "the mountain was farther away", "Option2": "Jane was farther away", "answer": "Option 2", "type": "Type_3", "question_sci_10E": "Jame's mother has a photo of Jane standing at a height of 1.4000000000E+01 inches, whereas a mountain appears to have height of 2.6000000000E+01 cm. It looks that way because? ", "question_char": "Jame's mother has a photo of Jane standing at a height of 1 4 inches, whereas a mountain appears to have height of 2 6 cm. It looks that way because? ", "question_sci_10E_char": "Jame's mother has a photo of Jane standing at a height of 1 . 4 0 0 0 0 0 0 0 0 0 E + 0 1 inches, whereas a mountain appears to have height of 2 . 6 0 0 0 0 0 0 0 0 0 E + 0 1 cm. It looks that way because? ", "question_mask": "Jame's mother has a photo of Jane standing at a height of [Num] inches, whereas a mountain appears to have height of [Num] cm. It looks that way because? "

For reference, see:
https://huggingface.co/docs/transformers/en/tasks/multiple_choice


In [6]:
#Let's get rid of the other columns
datasets = datasets.remove_columns(['question_char', 'question_sci_10E',
                         'question_sci_10E_char',
                         'question_mask', 'type',])

#We now only have a question, answer, and 2 options.

In [7]:
datasets = datasets.rename_column('answer', 'label')

For evaluation, see https://huggingface.co/spaces/evaluate-metric/accuracy

For fine-tuning, see: https://huggingface.co/docs/transformers/en/training#train-with-pytorch-trainer

For handing multiple choice, see https://huggingface.co/docs/transformers/en/tasks/multiple_choice


In [8]:
def set_labels(example):
    #print(example)
    example["label"] = int(example["label"][-1]) - 1
    return example

datasets = datasets.map(set_labels)

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

In [9]:
datasets["train"][30]

{'question': 'A race car and a pickup both drove on the highway at the same speed. The driver of the race car got tired and parked after 29 mins, while the driver of the pickup ran for 43 mins. Which vehicle ultimately went the greater distance?? ',
 'Option2': 'pickup',
 'label': 1,
 'Option1': 'race car'}

In [10]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer("AMONG US!", "The impostor is SUS.")

{'input_ids': [101, 2426, 2149, 999, 102, 1996, 17727, 14122, 2953, 2003, 10514, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
ending_names = ["Option1", "Option2"]

def preprocess_function(examples):
    # Repeat each first sentence two times to go with the two possibilities of second sentences.
    first_sentences = [[context] * 2 for context in examples["question"]]
    # Grab all second sentences possible for each context.
    question_headers = examples["question"]
    second_sentences = [[f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)]
    
    # Flatten everything
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten
    return {k: [v[i:i+2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [12]:
examples = datasets["train"][:5]
features = preprocess_function(examples)
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]])

5 2 [89, 90]


In [13]:
idx = 3
[tokenizer.decode(features["input_ids"][idx][i]) for i in range(2)]

['[CLS] a tank weighs around 63 tons. a toy car weighs 1. 5 kg. because of this? [SEP] a tank weighs around 63 tons. a toy car weighs 1. 5 kg. because of this? the tank will speed up faster than the toy car [SEP]',
 '[CLS] a tank weighs around 63 tons. a toy car weighs 1. 5 kg. because of this? [SEP] a tank weighs around 63 tons. a toy car weighs 1. 5 kg. because of this? the toy car will speed up faster than the tank [SEP]']

In [14]:
encoded_datasets = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

In [15]:
print(encoded_datasets)

DatasetDict({
    train: Dataset({
        features: ['question', 'Option2', 'label', 'Option1', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 564
    })
    validation: Dataset({
        features: ['question', 'Option2', 'label', 'Option1', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 81
    })
    test: Dataset({
        features: ['question', 'Option2', 'label', 'Option1', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 162
    })
})


In [16]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-swag",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    #push_to_hub=True,
)

In [18]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [19]:
#Testing stuff.

accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(2)]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['[CLS] the beauty queen glided across the marble floors with speed of 15 m / s but at speed 1000 cm / s on the wooden floors because it had? [SEP] the beauty queen glided across the marble floors with speed of 15 m / s but at speed 1000 cm / s on the wooden floors because it had? more resistance [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] the beauty queen glided across the marble floors with speed of 15 m / s but at speed 1000 cm / s on the wooden floors because it had? [SEP] the beauty queen glided across the marble floors with speed of 15 m / s but at speed 1000 cm / s on the wooden floors because it had? less resistance [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [20]:
import numpy as np

def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()

#EVERYTHING UP TO HERE WORKS FOR REAL. EVERYTHING BEYOND MIGHT BE... NOT SO GREAT. BE WARNED.
#Also, I haven't actually trained anything yet because of time. I've managed one epoch, but... eh.
# - Patrick

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692849,0.506173


KeyboardInterrupt: 

In [None]:
import pandas as pd
from datasets import load_dataset
from evaluate import evaluator
from transformers import pipeline

models = [
    model_name
    #"xlm-roberta-large-finetuned-conll03-english",
    #"dbmdz/bert-large-cased-finetuned-conll03-english",
    #"elastic/distilbert-base-uncased-finetuned-conll03-english",
    #"dbmdz/electra-large-discriminator-finetuned-conll03-english",
    #"gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner",
    #"philschmid/distilroberta-base-ner-conll2003",
    #"Jorgeutd/albert-base-v2-finetuned-ner",
]

data = datasets

task_evaluator = evaluator("token-classification")

results = []
for model in models:
    results.append(
        task_evaluator.compute(
            model_or_pipeline=model, data=data, metric="seqeval"
            )
        )

df = pd.DataFrame(results, index=models)
df[["overall_f1", "overall_accuracy", "total_time_in_seconds", "samples_per_second", "latency_in_seconds"]]