In [1]:
from datasets import load_dataset, ClassLabel, Metric
from evaluate import load
# Load the "medmcqa" dataset; the result is indexed by split name below
# (e.g. dataset['train']).
dataset = load_dataset("medmcqa")


  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 5.35k/5.35k [00:00<00:00, 1.10MB/s]
Downloading metadata: 100%|██████████| 2.41k/2.41k [00:00<00:00, 992kB/s]
Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 3.58MB/s]


Downloading and preparing dataset medmcqa/default to /Users/ellington/.cache/huggingface/datasets/medmcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880...


Downloading data: 100%|██████████| 55.3M/55.3M [00:03<00:00, 15.7MB/s]
                                                                                          

Dataset medmcqa downloaded and prepared to /Users/ellington/.cache/huggingface/datasets/medmcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 78.91it/s]


In [8]:
import pandas as pd
# Materialize the train split as a pandas DataFrame for quick inspection.
df = dataset['train'].to_pandas()

In [10]:
# List the columns available on each question record.
df.columns

Index(['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type',
       'exp', 'subject_name', 'topic_name'],
      dtype='object')

In [14]:
# Report row counts. DataFrame.size is rows * columns (number of cells), so
# len() is used to count questions instead.
f"Original dataset of size {len(df)} and {len(df[df.choice_type == 'single'])} are of type single"

'Original dataset of size 182822 and 120765 are of type single'

In [15]:
def _is_single_choice(example):
    """Return True when the example's ``choice_type`` field equals 'single'."""
    return example['choice_type'] == 'single'


# Apply the predicate to every split of the DatasetDict, then display the result.
filtered_dataset = dataset.filter(_is_single_choice)
filtered_dataset

                                                                         

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 120765
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4134
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 2816
    })
})

In [28]:
# Start from a copy of the train split's schema and retype 'cop' as a
# 4-class ClassLabel, then cast every split to that schema and expose the
# column under the name 'label'.
# NOTE: this cell rebinds filtered_dataset, so it is not idempotent: after
# it runs there is no 'cop' column left to cast; re-run the filter cell first.
features = filtered_dataset['train'].features.copy()
features['cop'] = ClassLabel(num_classes=4, names=['1', '2', '3', '4'])
filtered_dataset = filtered_dataset.cast(features=features).rename_column('cop', 'label')

                                                                                      

In [41]:
# Inspect the first training example after the cast/rename step.
filtered_dataset['train'][0]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'label': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract'}

In [60]:
# Pretrained checkpoint name; used to load both the tokenizer and the model.
model_checkpoint = "bert-base-uncased"
# Passed to TrainingArguments as the per-device train and eval batch size.
batch_size = 16

In [34]:
from transformers import AutoTokenizer
# Load the tokenizer that matches model_checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [48]:
answer_names = ["opa", "opb", "opc", "opd"]

def preprocess_function(examples):
    """Tokenize a batch of questions as (question, option) pairs.

    Args:
        examples: A batched mapping of column name -> list of values. Must
            contain "question" and every column listed in ``answer_names``.

    Returns:
        A dict with the same keys the tokenizer returns. Each value is a list
        with one entry per question, and each entry is a list holding one
        encoding per answer option (``len(answer_names)`` of them).
    """
    # Derive the number of options from answer_names so the repeat count and
    # the regrouping stride below can never disagree with the column list.
    num_choices = len(answer_names)
    num_examples = len(examples["question"])

    # Build the flat, aligned lists directly: each question is repeated once
    # per option. (Flattening with sum(list_of_lists, []) is quadratic.)
    questions = [
        question
        for question in examples["question"]
        for _ in range(num_choices)
    ]
    # str() mirrors plain f-string formatting, so non-string values such as
    # None are still turned into text.
    answers = [
        str(examples[name][i])
        for i in range(num_examples)
        for name in answer_names
    ]

    # Tokenize every (question, option) pair; padding is deferred to the collator.
    tokenized_examples = tokenizer(questions, answers, truncation=True)

    # Un-flatten: regroup consecutive runs of num_choices encodings per question.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

# Run the preprocessing over every split in batches; the tokenizer outputs are
# added as new columns next to the original ones.
tokenized_dataset = filtered_dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at /Users/ellington/.cache/huggingface/datasets/medmcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880/cache-3f80a93c8285dec4.arrow
Loading cached processed dataset at /Users/ellington/.cache/huggingface/datasets/medmcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880/cache-e1a6b351a6197a0a.arrow
Loading cached processed dataset at /Users/ellington/.cache/huggingface/datasets/medmcqa/default/1.1.0/f2fdfa9ccfbf9d148c0639e6afe3379f3c7e95c4d52d5e68ec1156e5004bd880/cache-b554f5fedc0982b4.arrow


In [49]:
# Display the splits and their columns after tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'label', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120765
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'label', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4134
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'label', 'choice_type', 'exp', 'subject_name', 'topic_name', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2816
    })
})

In [56]:
# Example index reused by the inspection cells below.
idx = 3
# Index by row first, then column: this reads one row, whereas column-first
# indexing loads the whole 'question' column just to pick a single value.
tokenized_dataset['train'][idx]['question']

'Scrub typhus is transmitted by: September 2004'

In [53]:
# Fetch the example's encodings once (row-first indexing avoids loading the
# whole 'input_ids' column on every loop iteration), then decode each
# (question, option) pair back to text.
example_input_ids = tokenized_dataset['train'][idx]["input_ids"]
[tokenizer.decode(choice_ids) for choice_ids in example_input_ids]

['[CLS] scrub typhus is transmitted by : september 2004 [SEP] louse [SEP]',
 '[CLS] scrub typhus is transmitted by : september 2004 [SEP] tick [SEP]',
 '[CLS] scrub typhus is transmitted by : september 2004 [SEP] mite [SEP]',
 '[CLS] scrub typhus is transmitted by : september 2004 [SEP] milk [SEP]']

In [61]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, IntervalStrategy

# Load the checkpoint into a multiple-choice model class. The warning printed
# below reports checkpoint weights that were not used and weights that were
# not initialized from the checkpoint.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [None]:
# Keep only the last path component so a namespaced checkpoint
# ("org/model") still yields a valid directory name.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    # Output directory: named after the dataset actually used in this
    # notebook (it previously carried a leftover "-swag" suffix).
    f"{model_name}-finetuned-medmcqa",
    # Evaluate, checkpoint, and log once per epoch.
    evaluation_strategy = IntervalStrategy.EPOCH,
    do_eval=True,
    save_strategy = IntervalStrategy.EPOCH,
    logging_strategy=IntervalStrategy.EPOCH,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [62]:
# We need to tell our Trainer how to form batches from the pre-processed inputs.
# We haven't done any padding yet because we will pad each batch to the maximum length inside the batch 
# (instead of doing so with the maximum length of the whole dataset). 
# This will be the job of the data collator. 
# A data collator takes a list of examples and converts them to a batch (by, in our case, applying padding).
# Since no data collator in the library handles our specific problem, we write one, adapted from DataCollatorWithPadding:

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature is a dict whose non-label values are lists with one
    entry per answer choice. The collator flattens the choices, pads them
    together with ``tokenizer.pad``, reshapes every padded tensor to
    ``(batch_size, num_choices, -1)`` and attaches the labels under "labels".

    The input feature dicts are left unmodified, so the same list of features
    can be collated more than once.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a non-empty list of feature dicts into a padded batch.

        Args:
            features: List of dicts holding per-choice lists (e.g. "input_ids")
                plus a "label" or "labels" entry.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)`` plus a
            1-D int64 "labels" tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them: popping mutated the
        # caller's dicts and made a second call on the same features fail.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice), in example-major order, without the
        # label entry. Built flat to avoid the quadratic sum(list_of_lists, []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [63]:
# Smoke-test the collator on the first ten training examples, keeping only
# the columns listed in accepted_keys.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [
    {
        key: value
        for key, value in tokenized_dataset["train"][row].items()
        if key in accepted_keys
    }
    for row in range(10)
]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [64]:
# Decode the four padded choices of the first example in the collated batch.
[tokenizer.decode(batch["input_ids"][0][i].tolist()) for i in range(4)]

['[CLS] chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma [SEP] hyperplasia [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma [SEP] hyperophy [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma [SEP] atrophy [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] chronic urethral obstruction due to benign prismatic hyperplasia can le

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class for each row is the argmax of ``predictions`` along
    axis 1. Returns ``{"accuracy": <float>}``, the fraction of rows whose
    predicted class equals the corresponding label id.
    """
    scores, references = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == references).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Wire everything together: train on the tokenized train split, evaluate on
# the validation split, pad batches with the multiple-choice collator, and
# report accuracy through compute_metrics.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the arguments configured above.
trainer.train()