In [1]:
!pip install datasets transformers==4.28.0 evaluate



In [2]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForMultipleChoice,
    Trainer,
    TrainingArguments,
    pipeline,
    LongformerForMultipleChoice,
    RobertaTokenizer,
    RobertaForMultipleChoice
    )
import evaluate
import pandas as pd

In [3]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read in the
# next cell live under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the competition CSVs from the mounted Drive folder.
df_train = pd.read_csv('drive/MyDrive/Data Science/NLP/LLM Science Exam/train.csv')
df_test = pd.read_csv('drive/MyDrive/Data Science/NLP/LLM Science Exam/test.csv')

# Wrap both frames as Hugging Face datasets.
raw_ds = Dataset.from_pandas(df_train)
raw_test_ds = Dataset.from_pandas(df_test)

# Hold out 20% of the training questions for validation. The split is seeded so
# that the validation set (and therefore the reported accuracy) is the same on
# every Restart & Run All.
SPLIT_SEED = 42
split_ds = raw_ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [5]:
# Alternative checkpoints, left disabled. The first two would be loaded through
# the generic Auto* classes:
# model_name = "bert-base-uncased"
# model_name = "roberta-base"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForMultipleChoice.from_pretrained(model_name)

# Longformer alternative (needs its own multiple-choice head class):
# model_name = 'potsawee/longformer-large-4096-answering-race'

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = LongformerForMultipleChoice.from_pretrained(model_name)

# Active choice: a RoBERTa checkpoint that, going by its name, was already
# fine-tuned on the RACE multiple-choice dataset.
model_name = "LIAMF-USP/roberta-large-finetuned-race"

# Load the matching tokenizer and the multiple-choice model from the same checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMultipleChoice.from_pretrained(model_name)

In [6]:
# Pair the prompt with each of the five answer options (A-E), so every question becomes five (prompt, option) sequences.
# Tokenize the five pairs together; each example then carries one input_ids / attention_mask row per option.
# The answer letter is mapped to its integer index and stored as the example's label.

# Column names of the answer options, and the letter <-> class-index mappings.
options = 'ABCDE'
option2label = {v: k for k, v in enumerate(options)}
label2option = {k: v for k, v in enumerate(options)}
# Every (prompt, option) pair is truncated/padded to this many tokens.
max_seq_len = 328

def tokenize_dataset(sample):
    """Tokenize one question as five (prompt, option) pairs.

    Args:
        sample: mapping with a 'prompt' entry, one entry per letter in
            ``options`` and, optionally, an 'answer' entry holding the
            letter of the correct option.

    Returns:
        The tokenizer output for the five pairs (one row per option, each
        truncated/padded to ``max_seq_len``). When the sample carries an
        answer, its integer index is added under 'label'; samples without
        an answer (or with a ``None`` answer) get no 'label' entry instead
        of raising ``KeyError``.
    """
    question_options = [[sample['prompt'], sample[o]] for o in options]

    # Tokenize all (prompt, option) pairs of this question in one call.
    tokenized_sents = tokenizer(
        question_options,
        truncation=True,
        max_length=max_seq_len,
        padding='max_length'
    )

    # Unlabeled data (e.g. a test set) simply yields no label.
    answer = sample.get('answer')
    if answer is not None:
        tokenized_sents['label'] = option2label[answer]
    return tokenized_sents


# Tokenize both splits one example at a time; the original CSV columns are
# dropped so only the tokenizer fields and the label remain.
columns_to_drop = raw_ds.column_names
tokenized_ds = split_ds.map(tokenize_dataset, batched=False, remove_columns=columns_to_drop)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [7]:
# code from https://huggingface.co/docs/transformers/tasks/multiple_choice

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each feature is a dict whose tokenizer fields hold one sequence per answer
    choice, plus an optional "label"/"labels" entry. The choices of all
    features are flattened, padded together with ``tokenizer.pad`` and reshaped
    back to ``(batch_size, num_choices, seq_len)``.

    The input features are not modified, so the same feature dicts can be
    collated more than once. Batches without labels are supported: the
    returned dict then has no "labels" entry.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Padding strategy forwarded to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    # Optional maximum length forwarded to ``tokenizer.pad``.
    max_length: Optional[int] = None
    # Optional multiple to pad to, forwarded to ``tokenizer.pad``.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: list of dicts; every non-label value is a list with one
                entry per answer choice.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)``, plus an
            int64 "labels" tensor when the features carry labels.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        has_labels = label_name in features[0]
        # Read the labels instead of popping them: popping would mutate the
        # caller's dicts and break any second pass over the same features.
        labels = [feature[label_name] for feature in features] if has_labels else None

        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice), built in a single linear pass.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice, sequence) layout.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [8]:
# Accuracy metric loaded through the `evaluate` library; compute_metrics below
# passes it the predicted and reference class indices.
metric = evaluate.load('accuracy')

def compute_metrics(logits_and_labels):
    """Score a (logits, labels) pair with the module-level ``metric``.

    The predicted class of each row is the index of its largest logit along
    dimension 1; the result of ``metric.compute`` is returned unchanged.
    """
    scores, gold = logits_and_labels
    predicted = torch.tensor(scores).argmax(dim=1)
    return metric.compute(predictions=predicted, references=gold)

In [9]:
# Fine-tuning hyperparameters.
num_epochs = 3
train_batch_size = 4
eval_batch_size = 4
lr = 1e-5
wd = 0.01

# Pads each batch of (example, choice) sequences and restores the 3-D layout.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

training_args = TrainingArguments(
    'finetuned-bert-multiple-choice',
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=num_epochs,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    learning_rate=lr,
    weight_decay=wd,
    fp16=True,
    # With ~160 training questions the model overfits within a few epochs
    # (validation loss keeps rising), so restore the checkpoint with the best
    # validation accuracy after training rather than keeping the last epoch.
    # Requires matching save/evaluation strategies, which both use 'epoch'.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.182283,0.475
2,No log,1.262424,0.5
3,No log,1.6163,0.55
4,No log,1.949589,0.525
5,No log,2.158407,0.525


TrainOutput(global_step=200, training_loss=0.8751486206054687, metrics={'train_runtime': 101.726, 'train_samples_per_second': 7.864, 'train_steps_per_second': 1.966, 'total_flos': 2388066049728000.0, 'train_loss': 0.8751486206054687, 'epoch': 5.0})

In [10]:
# Save the trainer's model to disk, then reload tokenizer and model from that
# directory through the generic Auto* classes.
finetuned_model_name = 'finetuned-bert-multiple-choice'
trainer.save_model(finetuned_model_name)

tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
model = AutoModelForMultipleChoice.from_pretrained(finetuned_model_name)

# Give the test set a placeholder 'answer' column so it can go through the same
# tokenize_dataset / Trainer path as the training data. The resulting labels
# are dummies and must not be used for scoring.
# The guard keeps the cell idempotent: adding the column a second time would
# fail because the dataset already has a column with that name.
answer = ['A'] * len(raw_test_ds)
if 'answer' not in raw_test_ds.column_names:
    raw_test_ds = raw_test_ds.add_column('answer', answer)

tokenized_test_ds = raw_test_ds.map(
    tokenize_dataset,
    batched=False,
    remove_columns=raw_test_ds.column_names
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
# Run inference on the tokenized test set through the trainer (instead of
# looping over samples by hand) and keep the three best-scoring options.
outputs = trainer.predict(tokenized_test_ds)

preds = torch.tensor(outputs.predictions)
# Negating the scores makes the ascending argsort rank the best option first.
preds_argsorted = torch.argsort(-preds)[:, :3]

# One space-separated string of option letters per test question.
final_preds = [
    ' '.join(label2option[choice] for choice in top_choices)
    for top_choices in preds_argsorted.numpy()
]

In [None]:
# Alternative to trainer.predict: score every test question directly with the
# reloaded model.
# Truncate exactly as during training so that a long prompt cannot exceed the
# model's maximum sequence length.
inputs = [
    tokenizer(
        [[sample['prompt'], sample[o]] for o in options],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_len,
    )
    for sample in raw_test_ds
]

model.eval()
final_preds = []
# Inference only: skip gradient tracking to save memory and time.
with torch.no_grad():
    for test_sample in inputs:

        # Add the batch dimension expected by the multiple-choice head and put
        # the tensors on the same device as the model.
        outputs = model(**{k: v.unsqueeze(0).to(model.device) for k, v in test_sample.items()})

        logits = outputs.logits
        # Negating the scores makes the ascending argsort rank the best option first.
        preds_argsorted = torch.argsort(-logits)[:, :3]
        for answer in preds_argsorted.cpu().numpy():
            answers_options = [label2option[ans] for ans in answer]
            final_preds.append(' '.join(answers_options))


In [12]:
# Attach the predicted option letters to the test frame and export the
# id/prediction pairs as the submission CSV.
df_test['prediction'] = final_preds
submission_path = 'drive/MyDrive/Data Science/NLP/LLM Science Exam/roberta-submission.csv'
submission = df_test[['id', 'prediction']]
submission.to_csv(submission_path, index=False)