In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
# from datasets import Dataset
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import json
import json_lines
import os
from tqdm import tqdm
from collections import defaultdict

In [20]:
class MultipleChoiceDataset(Dataset):
    """Map-style dataset that serves already-processed examples from a sequence.

    The wrapped ``data`` object is stored as-is; indexing and length are
    delegated straight to it.
    """

    def __init__(self, data):
        # Keep a reference only -- no copying or preprocessing happens here.
        self.data = data

    def __getitem__(self, idx):
        """Return whatever ``data`` holds at position ``idx``."""
        example = self.data[idx]
        return example

    def __len__(self):
        """Return the number of stored examples."""
        size = len(self.data)
        return size

In [21]:
def get_riddlesense_prompt(question, options):
    """Build the five-choice RiddleSense prompt for a single question.

    Args:
        question: The riddle text placed after ``Question:``.
        options: Indexable collection; items 0-4 become choices (A)-(E).
            Items beyond the fifth are ignored.

    Returns:
        The prompt string. It starts with a newline and has no trailing newline.

    Raises:
        IndexError: If ``options`` is a list with fewer than five items.
    """
    lines = [
        "",  # the prompt opens with an empty line
        f"Question: {question}",
        "",
        "What is the correct answer to the question from the following choices?",
        "Options: ",  # the trailing space is part of the prompt text
    ]
    for position, letter in enumerate("ABCDE"):
        lines.append(f"({letter}): {options[position]}")
    return "\n".join(lines)

In [None]:
def load_data(file_path):
    """Read a JSON-lines file of riddles and tokenize each one for seq2seq training.

    Every record must provide ``question.stem``, ``question.choices[*].text``
    and ``answerKey``. Relies on the notebook-level ``tokenizer`` and on
    ``get_riddlesense_prompt``.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        A list with one tokenizer output per record. Each holds the prompt
        encoding (padded/truncated to 512 tokens) plus a ``"labels"`` list of
        length 2 in which pad token ids are replaced by -100.
    """
    with open(file_path, 'rb') as handle:
        records = list(json_lines.reader(handle))

    def encode(record):
        # Pull the three fields the prompt and the target are built from.
        stem = record['question']['stem']
        choice_texts = [choice['text'] for choice in record['question']['choices']]
        answer_key = record['answerKey']

        features = tokenizer(
            get_riddlesense_prompt(stem, choice_texts),
            truncation=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
        )

        target_ids = tokenizer(
            answer_key, max_length=2, padding="max_length", truncation=True
        )["input_ids"]
        # Swap pad ids for -100 -- presumably so the loss skips them
        # (-100 is the default ignore_index of PyTorch's cross-entropy).
        features["labels"] = [
            -100 if token_id == tokenizer.pad_token_id else token_id
            for token_id in target_ids
        ]
        return features

    return [encode(record) for record in records]

In [124]:
# One checkpoint name feeds both loaders so the tokenizer and model always match.
model_name = 'google/flan-t5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [130]:
# Tokenize the training split and the dev split (used for validation) in turn.
train_data, valid_data = (
    load_data(split_path)
    for split_path in ("data/rs_train.jsonl", "data/rs_dev.jsonl")
)

In [131]:
batch_size = 4

# Wrap the tokenized examples so a DataLoader can index them.
train_dataset = MultipleChoiceDataset(train_data)
valid_dataset = MultipleChoiceDataset(valid_data)

# Both loaders share the collator and batch size; only the shuffling differs.
loader_kwargs = dict(collate_fn=default_data_collator, batch_size=batch_size)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

In [147]:
# hyperparameters
lr = 1e-3
num_epochs = 10
# The DataLoaders are built before this cell runs, so assigning a new number
# here (previously 8) never reached them and misreported the run's batch size.
# Mirror the loader's actual value instead; to change the batch size, edit the
# cell that constructs the loaders.
batch_size = train_loader.batch_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [148]:
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_training_steps = len(train_loader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear schedule with no warmup phase (num_warmup_steps=0).
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [149]:
# Fine-tune for `num_epochs` epochs. Each epoch runs one pass over `train_loader`
# with an optimizer and scheduler step per batch, then one evaluation pass over
# `valid_loader`. After the loop, `eval_preds` / `true_preds` hold the decoded
# predictions and references from the most recent evaluation pass.
model = model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        # Detach so the running total does not keep the autograd graph alive.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    true_preds = []
    for batch in tqdm(valid_loader):
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the per-position argmax of the logits from this
        # forward pass (the labels are supplied), not the result of generate().
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )
        # load_data replaces pad ids in the labels with -100, which is not a
        # valid token id. Restore the pad id before decoding; it is then
        # dropped by skip_special_tokens.
        decodable_labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
        true_preds.extend(
            tokenizer.batch_decode(decodable_labels.detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Per-epoch summary: mean batch loss and its exponential (perplexity).
    eval_epoch_loss = eval_loss / len(valid_loader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_loader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

# model.save_pretrained(f'{model_name[7:]}_finetuned')

100%|██████████| 878/878 [00:53<00:00, 16.56it/s]
100%|██████████| 256/256 [00:04<00:00, 58.95it/s]


epoch=0: train_ppl=tensor(2.2439, device='cuda:0') train_epoch_loss=tensor(0.8082, device='cuda:0') eval_ppl=tensor(2.2364, device='cuda:0') eval_epoch_loss=tensor(0.8049, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.37it/s]
100%|██████████| 256/256 [00:04<00:00, 59.34it/s]


epoch=1: train_ppl=tensor(2.2439, device='cuda:0') train_epoch_loss=tensor(0.8082, device='cuda:0') eval_ppl=tensor(2.2391, device='cuda:0') eval_epoch_loss=tensor(0.8061, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.48it/s]
100%|██████████| 256/256 [00:04<00:00, 59.63it/s]


epoch=2: train_ppl=tensor(2.2441, device='cuda:0') train_epoch_loss=tensor(0.8083, device='cuda:0') eval_ppl=tensor(2.2369, device='cuda:0') eval_epoch_loss=tensor(0.8051, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.53it/s]
100%|██████████| 256/256 [00:04<00:00, 59.79it/s]


epoch=3: train_ppl=tensor(2.2419, device='cuda:0') train_epoch_loss=tensor(0.8073, device='cuda:0') eval_ppl=tensor(2.2392, device='cuda:0') eval_epoch_loss=tensor(0.8061, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.53it/s]
100%|██████████| 256/256 [00:04<00:00, 59.93it/s]


epoch=4: train_ppl=tensor(2.2463, device='cuda:0') train_epoch_loss=tensor(0.8093, device='cuda:0') eval_ppl=tensor(2.2404, device='cuda:0') eval_epoch_loss=tensor(0.8067, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.50it/s]
100%|██████████| 256/256 [00:04<00:00, 60.07it/s]


epoch=5: train_ppl=tensor(2.2386, device='cuda:0') train_epoch_loss=tensor(0.8059, device='cuda:0') eval_ppl=tensor(2.2424, device='cuda:0') eval_epoch_loss=tensor(0.8076, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.49it/s]
100%|██████████| 256/256 [00:04<00:00, 59.88it/s]


epoch=6: train_ppl=tensor(2.2403, device='cuda:0') train_epoch_loss=tensor(0.8066, device='cuda:0') eval_ppl=tensor(2.2370, device='cuda:0') eval_epoch_loss=tensor(0.8051, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.32it/s]
100%|██████████| 256/256 [00:04<00:00, 60.16it/s]


epoch=7: train_ppl=tensor(2.2388, device='cuda:0') train_epoch_loss=tensor(0.8060, device='cuda:0') eval_ppl=tensor(2.2372, device='cuda:0') eval_epoch_loss=tensor(0.8052, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.54it/s]
100%|██████████| 256/256 [00:04<00:00, 60.26it/s]


epoch=8: train_ppl=tensor(2.2391, device='cuda:0') train_epoch_loss=tensor(0.8061, device='cuda:0') eval_ppl=tensor(2.2363, device='cuda:0') eval_epoch_loss=tensor(0.8048, device='cuda:0')


100%|██████████| 878/878 [00:53<00:00, 16.55it/s]
100%|██████████| 256/256 [00:04<00:00, 60.18it/s]

epoch=9: train_ppl=tensor(2.2378, device='cuda:0') train_epoch_loss=tensor(0.8055, device='cuda:0') eval_ppl=tensor(2.2369, device='cuda:0') eval_epoch_loss=tensor(0.8051, device='cuda:0')





In [152]:
# Exact-match accuracy over the predictions left behind by the last evaluation
# pass; surrounding whitespace is stripped before comparing.
hits = [pred.strip() == true.strip() for pred, true in zip(eval_preds, true_preds)]
correct = sum(hits)
total = len(hits)
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
# Show a small sample of predictions next to the references.
print(f"{eval_preds[:10]=}")
print(f"{true_preds[:10]=}")

accuracy=20.372184133202744 % on the evaluation dataset
eval_preds[:10]=['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D']
true_preds[:10]=['B', 'E', 'A', 'D', 'B', 'D', 'E', 'A', 'A', 'C']


In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook
# (presumably required before the push below can be enabled).
from huggingface_hub import notebook_login
notebook_login()
# NOTE(review): the repo name in the disabled push below mentions t5-large / prefix
# tuning, while this notebook fine-tunes google/flan-t5-small -- confirm the
# target repository before re-enabling it.
# model.push_to_hub("devanshrj/t5-large_PREFIX_TUNING_SEQ2SEQ", use_auth_token=True)

## Inference

In [60]:
def get_brainteaser_prompt(question, options):
    """Build the four-choice BrainTeaser prompt for a single question.

    Args:
        question: The puzzle text placed after ``Question:``.
        options: Indexable collection; items 0-3 become choices (A)-(D).
            Items beyond the fourth are ignored.

    Returns:
        The prompt string. It starts with a newline and has no trailing newline.

    Raises:
        IndexError: If ``options`` is a list with fewer than four items.
    """
    lines = [
        "",  # the prompt opens with an empty line
        f"Question: {question}",
        "",
        "What is the correct answer to the question from the following choices?",
        "Options: ",  # the trailing space is part of the prompt text
    ]
    for position, letter in enumerate("ABCD"):
        lines.append(f"({letter}): {options[position]}")
    return "\n".join(lines)

In [63]:
# One hand-written BrainTeaser-style puzzle used as a quick smoke test.
question = (
    "Mr. and Mrs. Mustard have six daughters and each daughter has one brother. "
    "But there are only 9 people in the family, how is that possible?"
)
options = [
    "Some daughters get married and have their own family.",
    "Each daughter shares the same brother.",
    "Some brothers were not loved by family and moved away.",
    "None of above.",
]
bt_prompt = get_brainteaser_prompt(question, options)

In [64]:
# Tokenize the single prompt, requesting PyTorch tensors (return_tensors="pt")
# so the result can be moved to `device` and passed to the model.
inputs = tokenizer(bt_prompt, return_tensors="pt")

In [66]:
# Generate an answer for the tokenized prompt and print the decoded text.
model.to(device)
# Set evaluation mode explicitly so the result does not depend on which cells
# ran earlier (e.g. a training epoch interrupted while in train mode).
model.eval()
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Pass the tokenizer's attention mask along with the ids. For this single
    # unpadded prompt the mask marks every position, but forwarding it keeps the
    # cell correct if a padded or batched prompt is used instead.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=10,
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

['(C)']
