In [1]:
import json
import json_lines
import numpy as np
import torch

from datasets import Dataset
from collections import defaultdict
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

In [2]:
def get_riddlesense_prompt(question, options):
    """Build the five-way multiple-choice prompt for a RiddleSense item.

    Args:
        question: The riddle text; inserted verbatim after ``Question: ``.
        options: An indexable collection whose items 0-4 become choices
            (A)-(E). Items beyond the fifth are ignored; fewer than five
            raises ``IndexError``.

    Returns:
        The prompt string. It begins with a newline and has no trailing
        newline after the (E) line.
    """
    header = (
        "\n"
        "Question: {}\n"
        "\n"
        "What is the correct answer to the question from the following choices?\n"
        "Options: \n"
    ).format(question)
    # Index explicitly (rather than zip) so a short option list still fails loudly.
    choice_lines = [
        "({}): {}".format(letter, options[position])
        for position, letter in enumerate("ABCDE")
    ]
    return header + "\n".join(choice_lines)

In [3]:
def load_og_data(file_path):
    """Read a JSON-lines file of riddles into column-oriented lists.

    Each record is read with ``json_lines.reader`` from the file opened in
    binary mode, and must provide ``record['question']['stem']``,
    ``record['question']['choices']`` (items carrying a ``'text'`` key) and
    ``record['answerKey']``.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        A ``defaultdict(list)`` with parallel lists under the keys
        ``'question'``, ``'options'`` (one list of choice texts per record)
        and ``'answer'``. An empty file yields a defaultdict with no keys.
    """
    with open(file_path, 'rb') as handle:
        records = list(json_lines.reader(handle))

    columns = defaultdict(list)
    for record in records:
        columns['question'].append(record['question']['stem'])
        choice_texts = [choice['text'] for choice in record['question']['choices']]
        columns['options'].append(choice_texts)
        columns['answer'].append(record['answerKey'])
    return columns

# train_data = load_og_data("data/rs_train.jsonl")
# valid_data = load_og_data("data/rs_dev.jsonl")

In [4]:
# Maps the integer label stored in the adversarial file to the answer letter
# used as the training target.
answer_map = {0:'A', 1:'B', 2:'C', 3:'D', 4:'E'}


def load_adversarial_data(file_path):
    """Read the adversarial JSON file into column-oriented lists.

    The file must contain a JSON array of objects with the keys
    ``'question'``, ``'choice_list'`` and ``'label'``.

    Args:
        file_path: Path to the ``.json`` file.

    Returns:
        A ``defaultdict(list)`` with parallel lists under ``'question'``,
        ``'options'`` (every choice converted with ``str``) and ``'answer'``
        (the label translated to a letter through ``answer_map``).

    Raises:
        KeyError: If a record lacks one of the expected keys or its label is
            not a key of ``answer_map``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    data = defaultdict(list)
    for item in raw_data:
        data['question'].append(item['question'])
        # Choices are not guaranteed to be strings in the file, so normalise them.
        data['options'].append([str(option) for option in item['choice_list']])
        data['answer'].append(answer_map[item['label']])
    return data

# Training examples come from the adversarial JSON file, while validation
# examples are read from the JSON-lines dev file with load_og_data.
train_data = load_adversarial_data("data/adversarial_rs_train.json")
valid_data = load_og_data("data/rs_dev.jsonl")

In [5]:
# Spot-check the example at index 1 of each split: question, options, answer,
# one per line.
print(train_data['question'][1], train_data['options'][1], train_data['answer'][1], sep='\n')

print(valid_data['question'][1], valid_data['options'][1], valid_data['answer'][1], sep='\n')

A man in prison has to move a heavy bag of sand across a large field as a punishment. What can he add to the bag to make it less heavy?
['throw', 'bit', 'gallon', 'mouse', 'hole']
E
I saw a strange creature long, hard, and straight, thrusting into a round, dark opening preparing to discharge its load of lives puffing and squealing noises accompanied it, then a final screech as it slowed and stopped
['eye', 'space', 'gas station', 'finalist', 'train']
E


In [6]:
def preprocess_function(sample):
    """Tokenize one example into model inputs plus loss-ready labels.

    Relies on the module-level ``tokenizer`` and on
    ``get_riddlesense_prompt`` to render the question and its options.

    Args:
        sample: Mapping with ``'question'``, ``'options'`` and ``'answer'``.

    Returns:
        The tokenizer output for the prompt (truncated/padded to 512 tokens,
        attention mask included) with an added ``'labels'`` entry: the answer
        tokenized to exactly 2 positions, where every pad token id is
        replaced by -100.
    """
    prompt = get_riddlesense_prompt(sample['question'], sample['options'])
    encoded = tokenizer(
        prompt,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )

    target_ids = tokenizer(
        sample['answer'],
        max_length=2,
        padding="max_length",
        truncation=True,
    )["input_ids"]
    pad_id = tokenizer.pad_token_id
    # Swap padding for the -100 sentinel used for label padding elsewhere in
    # this notebook.
    encoded["labels"] = [-100 if token_id == pad_id else token_id for token_id in target_ids]

    return encoded

In [7]:
# Load the seq2seq checkpoint and its matching tokenizer by hub name.
# device_map='auto' presumably lets the library pick device placement for the
# model weights — TODO confirm this interacts as intended with the Trainer.
# NOTE(review): device_map looks like a model-loading option; passing it to the
# tokenizer is presumably a no-op — confirm and consider dropping it there.
model_name = 'google/flan-t5-large'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='auto')

In [8]:
# Wrap the column dicts in Dataset objects and tokenize them one example at a
# time (batched=False, matching preprocess_function's single-sample signature).
# The raw text columns are removed so only the tokenized features remain.
train_dataset = Dataset.from_dict(train_data)
train_tokenized = train_dataset.map(preprocess_function, batched=False, remove_columns=['question', 'options', 'answer'])
print(train_tokenized)

# Same pipeline for the validation split.
valid_dataset = Dataset.from_dict(valid_data)
valid_tokenized = valid_dataset.map(preprocess_function, batched=False, remove_columns=['question', 'options', 'answer'])
print(valid_tokenized)

Map:   0%|          | 0/7020 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 7020
})


Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1021
})


In [9]:
# Label padding uses -100 (the same sentinel preprocess_function writes in
# place of pad tokens) so that padded label positions are ignored in the loss.
label_pad_token_id = -100

# Collator that batches the tokenized examples for the seq2seq model.
# pad_to_multiple_of=2 asks for padded lengths that are a multiple of 2.
# NOTE(review): inputs are already padded to a fixed length in
# preprocess_function, so the collator presumably has little padding left to
# do — confirm.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=2
)

In [10]:
def compute_metrics(p):
    """Compute accuracy over the label positions that carry a real token.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions[0]`` is taken as
            the logits and arg-maxed over axis 2; ``labels`` is iterated
            row by row alongside the arg-maxed predictions.

    Returns:
        ``{"accuracy": value}`` where ``value`` is ``accuracy_score`` applied
        to the per-example lists of kept labels and kept predictions.
    """
    raw_outputs, gold = p
    # https://discuss.huggingface.co/t/what-does-evalprediction-predictions-contain-exactly/1691/4
    token_ids = np.argmax(raw_outputs[0], axis=2)

    def _is_scored(label_id):
        # -100 marks padded label positions; 1 is presumably the tokenizer's
        # end-of-sequence id — TODO confirm.
        return not (label_id == -100 or label_id == 1)

    kept_predictions = []
    kept_labels = []
    for predicted_row, gold_row in zip(token_ids, gold):
        scored_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if _is_scored(gold_id)
        ]
        kept_predictions.append([predicted_id for predicted_id, _ in scored_pairs])
        kept_labels.append([gold_id for _, gold_id in scored_pairs])

    return {
        "accuracy": accuracy_score(y_true=kept_labels, y_pred=kept_predictions),
    }

In [11]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Checkpoints are stored under a directory named after the model without its
# hub organisation prefix ("google/flan-t5-large" -> "flan-t5-large").
# Splitting on "/" instead of slicing a fixed 7 characters keeps the name
# correct for models published under other organisations.
# NOTE(review): absolute, user-specific path — consider a configurable base dir.
output_dir = f"/usr1/data/devanshj/brainteaser/checkpoints/{model_name.split('/')[-1]}_adversarial_finetune"

# Define training args: evaluate and checkpoint once per epoch, log every 500
# steps, and push the result to the Hub.
# NOTE(review): load_best_model_at_end is set without metric_for_best_model, so
# the library's default selection metric applies (presumably the evaluation
# loss, not the accuracy reported by compute_metrics) — confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=3e-4, # higher learning rate
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    load_best_model_at_end=True,
    push_to_hub=True
)

# Create Trainer instance wiring together the model, the tokenized splits,
# the accuracy metric and the seq2seq collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [12]:
# Run fine-tuning with the configured Trainer, then upload the result to the
# Hugging Face Hub. This is the expensive cell of the notebook.
trainer.train()
trainer.push_to_hub()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3741,0.805608,0.612145
2,0.0691,1.173466,0.639569
3,0.013,1.273287,0.628795


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


'https://huggingface.co/devanshrj/flan-t5-large_adversarial_finetune/tree/main/'

In [25]:
def get_brainteaser_prompt(question, options):
    """Build the four-way multiple-choice prompt for a BrainTeaser item.

    Uses the same layout as the RiddleSense prompt but with choices (A)-(D).

    Args:
        question: The question text; inserted verbatim after ``Question: ``.
        options: An indexable collection whose items 0-3 become choices
            (A)-(D). Items beyond the fourth are ignored; fewer than four
            raises ``IndexError``.

    Returns:
        The prompt string. It begins with a newline and has no trailing
        newline after the (D) line.
    """
    header = (
        "\n"
        "Question: {}\n"
        "\n"
        "What is the correct answer to the question from the following choices?\n"
        "Options: \n"
    ).format(question)
    # Index explicitly (rather than zip) so a short option list still fails loudly.
    choice_lines = [
        "({}): {}".format(letter, options[position])
        for position, letter in enumerate("ABCD")
    ]
    return header + "\n".join(choice_lines)

In [26]:
# A single hand-written four-option example, rendered with the BrainTeaser
# prompt template so it can be fed to the model below.
question = "Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?"
options = ["Some daughters get married and have their own family.", "Each daughter shares the same brother.", "Some brothers were not loved by family and moved away.", "None of above."]
bt_prompt = get_brainteaser_prompt(question, options)

In [29]:
# Tokenize the single prompt into PyTorch tensors; no padding or truncation
# arguments are passed here.
inputs = tokenizer(bt_prompt, return_tensors="pt")

In [31]:
# Run the example prompt through the model and print the decoded answer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to evaluation mode explicitly so dropout is disabled for generation,
# whatever mode earlier cells left the model in.
model.eval()
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Forward the full encoding (input ids plus attention mask) instead of the
    # ids alone, so generation stays correct if inputs are ever padded.
    outputs = model.generate(**inputs, max_new_tokens=10)
    # Under no_grad the output carries no graph; plain id lists are enough
    # for decoding.
    print(tokenizer.batch_decode(outputs.tolist(), skip_special_tokens=True))

['B']
