In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Loading the Data

In [2]:
# Load the training puzzles. allow_pickle=True makes NumPy unpickle Python
# objects, which can execute arbitrary code — only load files you trust.
sentence_puzzles = np.load('data/SP-train.npy', allow_pickle=True)
# NOTE(review): word_puzzles is not referenced again in the visible cells.
word_puzzles = np.load('data/WP-train.npy', allow_pickle=True)

In [3]:
# Show the first sentence puzzle to inspect the record layout.
sentence_puzzles[0]

{'id': 'SP-0',
 'question': 'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?',
 'answer': 'Each daughter shares the same brother.',
 'distractor1': 'Some daughters get married and have their own family.',
 'distractor2': 'Some brothers were not loved by family and moved away.',
 'distractor(unsure)': 'None of above.',
 'label': 1,
 'choice_list': ['Some daughters get married and have their own family.',
  'Each daughter shares the same brother.',
  'Some brothers were not loved by family and moved away.',
  'None of above.'],
 'choice_order': [1, 0, 2, 3]}

In [4]:
# Copy every puzzle and expose the four entries of its 'choice_list' as flat
# 'choice1'..'choice4' fields, keeping all original keys.
flattened_puzzles = []
for puzzle in sentence_puzzles:
    options = puzzle['choice_list']
    record = dict(puzzle)
    record['choice1'] = options[0]
    record['choice2'] = options[1]
    record['choice3'] = options[2]
    record['choice4'] = options[3]
    flattened_puzzles.append(record)

sentence_puzzles_modified = np.array(flattened_puzzles)

sentence_puzzles_modified[0]

{'id': 'SP-0',
 'question': 'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?',
 'answer': 'Each daughter shares the same brother.',
 'distractor1': 'Some daughters get married and have their own family.',
 'distractor2': 'Some brothers were not loved by family and moved away.',
 'distractor(unsure)': 'None of above.',
 'label': 1,
 'choice_list': ['Some daughters get married and have their own family.',
  'Each daughter shares the same brother.',
  'Some brothers were not loved by family and moved away.',
  'None of above.'],
 'choice_order': [1, 0, 2, 3],
 'choice1': 'Some daughters get married and have their own family.',
 'choice2': 'Each daughter shares the same brother.',
 'choice3': 'Some brothers were not loved by family and moved away.',
 'choice4': 'None of above.'}

In [5]:
choices = ['choice1', 'choice2', 'choice3', 'choice4']
attributes = ['question', 'label'] + choices

# Build one column per attribute in a single pass: the label column is cast
# to int, every text column (question and choices) to str.
sentence_puzzles_dict = {}
for attribute in attributes:
    cast = int if attribute == 'label' else str
    sentence_puzzles_dict[attribute] = [
        cast(puzzle[attribute]) for puzzle in sentence_puzzles_modified
    ]

sentence_puzzles_dataset = Dataset.from_dict(sentence_puzzles_dict)

In [6]:
# Fixed seed so the 80/10/10 split is identical on every run. Without it, each
# execution of this cell reshuffles the data, and the 'test' split inspected
# later may overlap with examples the saved checkpoint was trained on.
SPLIT_SEED = 42

# First carve off 20% of the data, then halve that part into validation/test.
ds_train_devtest = sentence_puzzles_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
ds_devtest = ds_train_devtest['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

sentence_puzzles_datasets = DatasetDict({
    'train': ds_train_devtest['train'],
    'validation': ds_devtest['train'],
    'test': ds_devtest['test']
})

sentence_puzzles_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'label', 'choice1', 'choice2', 'choice3', 'choice4'],
        num_rows: 405
    })
    validation: Dataset({
        features: ['question', 'label', 'choice1', 'choice2', 'choice3', 'choice4'],
        num_rows: 51
    })
    test: Dataset({
        features: ['question', 'label', 'choice1', 'choice2', 'choice3', 'choice4'],
        num_rows: 51
    })
})

### Preprocessing

In [88]:
# Tokenizer matching the base checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')



In [89]:
def preprocess_function(examples):
    """Tokenize a batch of puzzles as (question, choice) pairs, grouped per puzzle.

    Args:
        examples: batch mapping with a 'question' column plus one column for
            each name in the module-level ``choices`` list; all columns are
            expected to have the same length.

    Returns:
        dict mapping every tokenizer output field to a list with one entry per
        puzzle; each entry is the list of per-choice encodings of that puzzle.
    """
    num_choices = len(choices)
    num_examples = len(examples['question'])

    # Build the flat, puzzle-major lists directly (each question repeated once
    # per choice, aligned with that puzzle's choices) instead of flattening
    # nested lists with sum(..., []), which is quadratic in the batch size.
    questions = [question for question in examples['question'] for _ in range(num_choices)]
    answers = [examples[choice][i] for i in range(num_examples) for choice in choices]

    # NOTE: truncation=True is given without max_length; the map output in this
    # notebook shows the tokenizer then defaults to no truncation.
    tokenized_examples = tokenizer(questions, answers, truncation=True)

    # Regroup the flat encodings into consecutive chunks of num_choices.
    return {
        key: [value[i:i + num_choices] for i in range(0, len(value), num_choices)]
        for key, value in tokenized_examples.items()
    }

In [102]:
# Sanity check: preprocess one training example (the 5:6 slice keeps the
# batched column layout) and decode the token ids of its first choice.
examples = sentence_puzzles_datasets['train'][5:6]
features = preprocess_function(examples)
tokenizer.decode(features["input_ids"][0][0])

'[CLS] Tom is a clean freak but he never dries his hair after a shower. How is this possible?[SEP] His hair is dyed.[SEP]'

In [9]:
# Apply preprocess_function batch-wise to the train, validation and test splits.
encoded_datasets = sentence_puzzles_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/405 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 405/405 [00:00<00:00, 2084.11 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 2033.50 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 1896.97 examples/s]


In [10]:
# Restrict the encoded splits to the listed columns, formatted as torch
# tensors. This call modifies encoded_datasets in place.
encoded_datasets.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

### Fine-tuning

In [11]:
# Load DeBERTa-v3-base with a multiple-choice head. The load message printed by
# this cell reports the classifier and pooler weights as newly initialized.
model = AutoModelForMultipleChoice.from_pretrained('microsoft/deberta-v3-base')

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning configuration: checkpoints go to the output directory, and the
# validation split is evaluated once per epoch.
args = TrainingArguments(
    output_dir='deberta-v3-base-finetuned-brainteaser',
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
)

In [13]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and batch them as (batch, num_choices, -1) tensors.

    Each feature is a mapping holding a label under "label" or "labels" plus
    per-choice sequences (at least "input_ids"). Calling the collator returns a
    dict with every padded field reshaped to (batch_size, num_choices, -1) and
    a "labels" int64 tensor of length batch_size.

    The input feature dicts are not modified.
    """

    # Object whose .pad(...) method pads the flattened per-choice encodings.
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to tokenizer.pad.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of pop()-ing them, so the caller's features
        # stay intact and the same list can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, example-major, built in a
        # single pass rather than via the quadratic sum(nested, []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the per-example grouping of the padded choices.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [14]:
def compute_metrics(eval_predictions):
    """Return the accuracy of the arg-max prediction over axis 1.

    Args:
        eval_predictions: pair ``(predictions, label_ids)``; the highest-scoring
            index along axis 1 of ``predictions`` is compared with ``label_ids``.

    Returns:
        ``{"accuracy": value}`` where value is a Python float, computed as the
        float32 mean of the element-wise matches.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": float(np.mean(hits))}

In [15]:
# Assemble the trainer. The tokenizer is passed as `processing_class`, the
# replacement for the deprecated `tokenizer=` argument, which makes this cell
# emit a warning in recent transformers releases.
# NOTE(review): requires a transformers version that accepts `processing_class`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    processing_class=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
# Run fine-tuning. With eval_strategy='epoch', the validation split is
# evaluated at the end of every epoch. The returned TrainOutput is displayed.
trainer.train()

  0%|          | 0/78 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
 33%|███▎      | 26/78 [40:51<55:34, 64.12s/it]

{'eval_loss': 0.6741086840629578, 'eval_accuracy': 0.7647058963775635, 'eval_runtime': 55.6678, 'eval_samples_per_second': 0.916, 'eval_steps_per_second': 0.072, 'epoch': 1.0}


                                                  
 67%|██████▋   | 52/78 [1:22:21<35:57, 82.99s/it]

{'eval_loss': 0.35971760749816895, 'eval_accuracy': 0.8627451062202454, 'eval_runtime': 63.7275, 'eval_samples_per_second': 0.8, 'eval_steps_per_second': 0.063, 'epoch': 2.0}


                                                  
100%|██████████| 78/78 [2:05:23<00:00, 96.45s/it]

{'eval_loss': 0.32489627599716187, 'eval_accuracy': 0.8627451062202454, 'eval_runtime': 55.5344, 'eval_samples_per_second': 0.918, 'eval_steps_per_second': 0.072, 'epoch': 3.0}
{'train_runtime': 7523.0457, 'train_samples_per_second': 0.162, 'train_steps_per_second': 0.01, 'train_loss': 0.5990286117944962, 'epoch': 3.0}





TrainOutput(global_step=78, training_loss=0.5990286117944962, metrics={'train_runtime': 7523.0457, 'train_samples_per_second': 0.162, 'train_steps_per_second': 0.01, 'total_flos': 293591400283656.0, 'train_loss': 0.5990286117944962, 'epoch': 3.0})

### Inference

In [104]:
# Reload tokenizer and model from the step-78 checkpoint directory under the
# training output dir (78 is the final global step reported by training).
# This rebinds the `tokenizer` and `model` names used above.
tokenizer = AutoTokenizer.from_pretrained('deberta-v3-base-finetuned-brainteaser/checkpoint-78')
model = AutoModelForMultipleChoice.from_pretrained('deberta-v3-base-finetuned-brainteaser/checkpoint-78')

In [76]:
idx = 19

# Fetch the held-out example once and unpack the fields used by the next cell.
example = sentence_puzzles_datasets['test'][idx]
question = example['question']
choice1 = example['choice1']
choice2 = example['choice2']
choice3 = example['choice3']
choice4 = example['choice4']

# Print the question, the options numbered from 0, and the gold label.
print(question)
for position, option in enumerate((choice1, choice2, choice3, choice4)):
    print(f'{position}:', option)
print('Correct:', example['label'])

Two men are found dead in their house, however there is no sign of one of them killing the other one, neither the sign of any other intruders. How's that possible?
0: They were frightened to death by the sound of thunder and lightning.
1: They're twins, and their genes let them die together.
2: They both comitted suiside.
3: None of above.
Correct: 2


In [77]:
# Encode the question paired with each candidate answer. Padding brings the
# four sequences to a common length so they are returned as one tensor per field.
inputs = tokenizer([[question, choice1], [question, choice2], [question, choice3], [question, choice4]], return_tensors='pt', padding=True)

# Inference only: switch to eval mode and skip autograd bookkeeping. No labels
# are passed — the previous dummy label only produced an unused loss value.
model.eval()
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension: (1, num_choices, seq_len).
    outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})
logits = outputs.logits

# Index of the highest-scoring choice.
predicted_class = logits.argmax().item()
print('Prediction:', predicted_class)

Prediction: 3


### CodaLab Competition Answers Generation

In [79]:
# Load the unlabeled sentence-puzzle test set. allow_pickle=True unpickles
# Python objects, so only load trusted files. Then show its first record.
test_data = np.load('data/SP_new_test.npy', allow_pickle=True)
test_data[0]

{'question': 'In a small village, two farmers are working in their fields - a diligent farmer and a lazy farmer. The hardworking farmer is the son of the lazy farmer, but the lazy farmer is not the father of the hardworking farmer. Can you explain this unusual relationship?',
 'choice_list': ['The lazy farmer is his mother.',
  'The lazy farmer is not a responsible father as he is lazy.',
  'The diligent farmer devoted himself to the farm and gradually forgot his father.',
  'None of above.']}

In [None]:
# Write one predicted choice index per line for the sentence-puzzle test set.
# Inference only: eval mode and no autograd graph. No labels are passed — the
# previous dummy label only produced an unused loss on every iteration.
# NOTE: the 'test-results' directory must already exist.
model.eval()
with open('test-results/answer_sen.txt', 'w') as f, torch.no_grad():
    for puzzle in tqdm(test_data):
        question = puzzle['question']
        # Pair the question with every option in choice_list, instead of four
        # hard-coded indices.
        pairs = [[question, choice] for choice in puzzle['choice_list']]

        inputs = tokenizer(pairs, return_tensors='pt', padding=True)
        # unsqueeze(0) adds the batch dimension: (1, num_choices, seq_len).
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

        predicted_class = outputs.logits.argmax().item()
        f.write(str(predicted_class) + '\n')


100%|██████████| 120/120 [01:50<00:00,  1.08it/s]


In [103]:
# Load the unlabeled word-puzzle test set (allow_pickle=True: trusted files only).
# NOTE(review): this rebinds `test_data`, replacing the sentence-puzzle test set
# loaded earlier. The visible training cells fine-tune on sentence puzzles only
# — confirm that applying the same checkpoint to word puzzles is intended.
test_data = np.load('data/WP_new_test.npy', allow_pickle=True)
test_data[0]

{'question': "What kind of stock doesn't have shares?",
 'choice_list': ['Small-cap stock.',
  'Livestock.',
  'Growth stock.',
  'None of above.']}

In [105]:
# Write one predicted choice index per line for the word-puzzle test set.
# Inference only: eval mode and no autograd graph. No labels are passed — the
# previous dummy label only produced an unused loss on every iteration.
# NOTE: the 'test-results' directory must already exist.
model.eval()
with open('test-results/answer_word.txt', 'w') as f, torch.no_grad():
    for puzzle in tqdm(test_data):
        question = puzzle['question']
        # Pair the question with every option in choice_list, instead of four
        # hard-coded indices.
        pairs = [[question, choice] for choice in puzzle['choice_list']]

        inputs = tokenizer(pairs, return_tensors='pt', padding=True)
        # unsqueeze(0) adds the batch dimension: (1, num_choices, seq_len).
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

        predicted_class = outputs.logits.argmax().item()
        f.write(str(predicted_class) + '\n')


100%|██████████| 96/96 [00:54<00:00,  1.78it/s]
