In [1]:
import sys
sys.version

'3.8.17 (default, Jul  5 2023, 21:04:15) \n[GCC 11.2.0]'

In [40]:
from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from datasets import Dataset # HuggingFace
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel
from transformers import pipeline

deberta_v3_large = 'microsoft/deberta-v3-large'

In [41]:
# !conda install evaluate -y

In [4]:
# !pip install evaluate

In [5]:
df_test = pd.read_csv('data/train.csv')
df_test = df_test.drop(columns="id")
df_test.shape

(200, 7)

In [6]:
df_train = pd.concat([pd.read_csv('data/osmulski_6000.csv'),
                      pd.read_csv('data/osmulski_extra_train.csv')])
df_train.shape

(6500, 7)

In [7]:
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
index_to_option = {v: k for k,v in option_to_index.items()}

def preprocess(example):
    first_sentence = [example['prompt']] * 5
    second_sentences = [example[option] for option in 'ABCDE']
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    
    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [8]:
deberta_v3_large = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

dataset = Dataset.from_pandas(df_train)
dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer', '__index_level_0__'],
    num_rows: 6500
})

In [9]:
tokenized_dataset = dataset.map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_dataset

Map:   0%|          | 0/6500 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 6500
})

## Training

In [23]:
retrain = False

accuracy = evaluate.load("accuracy")

output_path = Path('./checkpoints')
training_args = TrainingArguments(
    warmup_ratio=0.8,
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_steps=5000,
    report_to='none',
    output_dir=str(output_path)
)

if not output_path.exists() or retrain:
    model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        train_dataset=tokenized_dataset
    )
    trainer.train()
else:
    model = AutoModelForMultipleChoice.from_pretrained(output_path/'checkpoint-19500')

## Predicting on the Testset

In [21]:
tokenized_test_dataset = Dataset.from_pandas(df_test).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E'])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [35]:
clf = pipeline("question-answering", model, tokenizer=tokenizer)

The model 'DebertaV2ForMultipleChoice' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswering', 'LongformerForQuestionAnswerin

In [25]:
test_predictions = model.predict(tokenized_test_dataset).predictions
test_predictions[:4]

AttributeError: 'Dataset' object has no attribute 'view'

In [None]:
predictions_as_ids = np.argsort(-test_predictions, 1)
predictions_as_ids[:3]

In [28]:
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
predictions_as_answer_letters[:3]

NameError: name 'predictions_as_ids' is not defined

In [19]:
predictions_as_string = df_test['prediction'] = [
    ' '.join(row) for row in predictions_as_answer_letters[:, :3]
]
predictions_as_string[:3]

['D B C', 'A E B', 'A C E']