## LLM Science Exam Model Training

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from typing import Optional, Union
from datasets import Dataset
from dataclasses import dataclass
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.trainer_callback import EarlyStoppingCallback, TrainerCallback, TrainerState, TrainerControl
from sklearn.model_selection import KFold

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Build the training frame: the public competition questions (minus their `id` column)
# followed by additional questions generated in a similar way.
train_df = pd.concat(
    [
        pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv').drop(columns="id"),
        pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv'),
    ],
    ignore_index=True,  # renumber the combined rows 0..n-1
)
train_df.head()

Unnamed: 0,prompt,A,B,C,D,E,answer
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D


In [None]:
# The path of the model checkpoint we want to use - using DeBERTa-v3-large because it is the
# largest model feasible to train on the available resources
deberta_v3_large = '/kaggle/input/deberta-v3-large-hf-weights'
# Tokenizer matching the checkpoint; used by `preprocess` and passed to the data collator
tokenizer = AutoTokenizer.from_pretrained(deberta_v3_large)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Lookup tables between answer letters (A, B, C, D, E) and their integer class indices
options = 'ABCDE'
indices = list(range(5))

# letter -> index, e.g. 'C' -> 2
option_to_index = dict(zip(options, indices))
# index -> letter, e.g. 2 -> 'C'
index_to_option = dict(zip(indices, options))

def preprocess(example):
    """Tokenize one multiple-choice question as (prompt, option) pairs.

    Args:
        example: Mapping with a 'prompt' entry, one answer text under each letter
            in `options`, and the correct letter under 'answer'.

    Returns:
        The tokenizer output for the prompt paired with every option (one sequence
        per option), with an added 'label' entry holding the integer index of the
        correct option.
    """
    # AutoModelForMultipleChoice expects a set of question/answer pairs, so the
    # question is repeated once per option (derived from `options`, not hard-coded).
    first_sentence = [example['prompt']] * len(options)
    second_sentence = [example[option] for option in options]
    # Tokenizer will turn our text into token IDs DeBERTa can understand.
    # NOTE(review): truncation=True is passed without max_length; the run log shows the
    # tokenizer then defaults to no truncation. Pass max_length if truncation is wanted.
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

In [5]:
# The following data collator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# dynamically pads our questions at batch time, so we don't have to pad every question to the length
# of our longest question.


@dataclass
class DataCollatorForMultipleChoice:
    """Collate tokenized multiple-choice examples into one padded batch.

    Each incoming feature holds, per key, one sequence per answer choice plus an
    integer label. The choices of all examples are flattened, padded together with
    `tokenizer.pad`, and reshaped to (batch_size, num_choices, -1).
    """
    # Object whose `pad` method pads the flattened per-choice sequences
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad and stack a list of features.

        Args:
            features: Non-empty list of mappings. Each has a 'label' (or 'labels')
                entry and, for every other key, a list with one item per choice.

        Returns:
            Dict of tensors shaped (batch_size, num_choices, -1) for every padded
            key, plus 'labels' as an int64 tensor with one entry per feature.
        """
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        # Read the labels instead of popping them so the caller's feature dicts are
        # left intact and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        # One flat list of per-choice dicts (example-major order), built in a single
        # pass rather than by repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Un-flatten: regroup the padded rows by example
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
    
# We don't need the optimizer.pt so we delete it regularly not to run out of space
class RemoveOptimizerCallback(TrainerCallback):
    """Trainer callback that deletes `optimizer.pt` from the best checkpoint after each save."""

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called after a checkpoint save.

        Removes `optimizer.pt` from the directory named by
        `state.best_model_checkpoint`, if that file exists.
        """
        best_checkpoint = state.best_model_checkpoint
        # No best checkpoint recorded yet: nothing to clean up. Without this guard
        # the path concatenation below would raise TypeError on None.
        if best_checkpoint is None:
            return
        optimizer_path = os.path.join(best_checkpoint, 'optimizer.pt')
        if os.path.exists(optimizer_path):
            os.remove(optimizer_path)

In [6]:
# Now we'll instantiate the model that we'll fine-tune on our public dataset, then use to
# make predictions on the private dataset.
# NOTE(review): the cross-validation cell builds a fresh model for every fold and the prediction
# cell reloads the saved best model, so this instance looks unused - confirm before removing.
model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

In [7]:
# Root directory for training outputs; the per-fold checkpoints and the best model are written beneath it
trained_model_dir_path = '/kaggle/working'

# Calculates mean average precision for ranked predictions (MAP@3 when three guesses per row are given)
def map3(y_true, y_pred):
    """Mean reciprocal rank of the true label within each row of ranked predictions.

    Args:
        y_true: 1-D array of true class indices, one per example.
        y_pred: 2-D array; row i lists the predicted class indices for example i,
            best guess first.

    Returns:
        The mean over rows of 1/rank of the first column that equals the true
        label, counting 0 for rows in which the label does not appear.
    """
    hits = y_true.reshape((-1, 1)) == y_pred
    # 1-based position of the first hit in each row
    first_hit_rank = hits.argmax(axis=1) + 1
    # Rows without any hit get an infinite rank, whose reciprocal is 0
    rank = np.where(hits.any(axis=1), first_hit_rank, np.inf)
    return np.mean(rank ** (-1))

def compute_metrics(eval_pred):
    """Compute the `map3` metric for one evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`; `predictions` is a 2-D array of
            per-option scores (one row per example) and `labels` holds the true
            option indices.

    Returns:
        Dict with a single 'map3' entry.
    """
    scores, labels = eval_pred
    # Order the options of each row from highest to lowest score, keep the best three
    ranked = np.argsort(-scores, axis=1)
    top_three = ranked[:, :3]
    return dict(map3=map3(labels, top_three))

In [8]:
# Columns handed to `preprocess`; they are dropped again once each example is tokenized
FEATURE_COLUMNS = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
# Fixed seed so the fold assignment (and therefore the selected model) is reproducible
FOLD_SEED = 42

best_score = -float('inf')
# Directory that receives the model of the best-scoring fold; read again by the prediction cell
best_model_dir = trained_model_dir_path + '/best'

# Use KFold cross validation to fine tune deBERTa model
kf = KFold(n_splits=4, shuffle=True, random_state=FOLD_SEED)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train_df)):

    # kf.split yields positional indices, so rows are selected by position (iloc)
    train_set = train_df.iloc[idx_tr][FEATURE_COLUMNS]
    valid_set = train_df.iloc[idx_va][FEATURE_COLUMNS]

    # preserve_index=False keeps the pandas index from becoming an extra dataset column
    train_set = Dataset.from_pandas(train_set, preserve_index=False)
    tokenized_train = train_set.map(preprocess, remove_columns=FEATURE_COLUMNS)
    valid_set = Dataset.from_pandas(valid_set, preserve_index=False)
    tokenized_valid = valid_set.map(preprocess, remove_columns=FEATURE_COLUMNS)

    output_dir = trained_model_dir_path + f'/fold_{fold}'

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        load_best_model_at_end=True,
        save_total_limit=1,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        logging_steps=5,
        # NOTE(review): a warmup_ratio of 0.8 spends most of training in learning-rate
        # warmup - confirm this is intended.
        warmup_ratio=0.8,
        learning_rate=5e-6,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=2,
        num_train_epochs=5,
        report_to='none',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        metric_for_best_model='map3',
    )

    # Start every fold from the pretrained checkpoint so folds do not leak into each other
    model = AutoModelForMultipleChoice.from_pretrained(deberta_v3_large)

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=6), RemoveOptimizerCallback()],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score the fold's model on its held-out split
    valid_pred = trainer.predict(tokenized_valid)

    best_metric = valid_pred.metrics['test_map3']

    # Keep only the model of the best-scoring fold seen so far
    if best_metric > best_score:
        best_score = best_metric
        print(f'new best score {best_score:.2f}')
        print('saving...')
        os.makedirs(best_model_dir, exist_ok=True)
        trainer.save_model(best_model_dir)

print(f'best score {best_score:.4f}')

  0%|          | 0/525 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/175 [00:00<?, ?ex/s]

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

Step,Training Loss,Validation Loss,Map3
50,1.6525,1.606189,0.401905
100,1.574,1.606985,0.412381
150,1.632,1.606717,0.412381
200,1.5994,1.608763,0.410476
250,1.5807,1.608051,0.42
300,1.6359,1.60734,0.421905
350,1.6064,1.607672,0.45619
400,1.5522,1.606102,0.462857
450,1.5558,1.607211,0.48381
500,1.6191,1.60802,0.490476


new best score 0.69
saving...


  0%|          | 0/525 [00:00<?, ?ex/s]

  0%|          | 0/175 [00:00<?, ?ex/s]

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

Step,Training Loss,Validation Loss,Map3
50,1.6776,1.608963,0.48
100,1.6557,1.608894,0.484762
150,1.6042,1.608877,0.491429
200,1.6102,1.608816,0.506667
250,1.6057,1.608667,0.511429
300,1.5881,1.608431,0.52381
350,1.6315,1.608855,0.518095
400,1.6383,1.609942,0.53619
450,1.5912,1.607604,0.537143
500,1.617,1.607857,0.549524


new best score 0.69
saving...


  0%|          | 0/525 [00:00<?, ?ex/s]

  0%|          | 0/175 [00:00<?, ?ex/s]

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

Step,Training Loss,Validation Loss,Map3
50,1.5913,1.611462,0.251429
100,1.6486,1.611328,0.259048
150,1.562,1.611225,0.259048
200,1.5437,1.611021,0.272381
250,1.6117,1.610558,0.291429
300,1.6384,1.610353,0.295238
350,1.5787,1.610164,0.3
400,1.5953,1.60982,0.341905
450,1.5878,1.609524,0.355238
500,1.6307,1.609608,0.348571


  0%|          | 0/525 [00:00<?, ?ex/s]

  0%|          | 0/175 [00:00<?, ?ex/s]

Some weights of the model checkpoint at /kaggle/input/deberta-v3-large-hf-weights were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFor

Step,Training Loss,Validation Loss,Map3
50,1.6158,1.611034,0.289524
100,1.6677,1.611008,0.288571
150,1.6143,1.610923,0.295238
200,1.5499,1.610625,0.310476
250,1.5896,1.610849,0.324762
300,1.5599,1.610081,0.353333
350,1.7408,1.60994,0.347619
400,1.5372,1.610448,0.364762
450,1.6501,1.612765,0.36381
500,1.487,1.603107,0.467619


best score 0.6933


In [9]:
# Make predictions on our questions
tokenizer = AutoTokenizer.from_pretrained(best_model_dir)

test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
# Dummy answer that lets us preprocess the test dataset with the same function used for the train set
test_df['answer'] = 'A'

tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=['id'])).map(
    preprocess,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
)
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
# shuffle=False keeps the predictions in the same order as the rows of test_df
test_dataloader = DataLoader(tokenized_test_dataset, 10, shuffle=False, collate_fn=data_collator)

# Use the GPU when one is present; otherwise fall back to the CPU instead of failing on .cuda()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForMultipleChoice.from_pretrained(best_model_dir).to(device)

model.eval()

test_predictions = []
for batch in test_dataloader:
    # Move every tensor of the batch onto the model's device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    test_predictions.append(outputs.logits.detach().cpu())

# One row of per-option logits per test question
test_predictions = torch.cat(test_predictions)

  0%|          | 0/200 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
def predictions_to_map_output(predictions, k=3):
    """Turn per-option scores into space-separated top-k answer letters.

    Args:
        predictions: 2-D array-like of scores, one row per question and one
            column per option (anything `np.asarray` accepts).
        k: Number of highest-scoring options to keep per row. Defaults to 3.

    Returns:
        1-D NumPy string array with one entry per row, e.g. 'D B E', listing the
        letters of the k highest-scoring options, best first. An empty input
        yields an empty array.
    """
    scores = np.asarray(predictions)
    # Negate so that argsort's ascending order ranks the highest score first
    top_answer_indices = np.argsort(-scores, axis=1)[:, :k]
    return np.array(
        [' '.join(index_to_option[index] for index in row) for row in top_answer_indices],
        dtype=str,
    )
    

In [11]:
# Output predictions: one space-separated string of the three top-ranked answer letters per test question
predictions_to_map_output(test_predictions)

array(['D B E', 'A D C', 'A C E', 'C A D', 'D A E', 'B C A', 'A C B',
       'D B E', 'A C B', 'A E B', 'E B A', 'A B C', 'C E A', 'E D A',
       'B D C', 'B C E', 'E B D', 'E B C', 'A D C', 'E D A', 'D C E',
       'D C E', 'C E B', 'C A D', 'D E A', 'E D A', 'C E A', 'D C B',
       'E C A', 'C B A', 'B D E', 'E D C', 'E B D', 'D B E', 'C B E',
       'D E B', 'E A C', 'A D C', 'E D C', 'E A B', 'E A D', 'D C E',
       'C D A', 'C D E', 'D E C', 'A B D', 'B C E', 'C D B', 'E D A',
       'B D A', 'B E A', 'C E A', 'C A B', 'A D C', 'B A D', 'B E D',
       'C E D', 'C B A', 'D E A', 'A B E', 'B C A', 'D B E', 'C A D',
       'C E A', 'A D B', 'E D A', 'C A D', 'E C B', 'B E D', 'D E A',
       'C B A', 'E A D', 'D A C', 'B C A', 'D C A', 'B C E', 'A D C',
       'B A C', 'C B E', 'E C D', 'C E A', 'A D C', 'B D C', 'A D C',
       'C E B', 'D C E', 'D A B', 'A B C', 'E C A', 'D E B', 'A D B',
       'B A D', 'B E C', 'E D B', 'E B C', 'C A B', 'C B D', 'B D C',
       'B D C', 'C D