In [None]:
# Install the third-party packages imported further down: peft, datasets, evaluate.
!pip install peft
!pip install datasets
!pip install evaluate

In [None]:
from typing import Optional, Union
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel, BertForMultipleChoice
import evaluate


In [None]:
import os
# Set WANDB_DISABLED for this process before any training code runs.
# NOTE(review): presumably this turns off Weights & Biases reporting in the Trainer — confirm.
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Device selected once here and reused by the later cells that place the model on it.
device = get_default_device()

In [None]:
# Hugging Face checkpoint identifiers for the candidate backbones.
deberta = 'microsoft/deberta-v3-base'
longformer = 'allenai/longformer-base-4096'

# Alternative backbones (BERT, DeBERTa) kept for reference; only the
# Longformer model/tokenizer pair below is active.
# original_model = BertForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
# original_model = AutoModelForMultipleChoice.from_pretrained(deberta)
# tokenizer = AutoTokenizer.from_pretrained(deberta, model_max_length=512)
# Load the multiple-choice model and its tokenizer from the Longformer
# checkpoint; the tokenizer is given model_max_length=768.
original_model = AutoModelForMultipleChoice.from_pretrained(longformer)
tokenizer = AutoTokenizer.from_pretrained(longformer, model_max_length=768)

# Move the model to the selected device. Being the last expression of the
# cell, the value returned by .to() is also displayed as the cell output.
original_model.to(device)

In [None]:
# Two-way lookup between the answer letters 'A'..'E' and their class indices 0..4.
index_to_option = dict(enumerate('ABCDE'))
option_to_index = {letter: position for position, letter in index_to_option.items()}

def preprocess(examples):
    """Tokenize a batch of multiple-choice rows for ``Dataset.map(batched=True)``.

    Each row is expanded into five (context, question + option) text pairs,
    one per answer option 'A'..'E'. The pairs are tokenized together and then
    regrouped so that every tokenizer output column holds, per row, a list of
    five per-option encodings.

    Args:
        examples: mapping of column name to a list of values. Reads the
            'context', 'prompt', 'A', 'B', 'C', 'D', 'E' and 'answer' columns.

    Returns:
        dict with the tokenizer's output columns (five encodings per row) plus
        'labels', the class index of each row's answer letter.

    Raises:
        KeyError: if an 'answer' value is not one of the letters in
            ``option_to_index``.
    """
    choices = 'ABCDE'
    num_choices = len(choices)
    num_rows = len(examples['prompt'])

    # Flat, row-major lists: the context is repeated once per option so that
    # entry j of both lists belongs to row j // 5, option j % 5.
    first_sentences = [
        f"""### CONTEXT: {examples['context'][i]} """
        for i in range(num_rows)
        for _ in choices
    ]
    second_sentences = [
        f"""### QUESTION: {examples['prompt'][i]} ### OPTION: {examples[option][i]}"""
        for i in range(num_rows)
        for option in choices
    ]

    # Only truncate here. Padding is left to the data collator, which pads each
    # training batch to its own longest sequence; padding at this stage would
    # stretch every row to the longest sequence of the whole map batch. Plain
    # Python lists are returned (no tensors, no device transfer) because the
    # result is written back into the dataset.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Regroup the flat encodings into chunks of five, one chunk per row.
    tokenized_examples = {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }
    tokenized_examples['labels'] = [option_to_index[answer] for answer in examples['answer']]

    return tokenized_examples

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice features into padded ``(batch, choices, seq)`` tensors.

    Attributes:
        tokenizer: tokenizer whose ``pad`` method is used to pad the encodings.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_length: optional maximum length forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: optional multiple forwarded to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad and stack a list of per-example feature dicts.

        Args:
            features: list of dicts. Every non-label value holds one encoding
                per choice; the label is stored under 'label' or 'labels'.

        Returns:
            dict of tensors reshaped to ``(batch_size, num_choices, -1)`` plus
            an int64 'labels' tensor with one entry per example.
        """
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels without popping them so the caller's dicts are not
        # modified; the label key is skipped explicitly when flattening.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, example-major, so the padded
        # result can be viewed back as (batch_size, num_choices, seq_len).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
    



In [None]:
# Seed for the train/test split so that a re-run reproduces the same split.
SPLIT_SEED = 42

# Load the training CSV and discard rows that contain missing values.
train_df = pd.read_csv('/kaggle/input/llm-science-exam-dataset-w-context-extended/15k_gpt3.5-turbo.csv').dropna()

# The 'id' column is removed on the pandas side: `datasets.Dataset` has no
# `.drop()` method (its equivalent is `remove_columns`). `errors='ignore'`
# tolerates a CSV without that column, and `preserve_index=False` keeps the
# gapped post-dropna index from becoming an extra dataset column.
dataset = Dataset.from_pandas(train_df.drop(columns=['id'], errors='ignore'), preserve_index=False)
train_test_dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Tokenize both splits in batches of 500 rows and drop the raw text columns.
tokenized_dataset = train_test_dataset.map(preprocess, remove_columns=['context', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'], batched=True, batch_size=500)


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a three-line summary of a model's parameter counts.

    Despite its name, this function does not print anything; it returns the
    summary string and leaves printing to the caller.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()`` and
            ``requires_grad``.

    Returns:
        str with the trainable parameter count, the total parameter count and
        the trainable percentage formatted with two decimals. A model without
        parameters reports 0.00% rather than raising ZeroDivisionError.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a model with no parameters has nothing trainable.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {percentage:.2f}%"

# Report the parameter counts of the base model before it is wrapped with LoRA.
print(print_number_of_trainable_model_parameters(original_model))

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter configuration for the multiple-choice model.
lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=8,
    # Module-name suffixes that receive LoRA adapters. The commented line is
    # the alternative naming kept for the DeBERTa backbone.
    target_modules=["query", "value"],
#     target_modules=["query_proj", "value_proj"],
    lora_dropout=0.05,
    bias="none",
    # No task_type is set, so PEFT will not mark the task head as trainable
    # on its own. Listing it here keeps the head trainable and stores it in
    # the saved adapter; without this it would stay frozen and be left out.
    # NOTE(review): "classifier" is the head name used by the Hugging Face
    # multiple-choice models; if the chosen backbone reports other freshly
    # initialised modules when loading, add them here as well.
    modules_to_save=["classifier"],
)

In [None]:
# Wrap the base model with the LoRA configuration, then report the
# trainable/total parameter counts of the wrapped model.
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# Accuracy metric object, loaded once and shared by every evaluation call.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation ``(predictions, labels)`` pair.

    The predicted class of each example is the argmax of its scores along
    axis 1; the result is whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Training hyper-parameters: one example per device step, logging and
# evaluation once per epoch.
peft_training_args = TrainingArguments(
    per_device_train_batch_size=1,
#     gradient_accumulation_steps=2,
    learning_rate=1e-4,
    num_train_epochs=8,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    output_dir='.',
    label_names = ["labels"],
    # Request fp16 mixed precision only when CUDA is present, so the CPU
    # fallback chosen by get_default_device() stays usable.
    fp16=torch.cuda.is_available(),
#     gradient_checkpointing=True
)

# Trainer wiring: LoRA-wrapped model, multiple-choice collator (dynamic
# padding per batch), accuracy metric, and the tokenized train/test splits.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

In [None]:
'''
try longformer with 768 max length and all data
'''

In [None]:
# Run the training loop of the Trainer built in the earlier cell.
peft_trainer.train()

# Directory that receives both the trained model files and the tokenizer files.
peft_model_path="./mcq_model"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
import json
# Persist the Trainer's log history (peft_trainer.state.log_history) as JSON
# in training_log.json in the working directory.
with open('training_log.json', 'w') as fp:
    json.dump(peft_trainer.state.log_history, fp)

In [None]:
# from IPython.display import FileLink
# FileLink(r'output.zip')

