In [None]:
!pip install peft
!pip install datasets==2.15
!pip install faiss-gpu # Use faiss-gpu if on GPU machine (faster)


In [None]:
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset, Dataset, load_from_disk
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoModelForMultipleChoice, BertForMultipleChoice, TrainingArguments, Trainer, AutoModelForSequenceClassification
from transformers import pipeline

from typing import Optional, Union
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
import peft


In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Device used for the retriever and reader models below.
device = get_default_device()

In [None]:
# Retriever: tokenizer and encoder used to embed questions for the FAISS lookup.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
ret_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# .to() and .eval() both return the module itself, so the calls can be chained
# onto the load; the encoder ends up on `device` in inference mode.
ret_model = AutoModel.from_pretrained(model_ckpt).to(device).eval()

def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every item in the batch.

    `model_output` must expose a `last_hidden_state` attribute that supports
    `[:, 0]` indexing.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_ret_embeddings(text_list):
    """Embed a list of texts with the retriever model.

    Args:
        text_list: list of strings to embed.

    Returns:
        The position-0 hidden state of `ret_model` for each text (see
        `cls_pooling`), as a tensor on `device` with no autograd history.
    """
    encoded_input = ret_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call would build and retain an
    # autograd graph for the whole encoder.
    with torch.no_grad():
        model_output = ret_model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Load the saved retrieval dataset from disk. Later cells read its "text"
# column and search its "embeddings" column.
ret_embeddings_dataset = load_from_disk('/kaggle/input/retrieval-wiki-embeddings')

# Build a FAISS index over the "embeddings" column so that
# get_nearest_examples("embeddings", ...) can be used for nearest-neighbour lookup.
ret_embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# from IPython.display import FileLink
# FileLink(r'retrieval-embeddings/data-00001-of-00002.arrow')


In [None]:
# reader_model = BertForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
# reader_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

# Reader: multiple-choice model built from the base checkpoint, with the
# adapter stored at `peft_adapter` loaded on top of it.
deberta_v3 = 'microsoft/deberta-v3-base'
peft_adapter = '/kaggle/input/science-comp-trained-model'

# The tokenizer is read from the adapter directory (optionally: model_max_length=512).
reader_tokenizer = AutoTokenizer.from_pretrained(peft_adapter)

reader_model = AutoModelForMultipleChoice.from_pretrained(deberta_v3)
reader_model.load_adapter(peft_adapter)

# Move to the selected device and switch to inference mode.
reader_model.to(device).eval()


In [None]:
# Answer letter -> class index, and the inverse lookup.
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(examples):
    """Tokenize a batch of questions into five (context, question+option) pairs each.

    Args:
        examples: batch mapping with equally long columns 'context',
            'questions', 'A'..'E' and, optionally, 'answer' (a letter A-E).

    Returns:
        dict with one entry per tokenizer output; each entry is a list holding,
        per example, the five rows that belong to options A..E. When 'answer'
        is present a 'labels' list with the answer's class index is added.
    """
    num_examples = len(examples['context'])

    # Flat, example-major lists: five consecutive entries per example.
    first_sentences = [
        f"""### CONTEXT: {examples['context'][i]} """
        for i in range(num_examples)
        for _ in 'ABCDE'
    ]
    second_sentences = [
        f"""### QUESTION: {examples['questions'][i]} ### OPTION: {examples[option][i]}"""
        for i in range(num_examples)
        for option in 'ABCDE'
    ]

    # Kept on the CPU: the result is serialized by Dataset.map, so moving it to
    # the accelerator here would only add a device round trip.
    tokenized_examples = reader_tokenizer(first_sentences, second_sentences, padding=True, truncation=True, return_tensors="pt")
    # Regroup the flat rows into chunks of five, one chunk per example.
    tokenized_examples = {k: [v[i : i + 5] for i in range(0, len(v), 5)] for k, v in tokenized_examples.items()}

    # Labels are optional so the same function also works on unlabeled data.
    if 'answer' in examples:
        tokenized_examples['labels'] = [option_to_index[answer] for answer in examples['answer']]

    return tokenized_examples

@dataclass
class DataCollatorForMultipleChoice:
    """Pad multiple-choice features and batch them as (batch, num_choices, -1) tensors.

    Each feature is a mapping whose values (apart from the label) hold one
    entry per answer choice. The label is read from 'label' if present,
    otherwise from 'labels'.
    """
    # Tokenizer whose pad() method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to tokenizer.pad().
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate `features` into a dict of tensors plus an int64 'labels' tensor.

        The input feature dicts are left unmodified, so the same features can
        be collated more than once.
        """
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the label instead of popping it: popping would mutate the
        # caller's dicts and make a second call fail with KeyError.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One flat entry per (example, choice), example-major, label excluded.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
    

    
# NOTE(review): despite the name, this frame is read from train.csv. Later
# cells use its 'id', 'prompt', 'A'..'E' and 'answer' columns.
test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')


In [None]:
# Re-ranker: sequence-classification model that scores (question, passage) pairs.
rerank_ckpt = 'BAAI/bge-reranker-base'
rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_ckpt)
rerank_model = AutoModelForSequenceClassification.from_pretrained(rerank_ckpt)
# Inference mode only; the model is not moved to `device` here.
rerank_model.eval()

In [None]:
def get_best_context(question):
    """Return the retrieved passage that the re-ranker scores highest for `question`.

    The question is embedded with `get_ret_embeddings`, the 10 nearest entries
    are fetched from the FAISS index on `ret_embeddings_dataset`, every
    (question, passage) pair is scored by `rerank_model`, and the text of the
    top-scoring passage is returned.

    Args:
        question: the question text.

    Returns:
        The 'text' value of the best-scoring retrieved entry.
    """
    question_embedding = get_ret_embeddings([question]).cpu().detach().numpy()

    # The FAISS scores are discarded: candidates are re-scored below.
    _, samples = ret_embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=10)
    candidate_texts = samples['text']

    pairs = [(question, text) for text in candidate_texts]
    with torch.no_grad():
        inputs = rerank_tokenizer(pairs, padding=True,
                                  truncation=True,
                                  return_tensors='pt')#, max_length=512)
        # Follow the re-ranker to whichever device it currently lives on.
        inputs = inputs.to(rerank_model.device)
        rerank_scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    # A higher re-ranker logit means a more relevant passage, so pick the
    # maximum (an ascending sort followed by [0] would pick the least relevant).
    best_idx = int(torch.argmax(rerank_scores))
    return candidate_texts[best_idx]

    

In [None]:
# Answer the questions in mini-batches: retrieve one context per question,
# tokenize the five (context, question+option) pairs, run the reader and keep
# its three highest-scoring options as a space-separated string.
batchsize = 10
# Ceiling division, so a trailing partial batch is answered instead of dropped.
numbatches = (len(test_df) + batchsize - 1) // batchsize
predictions = []
option_letters = np.array(list('ABCDE'))

for j in range(numbatches):
    print(j+1)
    # Positional slice: does not depend on the labels of test_df's index.
    batch_df = test_df.iloc[j*batchsize: (j+1)*batchsize]
    best_contexts = [get_best_context(question) for question in batch_df['prompt']]

    df = (
        batch_df[['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']]
        .rename(columns={'prompt': 'questions'})
        .reset_index(drop=True)
        .assign(context=best_contexts)
    )

    # preserve_index=False keeps the pandas index out of the dataset columns,
    # so only tokenizer outputs and labels reach the model.
    tokenized_test = Dataset.from_pandas(df, preserve_index=False).map(
        preprocess,
        remove_columns=['questions', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
        batched=True,
    )
    tokenized_test = tokenized_test.with_format("torch")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = reader_model(**{feature: tokenized_test[feature].to(device) for feature in tokenized_test.features})
    pred_answers = outputs['logits']

    # Indices of the three largest logits per question, best first.
    predictions_as_ids = torch.argsort(-pred_answers, 1)[:,:3]
    predictions_as_options = option_letters[predictions_as_ids.cpu().numpy()]
    predictions_as_string =  [' '.join(row) for row in predictions_as_options]
    predictions += predictions_as_string

In [None]:
# Pair every prediction with the id and answer of the question it was made
# for. Rows are matched by position, and only as many rows as there are
# predictions are used.
answered_rows = test_df.iloc[:len(predictions)]
submission = {'id': answered_rows['id'].to_list(), 'prediction': predictions}
submissiondf = pd.DataFrame(submission)

# Kept next to the predictions so the file can be scored offline.
submissiondf['answer'] = answered_rows['answer'].to_list()
submissiondf.to_csv('/kaggle/working/submission.csv', index=False)

# pd.read_csv('/kaggle/working/submission.csv')
