In [None]:
# Notebook dependencies, installed one after another via pip.
# Only `datasets` is version-pinned here; `peft` and `faiss-gpu` resolve to
# whatever version pip selects when the cell runs.
!pip install peft
!pip install datasets==2.15
!pip install faiss-gpu # Use faiss-gpu if on GPU machine (faster)


In [None]:
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset, Dataset, load_from_disk
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoModelForMultipleChoice, BertForMultipleChoice, TrainingArguments, Trainer, AutoModelForSequenceClassification


In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    backend = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(backend)

# Resolved once; the retrieval model and its tokenized inputs are moved onto it.
device = get_default_device()

In [None]:
# Load the dataset.
# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable. Sorting the shard paths keeps the row order of `ds` (and of anything
# derived from it) the same from run to run.
wiki_dir = Path("/kaggle/input/wiki-20220301-en-sci")
files = sorted(str(shard) for shard in wiki_dir.glob("*.parquet"))
if not files:
    # Fail here with a clear message instead of deep inside load_dataset.
    raise FileNotFoundError(f"No .parquet files found in {wiki_dir}")
ds = load_dataset("parquet", data_files=files, split="train")

In [None]:
def clean(x):
    """Strip trailing boilerplate sections from an article's text.

    The string is cut at the first occurrence of each marker below, applied
    in the listed order; only the text before the marker is kept. A string
    that contains none of the markers is returned unchanged.
    """
    section_markers = (
        'References\n',
        'References \n',
        'See also\n',
        'See also \n',
        'External links',
    )
    for marker in section_markers:
        # partition()[0] is the text before the first hit, or everything on a miss.
        x, _, _ = x.partition(marker)
    return x



In [None]:
# Rebuild the corpus with only the title and the cleaned article text.
clean_dataset = Dataset.from_dict(
    dict(
        title=ds['title'],
        text=list(map(clean, ds['text'])),
    )
)

In [None]:
# Retrieval encoder: tokenizer and model are loaded from the same checkpoint.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
ret_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
ret_model = AutoModel.from_pretrained(model_ckpt)
# Move the model onto the selected device, then switch it to evaluation mode.
ret_model.to(device)
ret_model.eval()

# Getting the final embedding from the model
def cls_pooling(model_output):
    """Select index 0 along the second axis of ``model_output.last_hidden_state``.

    Presumably the layout is (batch, sequence, hidden), making this the first
    token of each sequence -- confirm against the model in use.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_ret_embeddings(text_list):
    """Embed text with the retrieval model and return the pooled vectors.

    Args:
        text_list: Text (or list of texts) handed straight to ``ret_tokenizer``,
            which pads, truncates and returns PyTorch tensors.

    Returns:
        The tensor produced by ``cls_pooling`` on the model output. The forward
        pass runs without gradient tracking, so the result has no autograd graph.
    """
    encoded_input = ret_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # The tokenizer output must live on the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call would build and hold an
    # autograd graph for embeddings that are never backpropagated through.
    with torch.no_grad():
        model_output = ret_model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Unused: experimental code that splits each article into roughly 450-token chunks before computing embeddings

# chunked_text = []
# for i in range(100):#len(clean_dataset['text'])):
# #     if(i%100==0):print(i)
#     print(i)    
    
#     encoded_input = ret_tokenizer(
#         clean_dataset['text'][i], padding=True, truncation=False, return_tensors="pt"
#     )
#     num_tokens = len(encoded_input['input_ids'][0])
    
#     chunksize = int(450/num_tokens * len(clean_dataset['text'][i]))

#     frac = num_tokens/450 - num_tokens//450

#     if(num_tokens <= 450*1.5): num_chunks = 1
#     else:
#         if(frac > 0.5): 
#             num_chunks = num_tokens//450 + 1

#         else:
#             num_chunks = num_tokens//450
        
#     for j in range(num_chunks):
#         if(j+1 < num_chunks): chunked_text.append(clean_dataset['text'][i][j*chunksize:(j+1)*chunksize])
#         else: chunked_text.append(clean_dataset['text'][i][(num_chunks-1)*chunksize : ])
            


In [None]:
def _embed_example(example):
    """Embed one row's "text" and return it under the "embeddings" key as a NumPy array."""
    vectors = get_ret_embeddings(example["text"])
    # Row 0 of the model output is kept: a single text goes in per call.
    return {"embeddings": vectors.detach().cpu().numpy()[0]}

# NOTE(review): map() runs one example per forward pass here; a batched map
# would likely be faster but is left unchanged.
ret_embeddings_dataset = clean_dataset.map(_embed_example)


In [None]:
# Persist the dataset (title, text, embeddings) to disk.
embeddings_out_dir = '/kaggle/working/retrieval-wiki-embeddings'
ret_embeddings_dataset.save_to_disk(embeddings_out_dir)