In [None]:
!pip install faiss-gpu
!pip install pytorch_metric_learning
!pip install transformers

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting pytorch_metric_learning
  Downloading pytorch_metric_learning-2.3.0-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch_metric_learning
Successfully installed pytorch_metric_learning-2.3.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [9

In [None]:
import csv
import logging
import pickle as pkl
import random

import faiss
import numpy as np
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

In [None]:
import sys

# Make the project's local packages (e.g. `src`) importable from this notebook.
sys.path.append('/content/drive/MyDrive/sapBERT-DUT-cambridge')

# Run configuration: the epoch counts select which results directory is used.
sapBERT_epochs = 3
finetune_epochs = 3
finetune_dataset = 'nl_wiki_bel_all'

# Directory the model is loaded from and where the embeddings, PCA and index
# artifacts are written.
model_directory_path = (
    '/content/drive/MyDrive/sapBERT-DUT-cambridge/results/medRoBERTa_sapBERT/'
    f'{sapBERT_epochs}_epoch/ft_{finetune_epochs}_epoch'
)

In [None]:
# Read the ontology CSV into two parallel lists: cuis[i] is the value of the
# first column and concepts[i] the value of the second column of data row i.
cuis = []
concepts = []
with open('/content/drive/MyDrive/sapBERT-DUT-cambridge/ontologies/umls-dutch_v1.11_with_drugs_filtered-categories.csv',
          newline='', encoding='utf-8') as f:
    # csv.reader (rather than str.split(',')) keeps quoted fields that contain
    # commas intact and never leaves a trailing newline on the last column.
    reader = csv.reader(f)
    # skip header
    next(reader, None)
    for row in reader:
        # Blank or truncated rows have no concept column; skip them instead of
        # failing with IndexError.
        if len(row) < 2:
            continue
        cuis.append(row[0])
        concepts.append(row[1])

In [None]:
from src.model_wrapper import (
    Model_Wrapper
)

In [None]:
# Load the model stored under model_directory_path via the project's wrapper.
# NOTE(review): this relies on load_model returning the wrapper object that
# get_dense_tokenizer()/get_dense_encoder() are later called on -- confirm.
model_wrapper = Model_Wrapper().load_model(
    path=model_directory_path,
    max_length=25,
    use_cuda=True,
)

In [None]:
def reduce_embedding_dim(embeddings, n_components=256, train_fraction=0.5, seed=None):
    """Project embeddings to a lower dimension with PCA and persist the PCA.

    The PCA is fitted on a random subset of the rows, pickled to
    ``{model_directory_path}/pca``, and then applied to all rows.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        n_components: Number of PCA components to keep (output dimension).
        train_fraction: Fraction of rows randomly sampled to fit the PCA.
        seed: Optional seed for the row sampling; ``None`` draws a fresh
            random sample on every call.

    Returns:
        ``np.float32`` array holding the PCA-transformed embeddings, one row
        per input row.
    """
    embeddings = np.asarray(embeddings)
    n_train = int(len(embeddings) * train_fraction)

    # Sample row indices instead of materialising a Python list of row arrays.
    rng = random.Random(seed)
    train_idx = rng.sample(range(len(embeddings)), k=n_train)

    pca = PCA(n_components=n_components)
    pca.fit(embeddings[train_idx])

    # Context manager so the file is flushed and closed before returning.
    with open(f'{model_directory_path}/pca', 'wb') as f:
        pkl.dump(pca, f)

    return np.asarray(pca.transform(embeddings), dtype=np.float32)

In [None]:
# Encode every concept string with the loaded encoder and pickle the stacked
# embedding matrix to {model_directory_path}/all_reps_emb.
tokenizer = model_wrapper.get_dense_tokenizer()
model = model_wrapper.get_dense_encoder()
# Inference only: switch off dropout and other train-time behaviour.
# NOTE(review): assumes the encoder is a torch.nn.Module -- confirm.
model.eval()

# To embed with the base model instead, load 'CLTL/MedRoBERTa.nl' through
# AutoTokenizer.from_pretrained / AutoModel.from_pretrained(...).to('cuda').

bs = 128
all_reps = []
# no_grad: no autograd graph is needed for a pure forward pass, which keeps
# GPU memory use flat across the batches.
with torch.no_grad():
    for i in tqdm(range(0, len(concepts), bs)):
        toks = tokenizer.batch_encode_plus(concepts[i:i+bs],
                                           padding="max_length",
                                           max_length=25,
                                           truncation=True,
                                           return_tensors="pt")
        # Move every tokenizer output tensor to the GPU.
        toks_cuda = {k: v.cuda() for k, v in toks.items()}
        output = model(**toks_cuda)

        # Take the first token position of the first model output as the
        # per-concept representation.
        cls_rep = output[0][:, 0, :]

        all_reps.append(cls_rep.detach().cpu().numpy())
all_reps_emb = np.concatenate(all_reps, axis=0)

# Context manager so the file is flushed and closed once written.
with open(f'{model_directory_path}/all_reps_emb', "wb") as f:
    pkl.dump(all_reps_emb, f)


100%|██████████| 5880/5880 [16:57<00:00,  5.78it/s]


In [None]:
# Build faiss index from embeddings
# NOTE: pickle can execute arbitrary code on load; only read files that were
# written by this notebook.
with open(f'{model_directory_path}/all_reps_emb', 'rb') as f:
    embeddings = pkl.load(f)
embeddings = reduce_embedding_dim(embeddings)

# Take the index dimension from the reduced embeddings so it cannot drift
# from the PCA output size.
dim = embeddings.shape[1]
logging.info(f"Training index with embedding dim size {dim} using {faiss.get_num_gpus()} gpus")

# IVF index with 300 inverted lists over a flat L2 quantizer.
quantizer = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFFlat(quantizer, dim, 300)
index = faiss.index_cpu_to_all_gpus(index)
index.train(embeddings)


logging.info("Adding dataset embeddings to index")
# Add in chunks of 1000 rows so tqdm can show progress.
for i in tqdm(range(0, embeddings.shape[0], 1000)):
    index.add(embeddings[i : i + 1000])

logging.info("Saving index")
# The index is converted back to a CPU index before being written to disk.
faiss.write_index(faiss.index_gpu_to_cpu(index), f'{model_directory_path}/index')

logging.info("Index built and saved")

100%|██████████| 753/753 [00:00<00:00, 965.76it/s] 
