In [3]:
import pandas as pd
import numpy as np
import warnings
from transformers import AutoTokenizer
from transformers import AutoModel
from datasets import load_dataset
import torch
import pickle
import faiss
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by the libraries imported above; consider narrowing the
# filter to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the "train" split of the "pubmed" configuration of the
# `scientific_papers` dataset.
# NOTE(review): `dataset` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying the load cost.
dataset = load_dataset("scientific_papers", "pubmed", split="train")

In [32]:
# Load the pretrained 'stanford-crfm/BioMedLM' checkpoint. `get_embedding`
# calls this object on token-ID tensors and reads `model.config.hidden_size`.
# NOTE(review): this cell's recorded execution count (32) is later than the
# cells that follow it, i.e. it was run out of order — verify the notebook
# with Restart Kernel -> Run All.
model = AutoModel.from_pretrained('stanford-crfm/BioMedLM')

In [5]:
# Tokenizer loaded from the same checkpoint name as `model`. It is used by
# `search_abstracts` (`tokenize`) and `convert_to_token_ids`
# (`convert_tokens_to_ids`, `vocab_size`).
tokenizer = AutoTokenizer.from_pretrained('stanford-crfm/BioMedLM')

In [6]:
# Read a prebuilt FAISS index from disk; `search_abstracts` queries it.
# Presumably built from ~100k abstract embeddings (going by the file name) —
# TODO confirm, along with the embedding procedure used to build it.
index = faiss.read_index('KB_100k_abstracts_03012024.faiss')

In [7]:
# Load the lookup that `search_abstracts` uses to turn FAISS result ids into
# abstract text. It is accessed with `.get(id, default)`, so presumably a dict
# keyed by index position — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('abstract_map.pkl', 'rb') as pickle_file:
    abstract_map = pickle.load(pickle_file)

In [8]:
def get_embedding(token_ids, max_length=1024):
    """Embed a sequence of token IDs as a single mean-pooled vector.

    The IDs are split into consecutive windows of at most ``max_length``
    tokens. Each window is passed through the module-level ``model``; its
    ``last_hidden_state`` is averaged over the sequence axis, and the
    per-window vectors are then averaged with equal weight.

    Args:
        token_ids: Sequence of integer token IDs. May be empty.
        max_length: Maximum number of tokens per forward pass. Defaults to
            1024, the value that used to be hard-coded here (presumably the
            model's context size — confirm against ``model.config``).

    Returns:
        A 1-D float32 ``numpy.ndarray``. For empty input this is a zero
        vector of length ``model.config.hidden_size``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Empty input: return zeros in float32 so the dtype matches the normal
    # path (FAISS only accepts float32 query vectors).
    if len(token_ids) == 0:
        return np.zeros(model.config.hidden_size, dtype=np.float32)

    chunk_vectors = []
    with torch.no_grad():
        # Stepping by max_length never yields an empty window, so no
        # per-window emptiness checks are needed.
        for start in range(0, len(token_ids), max_length):
            window = token_ids[start:start + max_length]
            inputs = torch.tensor(window).unsqueeze(0)  # add a batch axis
            outputs = model(inputs)
            chunk_vectors.append(outputs.last_hidden_state.mean(dim=1))

    # Windows are averaged with equal weight regardless of their length.
    # NOTE(review): a short trailing window therefore counts as much as a full
    # one; kept as-is so query vectors stay comparable with however the
    # existing index was built.
    pooled = torch.stack(chunk_vectors).mean(dim=0)
    return pooled.squeeze().to(torch.float32).numpy()

In [33]:
def convert_to_token_ids(chunks):
    """Map tokens to vocabulary IDs, dropping IDs outside the base vocabulary.

    Every element of ``chunks`` is looked up individually with the
    module-level ``tokenizer``. An ID is kept only when it is strictly below
    ``tokenizer.vocab_size``; surviving IDs keep the input order.
    """
    # Lazy lookup so each token is converted right before it is filtered.
    looked_up = (tokenizer.convert_tokens_to_ids(token) for token in chunks)
    return [token_id for token_id in looked_up if token_id < tokenizer.vocab_size]

In [34]:
def search_abstracts(query, k=10):
    """Return the abstracts of the ``k`` index entries nearest to ``query``.

    The query text is tokenized with the module-level ``tokenizer``, embedded
    with ``get_embedding``, and looked up in the module-level FAISS ``index``.

    Args:
        query: Free-text search string.
        k: Number of nearest neighbours to request from the index.

    Returns:
        A list with one entry per returned id other than -1. Each entry is
        the value stored in ``abstract_map`` for that id, or the string
        "No similar article found" when the id is missing from the map.
    """
    token_ids = convert_to_token_ids(tokenizer.tokenize(query))
    query_embedding = get_embedding(token_ids)

    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    # Force that here instead of inheriting whatever dtype the embedding has.
    query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_matrix, k)

    # -1 marks an unfilled result slot; convert numpy integers to plain ints
    # before the lookup in abstract_map.
    return [
        abstract_map.get(int(i), "No similar article found")
        for i in indices[0]
        if i != -1
    ]

In [58]:
# Example search: request the default k=10 nearest abstracts for this phrase.
# `results` is consumed by the display cell below.
query = "tumorigenic prostatic stroma and nontumorigenic prostatic epithelia"
results = search_abstracts(query)

In [63]:
# Print each hit under a 1-based "Result N:" heading, followed by one blank
# line (the empty-string argument plus the default line ending).
for count, result in enumerate(results, start=1):
    print(f"Result {count}:", result, "", sep="\n")

Result 1:
 the interplay of different cell types of origin and distinct oncogenic mutations may determine the tumor subtype . 
 we have recently found that although both basal and luminal epithelial cells can initiate prostate tumorigenesis , the latter are more likely to undergo transformation in response to a range of oncogenic events . 

Result 2:
 prostate stromal cells may play binary roles in the process of prostate cancer development . as the first to be encountered by infiltrating prostate cancer cells , prostate stromal cells form the first defense line against prostate cancer progression and metastasis . 
 however , interaction between prostate cancer and stromal cells may facilitate the formation of a tumor microenvironment favoring cancer cell growth and survival . to establish an experimental system for studying the interaction between cancer and stromal cells 
 , we isolated three matched pairs of normal and cancer - associated human prostate stromal clones . in this repo