In [1]:
import ast
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from openai import OpenAI
from pymongo.mongo_client import MongoClient
from tqdm import tqdm

load_dotenv()

True

# Load Document

In [2]:
# TODO: choose the dataset version (7-11)
version = 7

# Read every line of the selected document file. The context manager closes
# the file handle as soon as the lines are loaded instead of leaving it open
# for the lifetime of the kernel.
with open(f"data/documents-{version}B.txt", encoding='utf-8') as document_file:
    data = document_file.readlines()

# Preview the first ten raw lines.
data[:10]

["{'text': 'All the patients presented a psychomotor retardation due to an obstructive hydrocephalus.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/22886034'}\n",
 "{'text': 'UPDtool: a tool for detection of iso- and heterodisomy in parent-child trios using SNP microarrays.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/23589652'}\n",
 "{'text': 'HIV pre-exposure prophylaxis (PrEP) is a new approach that involves the ongoing use of antiretroviral medications by HIV-negative individuals to reduce the risk of HIV infection.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/25987851'}\n",
 "{'text': 'Burosumab in X-linked hypophosphatemia: a profile of its use in the USA.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/30459508'}\n",
 "{'text': 'The collagen matrix of human articular cartilage is an essentially permanent structure that has no significant turnover in adults, even with the occurrence of disease.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/27384346'}\n",
 "{'text': 'Leptin belongs to 

# Split

In [3]:
# Stage 1: character-based splitter. Separators are listed from coarsest
# (blank line) to finest (empty string).
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=250,
    chunk_overlap=50,
)

# Stage 2: token-based splitter applied to the output of stage 1.
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=250,
    chunk_overlap=50,
)

  from tqdm.autonotebook import tqdm, trange


# Create Embedding

In [4]:
# OpenAI client shared by the embedding and chat-completion calls in this notebook.
# NOTE(review): constructed without arguments -- presumably it picks up its
# credentials from the environment populated by load_dotenv(); confirm.
client = OpenAI()

def embedding_function(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by the given embedding model.

    Args:
        text: Input forwarded as ``input`` to the embeddings endpoint.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    # One request per call; only the first returned item is used.
    return client.embeddings.create(input=text, model=model).data[0].embedding

# Smoke test: embed one sample sentence, then print the vector's length
# followed by the vector itself.
text = "This is an example text to convert into an embedding."
embedding = embedding_function(text)
dimension = len(embedding)
print(dimension, embedding)

1536 [-0.02878861501812935, 0.013735744170844555, 3.454516263445839e-05, 0.0011869255686178803, 0.01313766185194254, 0.01137701328843832, -0.005654906388372183, -0.008608359843492508, -0.03010573983192444, -0.022525545209646225, -0.0051777842454612255, 0.040400829166173935, -0.004156339447945356, -0.0032323352061212063, 0.005587706342339516, 0.021100899204611778, 0.015724873170256615, 0.01123589277267456, 0.0184666458517313, -0.023721711710095406, -0.013534143567085266, -0.0028644134290516376, 0.001160885440185666, 0.0023268109653145075, -0.012667259201407433, -0.01388358511030674, 0.018117204308509827, -0.023896431550383568, -0.005244984291493893, -0.024743156507611275, 0.004048818722367287, -0.012075896374881268, 0.0003721217508427799, -0.014797508716583252, -0.009791085496544838, -0.00013219562242738903, 0.0039950585924088955, -0.029057415202260017, 0.02481035515666008, -0.022418024018406868, 0.00513746403157711, 0.015926474705338478, 0.015415752306580544, -0.01802312396466732, -0.0

# Insert to DB - MongoDB

In [5]:
# The connection string is read from the MONGODB_URI environment variable.
uri = os.environ.get("MONGODB_URI")
mongo_client = MongoClient(uri)

# Ping the admin database to confirm the connection works. A failure is
# printed rather than raised, so the cell itself never errors out here.
try:
    mongo_client.admin.command('ping')
except Exception as ping_error:
    print(ping_error)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


### Before continuing, make sure you have created the database and collection on the MongoDB website.
In this example, **PRICAI** is the database name and **Test** is the collection name.

In [9]:
# WARNING: long computation time! Consider limiting the run to the first few
# documents while experimenting.
limit = 100

collection = mongo_client["PRICAI"]["Test"]

for i in tqdm(range(min(limit, len(data)))):
    # Each line holds a Python dict literal. ast.literal_eval only accepts
    # literals, so nothing in the data file can execute as code (unlike
    # eval), and the line is parsed once instead of once per field.
    record = ast.literal_eval(data[i])
    docs = record['text']
    url = record['url']

    # Two-stage split: by characters first, then each piece by tokens.
    token_split_texts = [
        token_chunk
        for character_chunk in character_splitter.split_text(docs)
        for token_chunk in token_splitter.split_text(character_chunk)
    ]

    # The chunk id is "<last URL path segment>-<chunk index>".
    # NOTE(review): if two input lines share a URL they would produce the
    # same ids -- confirm whether that matters for downstream lookups.
    article_id = url.split('/')[-1]
    for idx, text in enumerate(token_split_texts):
        embedding = embedding_function(text)
        # metadata stores the full source text (not just this chunk) and its URL.
        collection.insert_one({
            "id": f"{article_id}-{idx}",
            "values": embedding,
            "metadata": {"text": docs, "url": url},
        })

100%|██████████| 100/100 [00:50<00:00,  1.99it/s]


# Try Query and Get Data from DB

In [13]:
query = "Is collagen matrix of human articular cartilage changing with disease?"

# Run the query through the same two-stage splitters (characters, then tokens).
character_split_text = character_splitter.split_text(query)
token_split_texts = [
    token_chunk
    for piece in character_split_text
    for token_chunk in token_splitter.split_text(piece)
]

# Look up the stored entries whose "id" field is "22886034-0" and print each.
result = collection.find({"id": "22886034-0"})
for doc in result:
    print(doc)

{'_id': ObjectId('665dab293b44974b8cee0421'), 'id': '22886034-0', 'values': [-0.012413259595632553, 0.0017521341796964407, 0.0251785758882761, -0.017576757818460464, -0.011617871932685375, 0.010855082422494888, 0.013547664508223534, 0.0054340604692697525, -0.01495589129626751, -0.02078438550233841, -0.023483486846089363, 0.027538659051060677, 0.013091294094920158, 0.008573233149945736, -0.022127415984869003, -0.00651630898937583, 0.0349709689617157, 0.007888678461313248, 0.006535867694765329, -0.00841024424880743, -1.6961066648946144e-05, 0.015307947993278503, -0.0069694193080067635, -0.0367964468896389, -0.019154492765665054, -0.0013894831063225865, 0.01975429430603981, -0.030850600451231003, 0.01187865436077118, -0.017498522996902466, -0.01545137818902731, -0.019636942073702812, -0.013071735389530659, -0.016325000673532486, -0.01238718070089817, -0.02606523595750332, 0.012361102737486362, 0.00018753948097582906, 0.008025589399039745, 0.013625899329781532, -0.019050180912017822, -0.01

In [17]:
# Collect the source text stored with every entry in the collection.
# The projection asks MongoDB to return only metadata.text, so the large
# "values" embedding vector of each entry is not transferred just to be
# thrown away.
# NOTE(review): the find() filter is empty, so this returns every stored
# entry regardless of the query -- no similarity search happens here.
retrieved_documents = [
    doc["metadata"]["text"]
    for doc in collection.find({}, {"_id": 0, "metadata.text": 1})
]

retrieved_documents

['All the patients presented a psychomotor retardation due to an obstructive hydrocephalus.',
 'UPDtool: a tool for detection of iso- and heterodisomy in parent-child trios using SNP microarrays.',
 'HIV pre-exposure prophylaxis (PrEP) is a new approach that involves the ongoing use of antiretroviral medications by HIV-negative individuals to reduce the risk of HIV infection.',
 'Burosumab in X-linked hypophosphatemia: a profile of its use in the USA.',
 'The collagen matrix of human articular cartilage is an essentially permanent structure that has no significant turnover in adults, even with the occurrence of disease.',
 'Leptin belongs to the adipokine family, which also contains adiponectin and resistin.',
 'Those bearing mutations had the classical triple A syndrome of achalasia, alacrima, adrenal abnormalities and a progressive neurological syndrome.',
 'We aimed to assess the activity and safety of pazopanib in patients with von Hippel-Lindau disease.',
 'However, it is not clea

# Combine with LLM

In [18]:
def rag(query, retrieved_documents, model="gpt-4o"):
    """Answer ``query`` with a chat model, using the retrieved texts as context.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of strings; they are joined with blank
            lines and placed in the user message after the question.
        model: Name of the chat-completion model to request.

    Returns:
        The message content of the first choice in the completion response.
    """
    context = "\n\n".join(retrieved_documents)

    system_message = {
        "role": "system",
        "content": "You are a knowledgeable healthcare research assistant. Your users are asking questions about information contained in a healthcare document. You will be shown the user's question and the relevant information from the healthcare document. Answer the question with support of the provided document."
    }
    user_message = {
        "role": "user",
        "content": f"Question: {query}. \n Information: {context}"
    }

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message]
    )
    return completion.choices[0].message.content

output = rag(query=query, retrieved_documents=retrieved_documents)

# Print the question and the model's answer, each under its own heading.
print("Question:", query, "\nAnswer:", output, sep="\n")

Question:
Is collagen matrix of human articular cartilage changing with disease?

Answer:
Yes, the document indicates that the collagen matrix of human articular cartilage is an essentially permanent structure with no significant turnover in adults, even with the occurrence of disease. Therefore, it does not change significantly with disease.
