In [1]:
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from openai import OpenAI
from pinecone import Pinecone
from tqdm import tqdm

import os

load_dotenv()

True

# Load Document

In [21]:
# TODO: CHOOSE VERSION (7-11)
version = 8

data = open(f"data/documents-{version}B.txt", encoding='utf-8').readlines()
data[:10]

["{'text': 'Several prosthesis and techniques to reduce re-herniation have been proposed including implantation of an annular closure device ( ACD ) - Barricaid™ and an annular tissue repair system ( AR ) - Anulex-Xclose', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/30115053'}\n",
 "{'text': 'We found that cDCs from prediseased TCSle male mice express the IFN signature as female TCSle cDCs do. Estrogens are necessary but not sufficient to express this IFN signature, but high doses of E2 can compensate for other steroidal components.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/29850618'}\n",
 "{'text': 'CONCLUSION: Adding figitumumab to standard chemotherapy failed to increase OS in patients with advanced nonadenocarcinoma NSCLC.', 'url': 'http://www.ncbi.nlm.nih.gov/pubmed/24888810'}\n",
 "{'text': 'Here we have analysed the frequency and function of MAIT cells in multiple myeloma (MM) patients. We show that MAIT cell frequency in blood is reduced compared to healthy adult donors, but 

# Split

In [3]:
character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ". ", " ", ""], chunk_size=250, chunk_overlap=50)
token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=250, chunk_overlap=50)

# Create Embedding

In [4]:
client = OpenAI()

def embedding_function(text, model="text-embedding-ada-002"):
    
    # Get the embedding for the text
    response = client.embeddings.create(
        input=text,
        model=model
    )
    
    # Extract the embedding from the response
    embedding = response.data[0].embedding
    
    return embedding

# Example usage
text = "This is an example text to convert into an embedding."
embedding = embedding_function(text)
print(len(embedding), embedding)

1536 [-0.02878861501812935, 0.013735744170844555, 3.454516263445839e-05, 0.0011869255686178803, 0.01313766185194254, 0.01137701328843832, -0.005654906388372183, -0.008608359843492508, -0.03010573983192444, -0.022525545209646225, -0.0051777842454612255, 0.040400829166173935, -0.004156339447945356, -0.0032323352061212063, 0.005587706342339516, 0.021100899204611778, 0.015724873170256615, 0.01123589277267456, 0.0184666458517313, -0.023721711710095406, -0.013534143567085266, -0.0028644134290516376, 0.001160885440185666, 0.0023268109653145075, -0.012667259201407433, -0.01388358511030674, 0.018117204308509827, -0.023896431550383568, -0.005244984291493893, -0.024743156507611275, 0.004048818722367287, -0.012075896374881268, 0.0003721217508427799, -0.014797508716583252, -0.009791085496544838, -0.00013219562242738903, 0.0039950585924088955, -0.029057415202260017, 0.02481035515666008, -0.022418024018406868, 0.00513746403157711, 0.015926474705338478, 0.015415752306580544, -0.01802312396466732, -0.0

# Insert to DB

In [22]:
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
index = pc.Index("rag-kak")
# index.delete(delete_all=True)

In [23]:
# WARNING: Long computation time! Consider limit to first several data for experimenting
limit = 10000

for i in tqdm(range(min(limit, len(data)))):
    docs = eval(data[i])['text']
    url = eval(data[i])['url']

    character_split_texts = character_splitter.split_text(docs)

    token_split_texts = []
    for text in character_split_texts:
        token_split_texts += token_splitter.split_text(text)

    idx = {}
    for text in token_split_texts:
        embedding = embedding_function(text)
        id = url.split('/')[-1]
        idx[id] = idx.get(id, 0) + 1
        index.upsert(
            vectors=[
                {
                "id": f"{id}-{idx[id]}", 
                "values": embedding, 
                "metadata": {"text": text, "url": url}
                }
            ],
            namespace='8B'
        )

  0%|          | 0/4194 [00:00<?, ?it/s]

100%|██████████| 4194/4194 [1:22:41<00:00,  1.18s/it]  


# Try Query and Get Data from DB

In [24]:
query = "How can PEGylation improve recombinant drugs?"

character_split_text = character_splitter.split_text(query)

token_split_texts = []
for text in character_split_text:
    token_split_texts += token_splitter.split_text(text)

result = index.query(
    vector=embedding_function(token_split_texts),
    namespace='8B',
    top_k=5,
    include_metadata=True
)['matches']

result

[{'id': '22672501-2',
  'metadata': {'text': '. coupled with their proven good safety profile these '
                       'findings could translate into a significant clinical '
                       'benefit.',
               'url': 'http://www.ncbi.nlm.nih.gov/pubmed/22672501'},
  'score': 0.807919621,
  'values': []},
 {'id': '23034634-1',
  'metadata': {'text': 'wapl - ag was found to increase the stability of '
                       'cohesin binding to polytene chromosomes. our data '
                       'suggest that increasing cohesin stability interferes '
                       'with pcg silencing at genes that are co - regulated by '
                       'cohesin and pcg proteins.',
               'url': 'http://www.ncbi.nlm.nih.gov/pubmed/23034634'},
  'score': 0.807397366,
  'values': []},
 {'id': '25941654-1',
  'metadata': {'text': 'anti - cd3 teplizumab and anti - cd3 otelixizumab have '
                       'been shown to provide c - peptide preservation.',


In [19]:
retrieved_documents = []

for match in result:
    retrieved_documents.append(match['metadata']['text'])

retrieved_documents

['By increasing the molecular mass of proteins and peptides and shielding them from proteolytic enzymes, PEGylation primarily improves pharmacokinetics and helps to prevent adverse drug reactions.',
 'ttachment of a chain of poly(ethylene glycol) (PEG) to a therapeutic protein, a process widely known as PEGylation, can lead to several beneficial effects. It has the potential to significantly delay aggregation of the protein by steric shielding, a frequently encountered issue in the development of protein drugs. Moreover, it can modify the pharmacokinetic profile of the PEGylated protein by delaying renal excretion, leading to a longer half-life (t1/2) of the drug. By steric hindrance, it can also inhibit interactions between the protein drug and proteases as well as the host immune system, thereby inhibiting inactivation of the PEGylated protein and also attenuating its immunogenicity.',
 'ttachment of a chain of poly(ethylene glycol) (PEG) to a therapeutic protein, a process widely kn

# Combine with LLM

In [20]:
def rag(query, retrieved_documents, model="gpt-4o"):
    information = "\n\n".join(retrieved_documents)

    messages = [
        {
            "role": "system",
            "content": "You are a knowledgeable healthcare research assistant. Your users are asking questions about information contained in a healthcare document. You will be shown the user's question and the relevant information from the healthcare document. Answer the question with support of the provided document."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    content = response.choices[0].message.content
    return content

output = rag(query=query, retrieved_documents=retrieved_documents)
print("Question:")
print(query)
print("\nAnswer:")
print(output)

Question:
How can PEGylation improve recombinant drugs?

Answer:
PEGylation, or the attachment of a chain of poly(ethylene glycol) (PEG) to a therapeutic protein, can significantly improve recombinant drugs in several ways:

1. **Increased Pharmacokinetics**: By increasing the molecular mass of proteins and peptides, PEGylation can enhance the pharmacokinetic profile of the drug. It delays renal excretion, resulting in a longer half-life of the drug in the body.

2. **Steric Shielding**: It provides steric shielding, which can significantly delay the aggregation of the protein, a common issue in protein drug development.

3. **Protection from Proteolytic Enzymes**: By hindering interactions between the drug and proteases, PEGylation helps in protecting the protein from proteolytic degradation.

4. **Reduced Immunogenicity**: The process can attenuate the immunogenicity of the therapeutic protein by inhibiting interactions with the host immune system.

5. **Prevention of Adverse Drug Re