In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceBgeEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

from langchain.document_loaders import TextLoader


In [4]:
bge_emb = HuggingFaceBgeEmbeddings(
    model_name = "BAAI/bge-small-en-v1.5",
    model_kwargs = {"device": "cuda:0"},
    encode_kwargs = {'normalize_embeddings': True} # set true for cosine similarity
)

llm = OpenAI()

embeddings = HypotheticalDocumentEmbedder.from_llm(llm, bge_emb, prompt_key="web_search")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import langchain
langchain.debug = True

In [6]:
result = embeddings.embed_query("what items does McDonald make?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: what items does McDonald make?\nPassage:"
  ]
}


Authentication failed for https://api.smith.langchain.com/runs. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs', '{"detail":"Invalid auth"}')


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [4.11s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " McDonald's is a fast food restaurant chain that is known worldwide. They offer a wide variety of menu items, including burgers, chicken sandwiches, fries, salads, breakfast items, desserts, and other food items. McDonald's burgers are some of the most popular menu items, with the Big Mac, Quarter Pounder, and McChicken being some of the most popular. They also serve breakfast items such as Egg McMuffins, hash browns, and oatmeal. The restaurant also serves fries, salads, and desserts like ice cream cones and sundaes. McDonald's also offers specialty coffee drinks such as lattes, cappuccinos, and mochas. In addition to food, McDonald's also sells toys and merchandise.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usa

Authentication failed for https://api.smith.langchain.com/runs/83ade637-5456-4634-ba7f-dda0a4378d4f. HTTPError('401 Client Error: Unauthorized for url: https://api.smith.langchain.com/runs/83ade637-5456-4634-ba7f-dda0a4378d4f', '{"detail":"Invalid auth"}')


In [7]:
print(result)
len(result)

[-0.02870439551770687, -0.059107571840286255, 0.027235932648181915, -0.010620699264109135, 0.06756597012281418, 0.016326341778039932, 0.04961646720767021, -0.030611563473939896, 0.029528439044952393, -0.01554589532315731, -0.02479427121579647, -0.0386468879878521, -0.006050115451216698, 0.006420820020139217, 0.0596432127058506, -0.05484124645590782, 0.01711185835301876, -0.0931917279958725, -0.03472931310534477, -0.05109564960002899, 0.02616151049733162, -0.004458202049136162, -0.09814361482858658, 0.02067679911851883, 0.035161230713129044, -0.023943157866597176, 0.028302878141403198, -0.00454811891540885, -0.06604514271020889, -0.09280846267938614, 0.046109046787023544, -0.08480254560709, 0.06671711057424545, -0.07974421977996826, -0.036712996661663055, -0.0003396625106688589, 0.02294238656759262, -0.05383357033133507, 0.02877781167626381, -0.012947851791977882, 0.075503870844841, -0.004109569359570742, 0.034454964101314545, -0.028924571350216866, 0.0646887794137001, 0.010168796405196

384

# multiple doc and combine the embeddings 

In [8]:
multi_llm = OpenAI(n=4, best_of=4)
embeddings = HypotheticalDocumentEmbedder.from_llm(multi_llm, bge_emb, prompt_key="web_search")
results = embeddings.embed_query("what items does McDonald make?")

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: what items does McDonald make?\nPassage:"
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [5.27s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nMcDonald's is a well-known fast-food chain that has been in business for decades. They offer a wide variety of food items, from classic hamburgers and french fries to breakfast items, salads, and desserts. Popular menu items include the classic Big Mac, Quarter Pounder with Cheese, Chicken McNuggets, Filet-O-Fish, and Egg McMuffin. In addition to these traditional options, McDonald's also offers a variety of seasonal items such as the McRib, McFlurry, and McChicken. McDonald's also offers a wide range of beverages, including sodas, coffee, milkshakes, and smoothies. Their menu is constantly changing and expanding to meet the needs of their customers, so there is always something new to try.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      },
      {
        "

In [9]:
print(results)
len(results)

[-0.03227103524841368, -0.032750103157013655, 0.027255034539848566, -0.02561376034282148, 0.07443972490727901, 0.03132109739817679, 0.06206090468913317, -0.03396920766681433, 0.028270956245251, -0.016058145556598902, 0.005569062064751051, -0.03018047846853733, -0.015615477343089879, 0.00018086071941070259, 0.06461800262331963, -0.04277115408331156, 0.015319403260946274, -0.08503232710063457, -0.035483020823448896, -0.04102374706417322, 0.028660528361797333, -0.002559071173891425, -0.0974446702748537, 0.01910031889565289, 0.05077056400477886, -0.02387690427713096, 0.039621824864298105, -0.008174045011401176, -0.06935716606676579, -0.09593155421316624, 0.03318844875320792, -0.06085190363228321, 0.05152677930891514, -0.0807821024209261, -0.05017206445336342, 0.007700761663727462, 0.020323173492215574, -0.056093198247253895, 0.020168200135231018, 0.0022689862817060202, 0.06427545379847288, -0.004010821576230228, 0.049534204415977, -0.026691722217947245, 0.06547426898032427, 0.0217779707163

384

# Custom prompts

In [10]:
prompt_template = PromptTemplate(
    input_variables=["question"],
    template='''Please answer the following question as a single food item.
        Question: {question}
        Answer:'''
)

prompt_template.format(question="what items does McDonald make?")

'Please answer the following question as a single food item.\n        Question: what items does McDonald make?\n        Answer:'

In [11]:
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
embeddings = HypotheticalDocumentEmbedder(llm_chain=llm_chain,base_embeddings=bge_emb)
result3 = embeddings.embed_query("what items does McDonald make?")
print(result3)
len(result3)

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please answer the following question as a single food item.\n        Question: what items does McDonald make?\n        Answer:"
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [607ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " Big Mac",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 2,
      "total_tokens": 27,
      "prompt_tokens": 25
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}
[-0.04409582540392876, -0.06810589879751205, 0.004882005508989096, -0.07507438957691193, 0.061256855726242065, -0.00785149447619915, 0.01861572451889515, -0.03252463415265083, -0.006711803376674652, -0.03244766592979431, 0.0023166672326624393, -0.017370183020830154, 0.05157424136996269, -0.018291115760803223, 0.05330300331115723, 0.04405609890818596, 0.08549801260232925, -0.05806003510951996, -0.048251908272504807, -0.008680148050189018, 0.03713996708393097, -0.04763495922088623, -0.05635960027575493, 0.00472

384

# Using HyDE

In [12]:
from langchain.text_splitter   import CharacterTextSplitter
from langchain.vectorstores   import VectorStore
import os
from PyPDF2 import PdfReader

path = "/home/dosisiddhesh/LANGCHAIN_EXP/pdfs/"

pdf_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.pdf')]
print(f"Number of pdf files found : {len(pdf_files)}")
raw_text = ''
for pdf_file in pdf_files:
    myPdfReader = PdfReader(pdf_file)
    for page in myPdfReader.pages:
        raw_text += page.extract_text()

text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=1500,
    chunk_overlap=200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)
print(f"Length of the chunks: {len(texts)}")




# docs = [TextLoader(doc_file).load() for doc_file in doc_dir]
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# texts = text_splitter.split_documents(docs)

Number of pdf files found : 27
Length of the chunks: 1550


In [13]:
prompt = PromptTemplate(
    input_variables=["question"],
    template='''Please answer the following question related to medical domain.
        Question: {question}
        Answer:'''
)
llm_chain = LLMChain(llm=llm, prompt=prompt)
embeddings = HypotheticalDocumentEmbedder(llm_chain=llm_chain,base_embeddings=bge_emb)

In [14]:
from langchain.vectorstores import FAISS
document_search_space_faiss = FAISS.from_texts(texts, embeddings)

In [15]:
from langchain.vectorstores import Chroma
document_search_space_chroma = Chroma.from_texts(texts, embeddings )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
document_search_space_faiss.save_local("../Medical_FAISS_dropbox")

In [17]:
query = "What is the latest type of Dementia?"
docs = document_search_space_faiss.similarity_search(query, k=5)

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please answer the following question related to medical domain.\n        Question: What is the latest type of Dementia?\n        Answer:"
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [2.77s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The latest type of Dementia is known as Lewy Body Dementia (LBD). LBD is a progressive brain disorder that results in a decline in thinking, reasoning, and independent function due to abnormal deposits of a protein called alpha-synuclein in the brain. Symptoms of LBD may include memory problems, impaired movement, hallucinations, and sleep disturbances.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 75,
      "total_tokens": 103,
      "prompt_tokens": 28
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}


In [18]:
docs

[Document(page_content='Abstract\nDementia with Lewy bodies and Parkinson’s disease dementia, jointly known as Lewy body \ndementia, are common neurodegenerative conditions. Patients with Lewy body dementia present \nwith a wide range of cognitive, neuropsychiatric, sleep, motor, and autonomic symptoms. \nPresentation varies between patients and can vary over time within an individual. Treatments can \naddress one symptom but worsen another, which makes disease management difficult. Symptoms \nare often managed in isolation and by different specialists, which makes high-quality care difficult \nto accomplish. Clinical trials and meta-analyses now provide an evidence base for the treatment of \ncognitive, neuropsychiatric, and motor symptoms in patients with Lewy body dementia. \nFurthermore, consensus opinion from experts supports the application of treatments for related \nconditions, such as Parkinson’s disease, for the management of common symptoms (eg, \nautonomic dysfunction) in p

# Retriever

In [19]:
retreiver = document_search_space_chroma.as_retriever(search_kwargs={'k':3})

In [20]:
retreiver

VectorStoreRetriever(tags=['Chroma', 'HypotheticalDocumentEmbedder'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7ff91fddae80>, search_kwargs={'k': 3})

In [21]:
retreiver.search_type

'similarity'

In [22]:
retreiver.search_kwargs

{'k': 3}

In [23]:
docs = retreiver.get_relevant_documents(query)
docs

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please answer the following question related to medical domain.\n        Question: What is the latest type of Dementia?\n        Answer:"
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [3.04s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The latest type of Dementia is called Frontotemporal Dementia (FTD). It is a complex neurological disorder that affects the frontal and temporal lobes of the brain. Symptoms can include personality and behavioral changes, impaired language skills, and difficulty with memory and reasoning. It is typically diagnosed in people between the ages of 45 and 65.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 71,
      "total_tokens": 99,
      "prompt_tokens": 28
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}


[Document(page_content='our emerging understanding of the mechanisms by which  pathogenic protein effects at cellular level \ntranslate to abnormal neural network  physiology and ultimately, complex clinical symptoms. We \nconclude by outlining principles of management a nd prospects for disease modification.  \n 3 \n INTRODUCTION  \nFrontotemporal dementia (FTD) is a clinically, neuroanatomically and pathologically heterogen eous \ngroup of neurodegenerative diseases  that share a propensity to target the frontal and temporal lobes of \nthe brain1. Although substantially less common than Alzheimer’s disease, the disorders that comprise the \nFTD spectrum have disproportionate clinical and neurobiological importance. From a  clinical \nperspective, FTD usually presents as a disturbance of complex behaviour, affecting predominantly inter-\npersonal conduct  or language  (primary progressive aphasia, PPA) , often in middle life; it is the major \nyoung onset dementia besides Alzheimer’ s

In [24]:
from langchain.chains import RetrievalQA
retreiver = document_search_space_chroma.as_retriever()
retriever_chain = RetrievalQA.from_llm(llm=llm, retriever=retreiver, return_source_documents=True)


In [25]:

query = "How to recover from dementia?"
response = retriever_chain(query)
response

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "How to recover from dementia?"
}
[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please answer the following question related to medical domain.\n        Question: How to recover from dementia?\n        Answer:"
  ]
}


[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [2.29s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " Treatment for dementia is limited, but medications can be used to help manage symptoms. Treatments may also include lifestyle changes such as increasing physical activity, engaging in social activities, and consuming a healthy diet. Other treatments may include cognitive behavioral therapy, speech therapy, and occupational therapy. Additionally, family and caregivers can provide support and assistance to help manage the condition.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 71,
      "total_tokens": 95,
      "prompt_tokens": 24
    },
    "model_name": "text-davinci-003"
  },
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain]

{'query': 'How to recover from dementia?',
 'result': " There is no cure for dementia, however, there are treatments and strategies available that can help manage the individual's symptoms, improve quality of life, and reduce caregiver burden. These treatments and strategies include cognitive stimulation therapy, exercise, physical and occupational therapy, speech-language pathology assessments, and medications known as cholinesterase inhibitors.",
 'source_documents': [Document(page_content='for social engagement. 1B (90%)\n9b. We recommend support for educational attainment, particularly in early life (1B) but also for ongoing educational experiences in mid and later\nlife. 1C (98%)\nFrailty\n10. We recommend that interventions to manage frailty be used to reduce the overall burden of dementia in older adults. 1B (81%)\nMedications\n11a. Exposure to medications known to exhibit highly anticholinergic properties should be minimized in older persons. Alternative medications\nshould be 

# User Defined VectorDB

In [26]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain.vectorstores.pgvector import DistanceStrategy
import os
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [27]:
import sys
sys.path.append("..")

In [28]:
from do_not_share import CONNECTION_STRING
from do_not_share import CONNECTION_STRING_2

In [29]:
embeddings = OpenAIEmbeddings()
embeddings2 = HuggingFaceEmbeddings(
                 model_name="thenlper/gte-small",
            )

In [30]:
db_openai = PGVector(
	connection_string=CONNECTION_STRING,
	embedding_function=embeddings,
	collection_name="my_collection",
	distance_strategy=DistanceStrategy.COSINE,
) #<-----------works

db_hf = PGVector(
	connection_string=CONNECTION_STRING,
	embedding_function=embeddings2,
	collection_name="my_collection",
	distance_strategy=DistanceStrategy.COSINE,
) #<---xxxxxxxxxxxxxxx


db2_openai = PGVector(
	connection_string=CONNECTION_STRING_2,
	embedding_function=embeddings,
	collection_name="pubmed",
	distance_strategy=DistanceStrategy.COSINE,
) #<---xxxxxxxxxxxxxxx

db2_hf = PGVector(
	connection_string=CONNECTION_STRING_2,
	embedding_function=embeddings2,
	collection_name="pubmed",
	distance_strategy=DistanceStrategy.COSINE,
) #<-----------works

In [31]:
db_openai.similarity_search("covid-19")

[Document(page_content='COVID\\u201019 \\u0e40\\u0e1b\\u0e47\\u0e19\\u0e01\\u0e32\\u0e23\\u0e28\\u0e36\\u0e01\\u0e29\\u0e32\\u0e41\\u0e1a\\u0e1a\\u0e08\\u0e33\\u0e25\\u0e2d\\u0e07\\u0e17\\u0e32\\u0e07\\u0e04\\u0e13\\u0e34\\u0e15\\u0e28\\u0e32\\u0e2a\\u0e15\\u0e23\\u0e4c\\u0e17\\u0e35\\u0e48\\u0e15\\u0e31\\u0e49\\u0e07\\u0e2a\\u0e21\\u0e21\\u0e15\\u0e34\\u0e10\\u0e32\\u0e19\\u0e17\\u0e35\\u0e48\\u0e41\\u0e15\\u0e01\\u0e15\\u0e48\\u0e32\\u0e07\\u0e01\\u0e31\\u0e19\\u0e40\\u0e01\\u0e35\\u0e48\\u0e22\\u0e27\\u0e01\\u0e31\\u0e1a\\u0e1e\\u0e32\\u0e23\\u0e32\\u0e21\\u0e34\\u0e40\\u0e15\\u0e2d\\u0e23\\u0e4c\\u0e41\\u0e1a\\u0e1a\\u0e08\\u0e33\\u0e25\\u0e2d\\u0e07\\u0e17\\u0e35\\u0e48\\u0e2a\\u0e33\\u0e04\\u0e31\\u0e0d \\u0e1c\\u0e25\\u0e01\\u0e32\\u0e23\\u0e27\\u0e34\\u0e08\\u0e31\\u0e22\\u0e23\\u0e30\\u0e1a\\u0e38\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07\\u0e2a\\u0e2d\\u0e14\\u0e04\\u0e25\\u0e49\\u0e2d\\u0e07\\u0e01\\u0e31\\u0e19\\u0e27\\u0e48\\u0e32\\u0e01\\u0e32\\u0e23\\u0e01\\u0e31\\u0e01\\u0e15\

In [32]:
db_hf.similarity_search("covid-19")

DataError: (psycopg2.errors.DataException) different vector dimensions 1536 and 384

[SQL: SELECT langchain_pg_embedding.collection_id AS langchain_pg_embedding_collection_id, langchain_pg_embedding.embedding AS langchain_pg_embedding_embedding, langchain_pg_embedding.document AS langchain_pg_embedding_document, langchain_pg_embedding.cmetadata AS langchain_pg_embedding_cmetadata, langchain_pg_embedding.custom_id AS langchain_pg_embedding_custom_id, langchain_pg_embedding.uuid AS langchain_pg_embedding_uuid, langchain_pg_embedding.embedding <=> %(embedding_1)s AS distance 
FROM langchain_pg_embedding JOIN langchain_pg_collection ON langchain_pg_embedding.collection_id = langchain_pg_collection.uuid 
WHERE langchain_pg_embedding.collection_id = %(collection_id_1)s::UUID ORDER BY distance ASC 
 LIMIT %(param_1)s]
[parameters: {'embedding_1': '[-0.013254907913506031,-0.006138861179351807,0.08671516925096512,-0.017247946932911873,0.016995076090097427,0.030227595940232277,0.09061761200428009, ... (7778 characters truncated) ... 12,0.002833177801221609,0.009655985049903393,0.03879620134830475,-0.03827172517776489,-0.06026271730661392,0.048488982021808624,0.012769239023327827]', 'collection_id_1': UUID('14ba3944-9c65-4b17-95e0-fd402ed4fd97'), 'param_1': 4}]
(Background on this error at: https://sqlalche.me/e/20/9h9h)

In [None]:
db2_openai.similarity_search("covid19")

DataError: (psycopg2.errors.DataException) different vector dimensions 384 and 1536

[SQL: SELECT langchain_pg_embedding.collection_id AS langchain_pg_embedding_collection_id, langchain_pg_embedding.embedding AS langchain_pg_embedding_embedding, langchain_pg_embedding.document AS langchain_pg_embedding_document, langchain_pg_embedding.cmetadata AS langchain_pg_embedding_cmetadata, langchain_pg_embedding.custom_id AS langchain_pg_embedding_custom_id, langchain_pg_embedding.uuid AS langchain_pg_embedding_uuid, langchain_pg_embedding.embedding <=> %(embedding_1)s AS distance 
FROM langchain_pg_embedding JOIN langchain_pg_collection ON langchain_pg_embedding.collection_id = langchain_pg_collection.uuid 
WHERE langchain_pg_embedding.collection_id = %(collection_id_1)s::UUID ORDER BY distance ASC 
 LIMIT %(param_1)s]
[parameters: {'embedding_1': '[-0.010619975326236507,-0.022180709360818767,0.008566206078949222,-0.020431693065717082,-0.01098435341060851,0.013819880392862784,-0.0179009190356484 ... (32540 characters truncated) ... 0.034026320061761095,-0.035669336949707,0.02082919643048654,-0.0034019694633152585,-0.016721657935911967,-0.003663659457851915,-0.004365916240468248]', 'collection_id_1': UUID('51d06ccb-b2c5-4757-a50b-b4cbc1109b1f'), 'param_1': 4}]
(Background on this error at: https://sqlalche.me/e/20/9h9h)

In [33]:
db2_hf.similarity_search("covid-19", k=2)

[Document(page_content='# ArticleTitle\nCoronavirus Disease 2019 (COVID-19): Prevention and Control in the Radiology Department.\n AbstractText\n\n AuthorList\nHamm, Rebecca\n ArticleId\n91/5/485\n PubMedPubDate\n2020-5-9'),
 Document(page_content='# ArticleTitle\nCOVID-19 and Infection Disease and Health.\n AbstractText\n\n AuthorList\nMitchell, Brett G\n ArticleId\nS2468-0451(21)00023-7\n PubMedPubDate\n2021-5-2')]

In [34]:
retreiver_db = db_hf.as_retriever(search_kwargs={'k':3})
retreiver_db.search_type

'similarity'

### +Hyde 

In [35]:
hyde_embedding_openai = HypotheticalDocumentEmbedder.from_llm(llm, embeddings, prompt_key="web_search")
hyde_embedding_gte = HypotheticalDocumentEmbedder.from_llm(llm, embeddings2, prompt_key="web_search")

#### CacheBackedEmbeddings
Embeddings can be stored or temporarily cached to avoid needing to recompute them.

In [37]:
# not tested yet
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore


store = LocalFileStore("./cache_gte_pubmed/")
cached_hyde_embedding_gte = CacheBackedEmbeddings.from_bytes_store(
    hyde_embedding_gte, store, 
)

store2 = LocalFileStore("./cache_openai/")
cached_hyde_embedding_openai = CacheBackedEmbeddings.from_bytes_store(
    hyde_embedding_openai, store2, 
)

In [38]:
# the cache is empty prior to embedding
list(store.yield_keys())

[]

In [40]:
# db_hyde_openai = PGVector(
# 	connection_string=CONNECTION_STRING,
# 	embedding_function=hyde_embedding_openai,
# 	collection_name="my_collection",
# 	distance_strategy=DistanceStrategy.COSINE,
# )

# db_hyde_gte = PGVector(
#     connection_string=CONNECTION_STRING_2,
#     embedding_function=hyde_embedding_gte,
#     collection_name="pubmed",
#     distance_strategy=DistanceStrategy.COSINE,
# )

db_hyde_openai = PGVector(
	connection_string=CONNECTION_STRING,
	embedding_function=cached_hyde_embedding_openai,
	collection_name="my_collection",
	distance_strategy=DistanceStrategy.COSINE,
)

db_hyde_gte = PGVector(
    connection_string=CONNECTION_STRING_2,
    embedding_function=cached_hyde_embedding_gte,
    collection_name="pubmed",
    distance_strategy=DistanceStrategy.COSINE,
)

In [41]:
retreiver_openai = db_hyde_openai.as_retriever(search_kwargs={'k':3})
retreiver_gte = db_hyde_gte.as_retriever(search_kwargs={'k':3})

In [42]:
retreiver_openai.search_type

'similarity'

In [43]:
retriever_openai_chain = RetrievalQA.from_llm(llm=llm, retriever=retreiver_openai, return_source_documents=True)
retriever_gte_chain = RetrievalQA.from_llm(llm=llm, retriever=retreiver_gte, return_source_documents=True)


In [44]:
retriever_openai_chain("what is covid-19?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "what is covid-19?"
}
[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: what is covid-19?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [6.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " \n\nCovid-19 is a contagious virus that was first identified in 2019. The virus is believed to have originated in Wuhan, China, and spread quickly around the world. It is now a global pandemic that has infected millions of people and caused thousands of deaths. Symptoms of Covid-19 include a dry cough, fever, fatigue, and difficulty breathing. In some cases, people may also experience body aches, sore throat, and loss of smell and taste. It is important to practice social distancing, wear a face mask, and wash your hands often to h

{'query': 'what is covid-19?',
 'result': ' Covid-19 is a viral infection transmitted by respiratory droplet spread. It is caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). Common signs and symptoms can include fever, dry cough, fatigue, and sputum production. In some people it progresses to cause a life-threatening respiratory syndrome.',
 'source_documents': [Document(page_content='Cochrane\nLibrary\nTrusted evidence.\nInformed decisions.\nBetter health.\n\xa0\n\xa0\nCochrane Database of Systematic Reviews\nB A C K G R O U N D\n-\nDescription of the condition\nThe clinical syndrome coronavirus disease 2019 (COVID-19) is\na new, rapidly emerging zoonotic infectious disease caused by\nsevere acute respiratory syndrome coronavirus 2 (SARS-CoV-2;\nWHO 2020a). On 22 March 2020, the World Health Organization\n(WHO) declared the current COVID-19 outbreak to be a pandemic,\nwith the outbreak resulting in more than 119\xa0 million confirmed\ncases and over 2.5\xa0million

### Lost in the middle: The problem with long contexts
No matter the architecture of your model, there is a substantial performance degradation when you include 10+ retrieved documents. In brief: When models must access relevant information in the middle of long contexts, they tend to ignore the provided documents. See: https://arxiv.org/abs/2307.03172


To avoid this issue you can re-order documents after retrieval to avoid performance degradation.

In [46]:
docs = retreiver_openai.get_relevant_documents(query)
docs

[32;1m[1;3m[llm/start][0m [1m[1:llm:OpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Please write a passage to answer the question \nQuestion: How to recover from dementia?\nPassage:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:llm:OpenAI] [6.91s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " \nRecovering from dementia is a complex process, and there is no one-size-fits-all approach. That said, there are a few things you can do to help support recovery. \n\nFirst, it’s important to get a proper diagnosis from a medical professional so that the appropriate treatment plan can be created. Additionally, a healthy lifestyle that includes exercise, a balanced diet, and socialization can help improve cognitive function and reduce symptoms. \n\nMoreover, cognitive stimulation is important in the recovery process. Stimulating activities such as puzzles, crafts, reading, and music can help to stimulate the brain and improve memory. \n\nFinall

[Document(page_content='7/16/2018\n18/18\nNondrug behavior therapy has an important place in dementia management. The primary goals are to make\nthe patient’s life comfortable, uncomplicated, and safe. Preparing lists, schedules, calendars, and labels can\nbe helpful in the early stages. It is also useful to stress familiar routines, walks, and simple physical exercises.\nFor many demented patients, memory for events is worse than their ability to carry out routine activities,\nand they may still be able to take part in activities such as walking, bowling, dancing, singing, bingo, and golf.\nDemented patients o�en object to losing control over familiar tasks such as driving, cooking, and handling\nfinances. Attempts to help or take over may be greeted with complaints, depression, or anger. Hostile\nresponses on the part of the caregiver are counterproductive and sometimes even harmful. Reassurance,\ndistraction, and calm positive statements are more productive in this setting. Eventual

In [47]:
from langchain.document_transformers import LongContextReorder

reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(docs)

The `stuff` documents chain (“stuff” as in “to stuff” or “to fill”) is the most straightforward of the document chains.  
It takes a list of documents, inserts them all into a prompt and passes that prompt to an LLM.

This chain is well-suited for applications where documents are small and only a few are passed in for most calls

In [56]:
# We prepare and run a custom Stuff chain with reordered docs as context.
from langchain.chains import LLMChain, StuffDocumentsChain

# Override prompts
document_prompt = PromptTemplate(input_variables=["page_content"], template="{page_content}")
document_variable_name = "context"

stuff_prompt_override = """Given this text extracts:
-----
{context}
-----
Please answer the following question:
{query}"""

prompt = PromptTemplate(template=stuff_prompt_override, input_variables=["context", "query"])

# Instantiate the chain
llm_chain = LLMChain(llm=llm, prompt=prompt)

chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name,
)
answer = chain.invoke(input={'input_documents' : reordered_docs, 'query':query})
answer


[32;1m[1;3m[chain/start][0m [1m[1:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:StuffDocumentsChain > 2:chain:LLMChain] Entering Chain run with input:
[0m{
  "query": "How to recover from dementia?",
  "context": "7/16/2018\n18/18\nNondrug behavior therapy has an important place in dementia management. The primary goals are to make\nthe patient’s life comfortable, uncomplicated, and safe. Preparing lists, schedules, calendars, and labels can\nbe helpful in the early stages. It is also useful to stress familiar routines, walks, and simple physical exercises.\nFor many demented patients, memory for events is worse than their ability to carry out routine activities,\nand they may still be able to take part in activities such as walking, bowling, dancing, singing, bingo, and golf.\nDemented patients o�en object to losing control over familiar tasks such as driving, cooking, and handling\nfinances. Attempts to help or

{'input_documents': [Document(page_content='7/16/2018\n18/18\nNondrug behavior therapy has an important place in dementia management. The primary goals are to make\nthe patient’s life comfortable, uncomplicated, and safe. Preparing lists, schedules, calendars, and labels can\nbe helpful in the early stages. It is also useful to stress familiar routines, walks, and simple physical exercises.\nFor many demented patients, memory for events is worse than their ability to carry out routine activities,\nand they may still be able to take part in activities such as walking, bowling, dancing, singing, bingo, and golf.\nDemented patients o�en object to losing control over familiar tasks such as driving, cooking, and handling\nfinances. Attempts to help or take over may be greeted with complaints, depression, or anger. Hostile\nresponses on the part of the caregiver are counterproductive and sometimes even harmful. Reassurance,\ndistraction, and calm positive statements are more productive in th

In [55]:
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain( llm=llm, chain_type="stuff")
# chain = prompt | model | ...
answer = chain.run(input_documents = reordered_docs, question=query)
answer

[32;1m[1;3m[chain/start][0m [1m[1:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:StuffDocumentsChain > 2:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "How to recover from dementia?",
  "context": "7/16/2018\n18/18\nNondrug behavior therapy has an important place in dementia management. The primary goals are to make\nthe patient’s life comfortable, uncomplicated, and safe. Preparing lists, schedules, calendars, and labels can\nbe helpful in the early stages. It is also useful to stress familiar routines, walks, and simple physical exercises.\nFor many demented patients, memory for events is worse than their ability to carry out routine activities,\nand they may still be able to take part in activities such as walking, bowling, dancing, singing, bingo, and golf.\nDemented patients o�en object to losing control over familiar tasks such as driving, cooking, and handling\nfinances. Attempts to help

' Unfortunately, dementia is a progressive condition with no known cure. Treatment is focused on managing symptoms, improving quality of life, and providing support for caregivers.'