In [34]:
# Natural-language query that is used both for the similarity search and for
# the retrieval chain invoked later in this notebook.
question = (
    "what percentage of patients have pathogenic germline gene variants?"
)

In [35]:
# !pip install pypdf
from langchain_community.document_loaders import PyPDFLoader

In [36]:
# PDF loader bound to the paper that the question will be answered from.
loader = PyPDFLoader(file_path="documents/nihms-1828057.pdf")

In [37]:
# Load the PDF as-is, without chunking. `load_and_split()` would already chunk
# the text with the library's default splitter, and the notebook splits the
# result again with its own RecursiveCharacterTextSplitter further down, so
# the text would be split twice with different settings. `load()` keeps the
# explicit splitter as the only place where chunking happens.
pages = loader.load()

In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut the loaded documents into smaller chunks before they
# are embedded; the size and overlap settings are the two tunables here.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)

In [39]:
# Apply the splitter to everything the loader returned; `documents` is what
# gets embedded and indexed below.
documents = text_splitter.split_documents(documents=pages)

In [40]:
# Compare how many items the loader produced with how many chunks the
# splitter produced.
print(len(pages), "vs", len(documents))

28 vs 88


In [41]:
import os
from dotenv import load_dotenv

# Pull key/value pairs from the local .env file into the process environment.
load_dotenv(".env")

# Environment variable names are case-sensitive on POSIX systems. Keep the
# lowercase name this notebook has always used, but also accept the
# conventional upper-case spelling so a standard .env file works too.
openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")

# Fail here with a clear message instead of passing None to the OpenAI
# clients and getting a less obvious error on the first API call.
if not openai_api_key:
    raise RuntimeError(
        "OpenAI API key not found: define 'openai_api_key' or "
        "'OPENAI_API_KEY' in .env or in the environment."
    )

In [42]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used to vectorise the document chunks. No model is named
# here, so the library's default embedding model applies.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [43]:
# !pip install faiss-cpu
from langchain_community.vectorstores import FAISS

# Embed every chunk and index the vectors in a FAISS store.
# NOTE(review): the following cell rebinds `vector` to a Chroma store, so this
# index (and the embedding calls made to build it) is discarded when both
# cells are run in order.
vector = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [60]:
from langchain_community.vectorstores import Chroma

# Embed every chunk and index the vectors in a Chroma store. This binding of
# `vector` is the one the similarity search and the retriever below operate on.
vector = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [61]:
# Sanity check of retrieval on its own: list the stored chunks closest to the
# question together with their scores, before any LLM is involved.
# NOTE(review): whether a higher or lower score means "more similar" depends
# on the vector store backend — confirm before interpreting the numbers.
vector.similarity_search_with_score(question)

[(Document(page_content='to increasing rates of obesity and diabetes, which are potentially modifiable risk factors for \nPDAC.\nAbout 3.8% to 9.7% of patients with PDAC have pathogenic germline gene variants \nthat increase susceptibility to PDAC. These variants occur mostly in DNA damage repair \ngenes.25–27 The most common variants in PDAC include BRCA2 , BRCA1  (hereditary breast \nand ovary cancer syndrome), and ATM  (ataxia telangiectasia syndrome). Germline BRCA2 \nvariants are associated with an increased risk for PDAC (OR, 9.07 [95% CI, 6.33–12.98]) \nmore commonly than BRCA1  (OR, 2.95 [95% CI, 1.49–5.60]) or ATM  variants (OR, \n8.96 [95% CI, 6.12–12.98]).28 Uncommon (1% of patients with PDAC) but therapeutically \nimportant inheritable germline variants also occur in PDAC in mismatch repair deficiency \ngenes MLH1 , MSH2 , MSH6 , and PMS2  as part of Lynch syndrome.29 In 2019, the National \nComprehensive Cancer Network guidelines recommended that all patients newly diagnos

In [70]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the question-answering step. It takes two variables:
#   {context}  - the retrieved text the answer must be based on
#   {question} - the user's question
# The instructions ask for an answer grounded in the context, an explicit
# statement when the context does not contain the answer, and a summary of
# the context in every case. The template is written flush-left so that no
# editor indentation or trailing whitespace is sent to the model.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question based on the provided context.
If the context does not answer the question, state that you could not find an exact answer.
In every case, also provide a summary of the context.

Context:
{context}

Question:
{question}

Your response:
"""
)

In [71]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer. Only the API key is passed, so
# model name and sampling settings fall back to the library defaults.
# NOTE(review): consider pinning the model and temperature for reproducible
# answers — confirm what the defaults are for the installed version.
llm = ChatOpenAI(openai_api_key=openai_api_key)

In [72]:
from langchain_core.output_parsers import StrOutputParser
# Final stage of the chain: turns the chat model's output into the value that
# is printed as the response below.
output_parser = StrOutputParser()

In [73]:
# Wrap the vector store in the retriever interface so it can be used as a
# chain step. No arguments are passed, so the store's default search settings
# are used.
retriever = vector.as_retriever()

In [74]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough


def format_docs(docs):
    """Join retrieved chunks into one plain-text context string.

    Args:
        docs: iterable of retrieved documents, each exposing its text as
            ``page_content``.

    Returns:
        The chunks' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Build the prompt inputs from the single input string (the question):
#   "context"  - the retrieved chunks, flattened to plain text. Without the
#                formatter the prompt would receive the Python repr of a list
#                of Document objects (metadata, escaped newlines and all).
#   "question" - the input string passed through unchanged.
setup_and_retrieval = RunnableParallel(
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
)

In [75]:
# Compose the full pipeline: build {context, question} from the input, fill
# the prompt template, call the chat model, then parse its output.
chain = setup_and_retrieval | prompt | llm | output_parser

In [76]:
# Run the whole chain on the question; the parsed model output is kept in
# `response` for display below.
response = chain.invoke(question)

In [77]:
# Echo the question under a "question:" heading, ahead of the answer.
print("question:", question, sep="\n")

question:
what percentage of patients have pathogenic germline gene variants?


In [78]:
# Show the answer produced by the chain.
print(response)

Based on the provided context, it is stated that about 3.8% to 9.7% of patients with PDAC (pancreatic ductal adenocarcinoma) have pathogenic germline gene variants.


Based on the provided context, it is stated that about 3.8% to 9.7% of patients with PDAC (pancreatic ductal adenocarcinoma) have pathogenic germline gene variants.

In [79]:
# Shell escape that opens the PDF(s) matching the glob, presumably to check
# the answer against the source paper by eye.
# NOTE(review): `open` looks like the macOS launcher; this cell will likely
# fail on other platforms — confirm before sharing the notebook.
!open documents/nihms-1*.pdf