In [45]:
from langchain_community.llms import Ollama
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.llms import Ollama

llm=Ollama(model="llama3.2:1b")


In [46]:
from langchain.document_loaders import PyPDFLoader
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("us_census/acsbr-015.pdf"),
    PyPDFLoader("us_census/acsbr-016.pdf"),
    PyPDFLoader("us_census/acsbr-017.pdf"),
    PyPDFLoader("us_census/p70-178.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [48]:
splits=text_splitter.split_documents(docs)

In [49]:
len(splits) # 4 splits

201

In [50]:
persist_directory = 'docs/chroma/'
embedding= OllamaEmbeddings(model="llama3.2:1b")
vectordb = Chroma.from_documents(documents=splits[:3],persist_directory=persist_directory, embedding=embedding)

In [51]:
print(vectordb._collection.count())

4


In [52]:
question = "is there an email i can ask for help"

In [53]:
docs = vectordb.similarity_search(question)
print(docs[0].page_content)

employment from 2021 to 2022. More information is available at 
<www.bls.gov/opub/mlr/2023/article/unemployment-rate-returned-
to-its-prepandemic-level-in-2022.htm>.
2 Juliette Cubanski et al., “What Happens When COVID-19 
Emergency Declarations End? Implications for Coverage, Costs, and 
Access?,” Kaiser Family Foundation, 2023, <www.kff.org/coronavirus-
covid-19/issue-brief/what-happens-when-covid-19-emergency-
declarations-end-implications-for-coverage-costs-and-access/>.
3 For more information, refer to the American Rescue Plan Act, 
P.L. 117-2, March 11, 2021, <www.congress.gov/bill/117th-congress/
house-bill/1319/text>.
of the Midwest.4, 5 Kentucky, Maine, and New Mexico 
created state-based health insurance marketplaces on 
November 1, 2021, to replace their previously federally 
run exchanges.6 State and federal policies designed to 
increase public coverage may also affect the supply 
and demand for private coverage. As a result, a variety 
of changes in coverage rates are pos

In [54]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

qa_chain

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=Ollama(model='llama3.2:1b'), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7c7ed8040550>, search_kwargs={}))

In [55]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, 
don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [56]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [57]:
question = "US Data for 2022?"

In [58]:
result = qa_chain({"query": question})

In [59]:
result["result"]

'Thanks for asking! Based on the context provided, it appears that the US employment rate returned to its pre-pandemic level in 2022.'

In [60]:
result["source_documents"][0]

Document(metadata={'author': 'U.S. Census Bureau', 'creationdate': '2023-09-09T07:52:17-04:00', 'creator': 'Adobe InDesign 18.2 (Windows)', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'page': 0, 'page_label': '1', 'producer': 'Adobe PDF Library 17.0', 'source': 'us_census/acsbr-015.pdf', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'total_pages': 18, 'trapped': '/false'}, page_content='employment from 2021 to 2022. More information is available at \n<www.bls.gov/opub/mlr/2023/article/unemployment-rate-returned-\nto-its-prepandemic-level-in-2022.htm>.\n2 Juliette Cubanski et al., “What Happens When COVID-19 \nEmergency Declarations End? Implications for Coverage, Costs, and \nAccess?,” Kaiser Family Foundation, 2023, <www.kff.org/coronavirus-\ncovid-19/issue-brief/what-happens-when-covid-19-emergency-\ndeclarations-end-implications-for-coverage-costs-and-access/>.\n3 For more information, refer to the American Rescue Plan Act, \n

In [61]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [62]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [63]:
question = "US Data for 2022?"
result = qa({"question": question})

In [64]:
result["answer"]

"I don't know if the US data for 2022 shows an increase or decrease in employment from 2021 to 2022."