In [41]:
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter



In [42]:
# Embedding model used to vectorise the chunks for the vector store below.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
)

In [43]:
# Read the source PDF into LangChain documents.
loader = PyPDFLoader("data/raw/20240822_rbc_account_transfer.pdf")
docs = loader.load()

# Cut the loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=200 shared between neighbours).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

In [45]:
# Create Chroma vector store
#
# Give every chunk a stable, position-based ID. Without explicit IDs the store
# assigns fresh random ones on each call, so re-running this cell against the
# same persist directory would append another copy of every chunk and the
# retriever would start returning duplicate snippets. With stable IDs a re-run
# overwrites (upserts) the existing rows instead.
chunk_ids = [
    f"{chunk.metadata.get('source', 'unknown')}:{position}"
    for position, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="data/chroma_db",
)

# NOTE(review): with chromadb >= 0.4 writes are presumably persisted
# automatically and this call is a deprecated no-op - confirm the installed
# version before removing it.
vectorstore.persist()

## LLM and Retrieval QA Chain

In [46]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.chains import RetrievalQA


In [47]:
# LLM used to generate answers.
# `streaming=True` was dropped: OllamaLLM has no such field, so the argument
# was silently ignored (the chain repr shows `OllamaLLM(model='mistral')`).
# Token streaming is requested per call (e.g. via `.stream(...)`), not here.
model = OllamaLLM(model="mistral")

# Expose the Chroma store as a retriever with its default search settings.
retriever = vectorstore.as_retriever()

# Retrieval-augmented QA chain: the retrieved chunks are placed into the
# prompt's context, and the chunks used are returned alongside the answer
# under "source_documents".
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=retriever,
    return_source_documents=True,
)

# Display the assembled chain (prompt template, LLM and retriever settings).
qa_chain

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=OllamaLLM(model='mistral'), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x11ecc8ef0>, search_kwargs={}))

In [None]:
# Ask a question about the indexed PDF and print the answer with its sources.
query = ""

if not query.strip():
    # Guard the empty placeholder so no retrieval / LLM call is made for a
    # blank question.
    print("Set `query` to a non-empty question before running this cell.")
else:
    # Use `.invoke` rather than calling the chain object directly; calling a
    # chain like a function is deprecated. RetrievalQA reads its input from
    # the "query" key.
    response = qa_chain.invoke({"query": query})

    print("Answer:", response["result"])

    # Optional: Show sources (first 300 characters of each retrieved chunk)
    for doc in response["source_documents"]:
        print("\nSource snippet:\n", doc.page_content[:300])