In [74]:
# pip install -U langchain_community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv

In [75]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# that can occur when faiss is loaded next to other native libraries — confirm.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Load variables from a local .env file into the process environment.
load_dotenv()

# Report only whether the LangSmith key is configured. Printing the value itself
# would store the secret in the notebook's saved output.
print('LANGSMITH_API_KEY set:', bool(os.environ.get('LANGSMITH_API_KEY')))

[output redacted: a LangSmith API key was printed here — rotate the key and do not print secrets]


In [76]:
# from langchain_community.document_loaders import CSVLoader
# loader = CSVLoader('./data/data.csv', csv_args={
#                                                         'delimiter': ',',
#                                                         'quotechar': '"',
#                                                         'fieldnames': ["Store","Brand","Product_Category","State","Sales"]
#                                                     })
# docs = loader.load()
# docs[1].page_content
# pdf_loader = PyMuPDFLoader('./data/kaynes.pdf')
# docs = pdf_loader.load()
# doc = docs[0]

In [77]:
import os
from langchain_community.document_loaders import CSVLoader

# Column names handed to the CSV reader for every file.
# NOTE(review): with explicit fieldnames the reader treats the first line of a
# file as data, so a header row (if the files have one) becomes a document — confirm.
CSV_FIELDNAMES = ["Store","Brand","Product_Category","State","Sales"]

# Walk ./data recursively and load every *.csv file, one Document per row.
docs = []
for folder, _subfolders, filenames in os.walk('./data'):
    for filename in filenames:
        if not filename.endswith('.csv'):
            continue
        print('file to be added', filename)
        csv_path = os.path.join(folder, filename)
        csv_loader = CSVLoader(
            file_path=csv_path,
            csv_args={
                'delimiter': ',',
                'quotechar': '"',
                'fieldnames': CSV_FIELDNAMES,
            },
        )
        docs.extend(csv_loader.load())

len(docs)


file to be added data.csv


289

### Document Splitter

In [78]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk limits, measured in characters because length_function is `len`.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)

# Split every loaded document; the cell displays the resulting chunk count.
chunks = text_splitter.split_documents(docs)
len(chunks)

289

In [79]:
import tiktoken
# Look up the tokenizer tiktoken associates with the 'gpt-4o-mini' model name.
# In the visible cells `encoding` is only referenced by the commented-out
# token-count check below.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
# len(encoding.encode(docs[0].page_content)), len(encoding.encode(chunks[0].page_content))

### Document Vector Embeddings

In [80]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Embedding model served by the local Ollama instance.
EMBEDDING_MODEL = "nomic-embed-text"
OLLAMA_BASE_URL = "http://localhost:11434"

embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL, base_url=OLLAMA_BASE_URL)

# Embed one throwaway sentence; its length is what later sizes the FAISS index.
single_vector = embeddings.embed_query("this is some text data")
len(single_vector)

768

In [81]:
# Size the flat L2 index from the length of the probe embedding.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)
index.d

768

In [82]:
# Build the store on a brand-new FAISS index every time this cell runs.
# Reusing the `index` object from an earlier cell makes this cell non-idempotent:
# a re-run appends the same vectors to that index again while
# `index_to_docstore_id` restarts empty, so index rows and the id map drift apart.
index = faiss.IndexFlatL2(len(single_vector))
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Embed and insert every chunk; `ids` holds the ids returned for the added documents.
ids = vector_store.add_documents(documents=chunks)

# Invariant: one index row and one id-map entry per added chunk.
assert index.ntotal == len(ids) == len(vector_store.index_to_docstore_id)
# vector_store.index_to_docstore_id, len(ids)
# ids

In [83]:
# db_name = "q3 results"
# vector_store.save_local(db_name)

### Retrieval

In [84]:
question = "How much is the sales of Deluxe Supermarket in USA"

# Plain similarity search; `top_k` is the number of results to return.
top_k = 5
vector_store.search(query=question, search_type="similarity", k=top_k)

[Document(id='4334f528-9820-4184-93b3-249740b75e72', metadata={'source': './data/data.csv', 'row': 87}, page_content='Store: Deluxe Supermarket\nBrand: Hermanos\nProduct_Category: Other\nState: USA\nSales: 100484090.49'),
 Document(id='50ea81e9-87bf-4353-9503-4aadc8567f5b', metadata={'source': './data/data.csv', 'row': 97}, page_content='Store: Deluxe Supermarket\nBrand: Jumbo\nProduct_Category: Food\nState: USA\nSales: 43831561.69'),
 Document(id='e3eeb30f-bc59-4e93-806d-5e74ed1a454d', metadata={'source': './data/data.csv', 'row': 37}, page_content='Store: Deluxe Supermarket\nBrand: Even Better\nProduct_Category: Household\nState: USA\nSales: 19539909.99'),
 Document(id='8edc7558-5aed-4eee-ae20-56a04b9afe33', metadata={'source': './data/data.csv', 'row': 7}, page_content='Store: Deluxe Supermarket\nBrand: Even Better\nProduct_Category: Drink\nState: USA\nSales: 20500599.77'),
 Document(id='c87d247a-3459-4757-8c31-2ab0e801faf7', metadata={'source': './data/data.csv', 'row': 27}, page_c

In [85]:
# MMR retriever settings.
# NOTE(review): LangChain documents lambda_mult=1 as the minimum-diversity
# setting, so this likely ranks close to plain similarity — confirm it is intended.
mmr_search_kwargs = {
    'k': 5,
    'fetch_k': 100,
    'lambda_mult': 1,
}
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

# question = "how was the result of Anant Raj Limited in Q3 2024"
# Run the retriever once on the current `question` so the hits can be inspected.
retrieved_docs = retriever.invoke(question)



### RAG with DeepSeek-R1 (14B) on Ollama (Llama 3.2 kept as a commented-out alternative)

In [86]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_ollama import ChatOllama
# Alternative model kept for reference:
# model = ChatOllama(model="llama3.2", base_url="http://localhost:11434")
chat_model_config = {
    "model": "deepseek-r1:14b",
    "base_url": "http://localhost:11434",
}
model = ChatOllama(**chat_model_config)

# Smoke test: a trivial prompt confirms the local model responds.
model.invoke("hi")

AIMessage(content='<think>\n\n</think>\n\nHello! How can I assist you today? 😊', additional_kwargs={}, response_metadata={'model': 'deepseek-r1:14b', 'created_at': '2025-02-03T06:46:21.268268Z', 'done': True, 'done_reason': 'stop', 'total_duration': 4196591834, 'load_duration': 811668167, 'prompt_eval_count': 4, 'prompt_eval_duration': 2908000000, 'eval_count': 16, 'eval_duration': 474000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-19f920fa-b4ad-4397-8c7e-432203636e39-0', usage_metadata={'input_tokens': 4, 'output_tokens': 16, 'total_tokens': 20})

In [87]:
# Pull the "rlm/rag-prompt" template from the LangChain Hub and display it.
# Note: `prompt` is rebound to a custom template in the following cell, so this
# pulled template is shown for reference only.
prompt = hub.pull("rlm/rag-prompt")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [88]:
# Custom RAG prompt text. It is written as explicit line literals so the
# leading indentation and the trailing spaces after the placeholders are
# visible; the resulting string is identical to the triple-quoted form.
rag_template = (
    "\n"
    "    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.\n"
    "    If you don't know the answer, just say that you don't know.\n"
    "    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.\n"
    "    Question: {question} \n"
    "    Context: {context} \n"
    "    Answer:\n"
)

# `prompt` is the chat template consumed by the RAG chain.
prompt = ChatPromptTemplate.from_template(rag_template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="\n    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.\n    If you don't know the answer, just say that you don't know.\n    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.\n    Question: {question} \n    Context: {context} \n    Answer:\n"), additional_kwargs={})])

In [89]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = (document.page_content for document in docs)
    return separator.join(page_texts)

# print(format_docs(docs))

# First stage: fan the incoming question out into the two prompt variables.
# "context" runs the retriever and joins the hits; "question" is passed through.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | model | StrOutputParser()

# Other questions tried:
# question = "How much is the sales of Deluxe Supermarket in USA"
# question = "What is cumulative sale of Even Better Brand in Deluxe Supermarket"
# NOTE(review): the retriever is configured with k=5, so an aggregate question
# like this one is answered from at most five retrieved rows.
question = "How much is the total sale of Even Better Brand in Deluxe Supermarket"
output = rag_chain.invoke(question)
output

"<think>\nOkay, I need to figure out the total sales for the Even Better Brand in Deluxe Supermarket based on the provided context. Let me start by going through each piece of information step by step.\n\nFirst, I see that there are five entries, all from Deluxe Supermarket and the brand is Even Better. Each entry has a product category and state along with sales figures.\n\nI should list out each sale to make sure I don't miss any:\n\n1. Food category in Ontario: $25,737,667.35\n2. Household in Ontario: $5,929,428.36\n3. Drink in USA: $20,500,599.77\n4. Food in Canada: $52,675,649.46\n5. Household in USA: $19,539,909.99\n\nWait a second, I notice that entries 4 and 5 are from Canada and the USA respectively. The user is asking about Deluxe Supermarket's sales for Even Better Brand, but they didn't specify the state or country. However, all entries are under Deluxe Supermarket, so I should include all of them to get the total.\n\nNow, I'll add up each sale:\n\nStarting with $25,737,667