In [162]:
# pip install -U langchain_community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv

In [163]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): KMP_DUPLICATE_LIB_OK presumably works around a duplicate
# OpenMP runtime error when native libraries are loaded — confirm it is needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Load variables from a local .env file into os.environ.
load_dotenv()

# Fail fast if the key is missing (same KeyError the original lookup raised),
# but never echo the secret itself: notebook outputs are saved and shared.
if 'LANGSMITH_API_KEY' not in os.environ:
    raise KeyError('LANGSMITH_API_KEY')
print('LANGSMITH_API_KEY is set')

[REDACTED: LangSmith API key removed from saved output — rotate this key]


In [164]:
from langchain_community.document_loaders import PyMuPDFLoader
# pdf_loader = PyMuPDFLoader('./data/kaynes.pdf')
# docs = pdf_loader.load()
# doc = docs[0]

In [165]:
import os

# Collect the documents loaded from every PDF found anywhere under ./data.
docs = []
for folder, _subfolders, filenames in os.walk('./data'):
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue  # only PDF files are loaded
        print('file to be added', filename)
        pdf_path = os.path.join(folder, filename)
        docs.extend(PyMuPDFLoader(pdf_path).load())

len(docs)


file to be added kaynes.pdf
file to be added anant-raj.pdf
file to be added polycab.pdf


61

### Document Splitter

In [166]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    is_separator_regex=False,
)

# Split every loaded document into chunks and show how many were produced.
chunks = text_splitter.split_documents(docs)
len(chunks)

218

In [167]:
import tiktoken
# Tokenizer lookup; in this notebook only the commented-out token-count
# check below refers to `encoding`.
TOKEN_MODEL = 'gpt-4o-mini'
encoding = tiktoken.encoding_for_model(TOKEN_MODEL)
# len(encoding.encode(docs[0].page_content)), len(encoding.encode(chunks[0].page_content))

### Document Vector Embeddings

In [168]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Embedding client pointed at an Ollama server on localhost.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)

# Embed one probe string; the length of the result is used to size the index.
single_vector = embeddings.embed_query("this is some text data")
len(single_vector)

768

In [169]:
# Size the flat L2 index from the length of the probe embedding.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)
index.d

768

In [170]:
# Build the store from an empty docstore and an empty index-to-id mapping,
# then add every chunk to it.
docstore = InMemoryDocstore()
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={},
)

ids = vector_store.add_documents(documents=chunks)
# vector_store.index_to_docstore_id, len(ids)
# ids

In [171]:
# db_name = "q3 results"
# vector_store.save_local(db_name)

### Retrieval

In [172]:
# question = "what is capex planned by kaynes"
# vector_store.search(query=question, search_type="similarity", k=5) # k is the number of results to return

In [173]:
# MMR retriever: fetch 100 candidates and return the top 5.
# NOTE(review): per LangChain's MMR docs, lambda_mult=1 corresponds to minimum
# diversity (relevance only) — confirm that is the intended setting.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 5, 'fetch_k': 100, 'lambda_mult': 1},
)

# Define the query in this cell: every earlier assignment of `question` is
# commented out, so on a fresh kernel the invoke below raised NameError.
question = "how was the result of Anant Raj Limited in Q3 2024"
retrieved_docs = retriever.invoke(question)



### RAG with LLAMA 3.2 on OLLAMA

In [174]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_ollama import ChatOllama
# Chat model served by an Ollama server on localhost.
model = ChatOllama(
    model="llama3.2",
    base_url="http://localhost:11434",
)
# model.invoke("hi")

In [179]:
# Pull the hub RAG prompt for reference only. It is kept under its own name so
# that re-running this cell cannot replace the `prompt` the RAG chain is built
# from (the following cell defines that prompt).
hub_prompt = hub.pull("rlm/rag-prompt")
hub_prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [176]:
# Raw template text with {question} and {context} placeholders; kept under its
# own name so `prompt` only ever refers to the ChatPromptTemplate.
prompt_template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
    Question: {question} 
    Context: {context} 
    Answer:
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

In [178]:
def format_docs(docs):
    """Return the ``page_content`` of each item in ``docs``, joined by blank lines."""
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)

# print(format_docs(docs))

# Inputs for the prompt: "context" comes from piping the retriever into
# format_docs, and "question" goes through RunnablePassthrough().
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | model | StrOutputParser()

question = "What are capex plans of Kaynes technologies"
# Alternative questions:
# question = "How was the result Anand Raj Limited in Q3 2024"
# question = "Who are the management team of Kaynes Technologies"
# question = "How is the order book or Kaynes Technologies"

# NOTE(review): the recorded answer mentions wires-and-cables and EHV cable
# capex, which may come from another company's PDF — verify the retrieved context.
output = rag_chain.invoke(question)
output

"Here are the relevant points about Kaynes Technology India Limited's capex plans:\n\n* Total capex over the next 5 years is estimated to be ₹ 80 billion.\n* Most of this capex will go towards the Wires and Cables business, with an expected asset turnover of 4x to 5x.\n* There is a specific mention of EHV cable capex, which is approximately ₹ 6-7 billion, planned for commissioning by the end of FY26.\n* Revenue potential from this EHV cable project is estimated with asset turns close to about 4x."