In [28]:
# pip install -U langchain_community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv

In [None]:
import os
import warnings
from dotenv import load_dotenv

# Allow duplicate OpenMP runtimes to be loaded (presumably required by the
# FAISS build used further down -- TODO confirm it is still needed).
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Load variables from a local .env file into os.environ.
load_dotenv()

# Never echo the key itself: cell outputs are saved with the notebook and
# would leak the secret. Report only whether the key is configured.
print('LANGSMITH_API_KEY set:', bool(os.environ.get('LANGSMITH_API_KEY')))

In [30]:
from langchain_community.document_loaders import PyMuPDFLoader
# pdf_loader = PyMuPDFLoader('./data/kaynes.pdf')
# docs = pdf_loader.load()
# doc = docs[0]

In [None]:
import os

# Collect the documents of every PDF found under ./data (searched recursively).
docs = []
for root, dirs, files in os.walk('./data'):
    # os.walk yields entries in arbitrary order; sorting `dirs` in place and
    # iterating `files` sorted makes the traversal -- and therefore the order
    # of `docs` -- reproducible between runs.
    dirs.sort()
    for file in sorted(files):
        # Compare the extension case-insensitively so e.g. 'REPORT.PDF'
        # is not silently skipped.
        if file.lower().endswith('.pdf'):
            print('file to be added', file)
            pdf_loader = PyMuPDFLoader(os.path.join(root, file))
            pages = pdf_loader.load()
            docs.extend(pages)

# Cell output: total number of loaded documents.
len(docs)


### Document Splitter

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size=1000 with chunk_overlap=100, both measured
# with len() (i.e. in characters); separators are treated as literal strings
# rather than regular expressions.
splitter_kwargs = {
    "chunk_size": 1000,
    "chunk_overlap": 100,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

# Split every loaded document; the cell output is the number of chunks.
chunks = text_splitter.split_documents(docs)
len(chunks)

In [33]:
import tiktoken
# Tokenizer that tiktoken associates with the 'gpt-4o-mini' model name. In the
# cells visible here it is referenced only by the commented-out token-count
# check below.
# NOTE(review): the embedding and chat models used later in this notebook are
# served by Ollama ('nomic-embed-text', 'llama3.2'), so counts from this
# tokenizer are presumably only an approximation for them -- confirm before
# relying on the numbers.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')
# len(encoding.encode(docs[0].page_content)), len(encoding.encode(chunks[0].page_content))

### Document Vector Embeddings

In [None]:
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Embedding model reached through the Ollama endpoint on localhost.
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434",
)

# Embed one throwaway sentence; the cell output is the length of the
# returned vector, which a later cell uses to size the FAISS index.
single_vector = embeddings.embed_query("this is some text data")
len(single_vector)

In [None]:
# Size the FAISS flat L2 index from the probe embedding, then echo the
# index's `d` attribute so the dimension can be checked by eye.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)
index.d

In [36]:
# Build the store on a brand-new, empty index every time this cell runs.
# add_documents() appends vectors to the index object it is handed, so reusing
# the already-populated `index` on a re-run would leave the index holding more
# vectors than the fresh docstore / id mapping below know about.
index = faiss.IndexFlatL2(index.d)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

# Embed every chunk and add it to the store; `ids` holds the returned ids.
ids = vector_store.add_documents(documents=chunks)
# vector_store.index_to_docstore_id, len(ids)
# ids

In [24]:
# db_name = "q3 results"
# vector_store.save_local(db_name)

### Retrieval

In [None]:
# Try a plain similarity search directly against the vector store.
question = "what is capex planned by kaynes"
vector_store.search(
    query=question,
    search_type="similarity",
    k=5,  # number of results to return
)

In [38]:
# MMR retriever settings: fetch_k=100 candidates are considered and k=5 kept.
# NOTE(review): LangChain documents lambda_mult=1 as the minimum-diversity end
# of the MMR range -- confirm that relevance-only ranking is intended here.
mmr_kwargs = {'k': 5, 'fetch_k': 100, 'lambda_mult': 1}
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)

# question = "how was the result of Anant Raj Limited in Q3 2024"
# Run the retriever on the current `question` and keep the hits for inspection.
retrieved_docs = retriever.invoke(question)



### RAG with LLAMA 3.2 on OLLAMA

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_ollama import ChatOllama
# Chat model reached through the Ollama endpoint on localhost; the "hi" call
# is a quick smoke test whose reply is shown as the cell output.
model = ChatOllama(
    model="llama3.2",
    base_url="http://localhost:11434",
)
model.invoke("hi")

In [None]:
# Pull the "rlm/rag-prompt" prompt through the LangChain hub and display it.
# NOTE: the next cell rebinds `prompt` to a locally defined template, so the
# prompt pulled here is shown for reference only and is not the one the RAG
# chain ends up using.
prompt = hub.pull("rlm/rag-prompt")
prompt

In [None]:
# Custom RAG prompt text. The {question} and {context} placeholders are the
# two variables the chain supplies when it is invoked.
rag_template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
    Question: {question} 
    Context: {context} 
    Answer:
"""

# Wrap the raw text in a chat prompt template and display it.
prompt = ChatPromptTemplate.from_template(rag_template)
prompt

In [54]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each item in ``docs``.

    The texts keep their input order and are separated by a blank line.
    An empty ``docs`` yields an empty string.
    """
    page_texts = (item.page_content for item in docs)
    return "\n\n".join(page_texts)

# print(format_docs(docs))

# Fan the incoming question out into the two prompt variables: "context" is
# the retriever's hits flattened to text by format_docs, "question" is the
# input passed through unchanged. The result then flows through the prompt,
# the chat model and a parser that returns the reply as a plain string.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | model | StrOutputParser()

# Alternative questions; uncomment one to override the `question` assigned
# in an earlier cell.
# question = "What are capex plans of Kaynes technologies?"
# question = "How was the result Anant Raj Limited in Q2 2025"
# question = "How was the result Anant Raj Limited"
# question = "Who are the management team of Kaynes Technologies"
# question = "How is the order book or Kaynes Technologies"
# question = "How is the order book or Polycab"
# question = "How to write LLM prompts"
# NOTE(review): the output saved under this cell talks about writing LLM
# prompts, so it was presumably produced with the last line above active --
# re-run to refresh it for the current `question`.
output = rag_chain.invoke(question)
output

"I don't have any information about writing LLM prompts from the provided context. The text appears to be a transcript of an earnings conference call for companies such as Kaynes Technology India Limited and Polycab India Limited. It does not contain any relevant information on writing LLM prompts."