In [78]:
from langchain_community.document_loaders import PyPDFLoader
pdf_document = PyPDFLoader(file_path="Report.pdf")

In [79]:
documents = pdf_document.load()

In [80]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [81]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=".",
    chunk_size=100,
    chunk_overlap=10,
    length_function=len,
)

In [82]:
splitted_document = text_splitter.split_documents(documents)

In [83]:
from langchain_huggingface import HuggingFaceEmbeddings

In [84]:
hugingface_embedding_model = HuggingFaceEmbeddings(model = "all-MiniLM-L6-v2")

In [85]:
list_splitted_documents = [doc.page_content for doc in splitted_document]

In [86]:
embedded_documents =hugingface_embedding_model.embed_documents(list_splitted_documents)

In [87]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore import InMemoryDocstore

In [88]:
faiss_index_flat = faiss.IndexFlat(384)

In [89]:
vector_store = FAISS(
    embedding_function= hugingface_embedding_model,
    index = faiss_index_flat,
    docstore= InMemoryDocstore(),
    index_to_docstore_id= {}
)

In [90]:
vector_store.add_documents(splitted_document)

['11a289dd-1ac9-482b-a5d3-6201662890e8',
 '255cfb54-8a3c-4640-8fd7-9109627685cc',
 '84f064a5-3143-4429-a5f6-0acfb1547dd0',
 '1203c739-091f-4eac-92d9-27fbaf27f0f4',
 '52fd17c2-5fb1-4188-9d0d-3d58c52ce60c',
 '9e88495c-c672-4704-8930-f18e398c1c28',
 '697eeab9-81c0-47c5-adb6-252cfee57d5f',
 '2a73cd98-740d-4471-9248-4b9389b80430',
 '6151599a-73ad-4590-83e9-1686a19cc5ac',
 'a20d3023-0e56-46dc-94c7-9481a06143d5',
 '700f16e2-5faf-4395-bf01-95c535a80b44',
 '0061971a-d6a6-475a-bcc5-fd64c869ce84',
 '465a814a-6539-49f9-9343-0ad7b8c37bc1',
 '4911f125-cc07-49e1-8d15-e8205f415b1c',
 'f38bc6a1-4897-4726-a3ff-2b120d264108',
 '28b7e00f-6ebf-4b79-a479-c40992694950',
 '80d8d754-3762-4805-bbbb-aee4cc2bbb26',
 'bbff082f-d81d-466a-9273-98e6a14d9784',
 'efeef6be-d85a-41ec-85ea-a2d6956844be',
 'f7417722-cee6-4d56-b2b4-773f7fdcb12f',
 'c2811c15-b2ec-4455-9a6c-8edf0bd6be20',
 '8ce6bfa8-ffe5-481a-ab44-0c41140fe6ef',
 '10ee75c7-f0f2-4f65-a8e4-07563a088017',
 'c9aafd8b-6aaf-4d2b-b77e-20536a335039',
 '7556946d-84ee-

In [91]:
vector_store.save_local(folder_path="project_vectordb")

In [92]:
retriever = vector_store.as_retriever(search_kwargs={"k":3})

In [93]:
retriever.invoke("What is the name of the company?")

[Document(id='c028106c-aea6-4dca-8391-efc332feddc4', metadata={'producer': 'Adobe Acrobat Pro (64-bit) 24.2.20895', 'creator': 'Adobe Acrobat Pro (64-bit) 24.2.20895', 'creationdate': '2024-07-11T14:18:32+05:30', 'moddate': '2024-07-11T14:18:32+05:30', 'title': '', 'source': 'Report.pdf', 'total_pages': 212, 'page': 170, 'page_label': '171'}, page_content='.  The \nCompany is incorporated in Maharashtra, India and is listed on \nBombay Stock Exchange (BSE) and National Stock Exchange \n(NSE) in India'),
 Document(id='15add8e2-c3d9-493e-9c52-05fd1dac9252', metadata={'producer': 'Adobe Acrobat Pro (64-bit) 24.2.20895', 'creator': 'Adobe Acrobat Pro (64-bit) 24.2.20895', 'creationdate': '2024-07-11T14:18:32+05:30', 'moddate': '2024-07-11T14:18:32+05:30', 'title': '', 'source': 'Report.pdf', 'total_pages': 212, 'page': 195, 'page_label': '196'}, page_content='. (Stepdown Subsidiary)      \nDatamatics Foundation (Subsidiary)     \nDatamatics Information Solutions Limited (Subsidiary)     \n

In [94]:
from langchain_groq.chat_models import ChatGroq
grok_llm = ChatGroq(model="deepseek-r1-distill-llama-70b")

In [111]:
from langchain_core.prompts import ChatPromptTemplate
prompt_template = ChatPromptTemplate(
    [
        ("system","You are an assistant who is an expert in analysing the NSE listed company's anuual revenue report. Whenever the user asks any question, you have to answer him with the help of the {context} provided. If you do not have the context or answer, simply reply user that the question is out of your scope rather than giving irrelevant answers."),
        ("user","{query}")
    ])



In [96]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [97]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [112]:
rag_chain = (
    {"context": retriever | format_docs , "query": RunnablePassthrough()}
    | prompt_template
    | grok_llm
    | StrOutputParser()
)

In [None]:
response = rag_chain.invoke("what is  Section 135 of the Companies Act, 2013?")

In [152]:
format_docs(retriever.invoke("what is  Section 135 of the Companies Act, 2013?"))

'. The Companies Act, 2013 (“the Act”) and the rules made \nthereunder.\n2\n\n. The statement pursuant to section 102 of the Companies \nAct, 2013 as amended (Act) setting out material facts \nconcerning the business with respect to Item no\n\n.\n(iv) The Company has complied with the provisions of Section 186 of \nthe Act in respect of investments made'

In [150]:
response

'<think>\nOkay, so I\'m trying to figure out the CIN of DGSL. I remember that CIN stands for Corporate Identification Number, which is a unique identifier assigned to companies registered in India. It\'s usually a 21-digit number. \n\nLooking at the information provided, I see some numbers and terms. There\'s "5.28" and "Nil (P.Y. 50) units of 9." I\'m not sure what those represent, but they don\'t seem to be the CIN. Then there\'s a long number: 109931. That\'s only six digits, which is way shorter than the 21 digits a CIN should have. So that\'s probably not it either.\n\nI also see "UDIN: 24109931BKEIAJ3605." UDIN is a Unique Document Identification Number, used for identifying documents filed with the MCA in India. It\'s different from the CIN, so that\'s not what we\'re looking for.\n\nThe rest of the information includes the place, Mumbai, and the date, May 8, 2024. There\'s a mention of "Sr. Name CIN Subsidiary/ No provided." It seems like the CIN might not be listed here, or it