In [30]:

# Imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Suppress warnings
# NOTE: the 'ignore' action is installed without a category or module
# restriction, so it applies to every warning raised after this line.
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific imports
# display/Markdown are used further down to render the model's answer.
from IPython.display import display, Markdown

# Set environment variable for protobuf
# NOTE(review): this assignment runs after the langchain imports at the top of
# this cell; presumably it has to be in place before protobuf is first
# imported -- confirm none of those imports load protobuf, or move it above them.
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [31]:
import pdfplumber

# Path of the PDF to index; leave empty to get the "upload" hint instead.
local_path = "Finance.pdf"
if local_path:
    with pdfplumber.open(local_path) as pdf:
        # A page without a text layer (e.g. a scanned image) may yield no text;
        # fall back to "" so such a page cannot break the concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join once, with a newline between pages, so the last line of one page is
    # not fused with the first line of the next.
    data = "\n".join(page_texts)
    print(f"PDF loaded successfully: {local_path}")
    # Show only a preview; the full extracted text stays available in `data`.
    print(data[:1000])
else:
    print("Upload a PDF file")


PDF loaded successfully: Finance.pdf
INFOSYS LIMITED AND SUBSIDIARIES
Condensed Consolidated Financial Statements under
Indian Accounting Standards (Ind AS)
for the three months and year ended March 31, 2024
Index Page No.
Condensed Consolidated Balance Sheet ………………………………………………………………………………………………1 ……………………..
Condensed Consolidated Statement of Profit and Loss ………………………………………………………………………………2……………………………………..
Condensed Consolidated Statement of Changes in Equity ……………………………………………………………………………3………………………………………..
Condensed Consolidated Statement of Cash Flows …………………………………………………………………………………5…………………………………..
Overview and Notes to the Interim Condensed Consolidated Financial Statements
1. Overview
1.1 Company overview ……………………………………………………………………………………………………………7………..
1.2 Basis of preparation of financial statements …………………………………………………………………………………7…………………………………..
1.3 Basis of consolidation …………………………………………………………………………………………………………7…………..
1.4 Use of estimates and judgments …………………………………………………………………………

In [32]:
from langchain.docstore.document import Document

# The splitter works on Document objects, so wrap the whole extracted text
# in a single-element list first.
documents = [Document(page_content=data)]

# Chunks of up to 1000 characters, overlapping by 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and show the first one as a sample.
print(f"Text split into {len(chunks)} chunks")
print(f"Sample chunk: {chunks[0].page_content}")

Text split into 191 chunks
Sample chunk: INFOSYS LIMITED AND SUBSIDIARIES
Condensed Consolidated Financial Statements under
Indian Accounting Standards (Ind AS)
for the three months and year ended March 31, 2024
Index Page No.
Condensed Consolidated Balance Sheet ………………………………………………………………………………………………1 ……………………..
Condensed Consolidated Statement of Profit and Loss ………………………………………………………………………………2……………………………………..
Condensed Consolidated Statement of Changes in Equity ……………………………………………………………………………3………………………………………..
Condensed Consolidated Statement of Cash Flows …………………………………………………………………………………5…………………………………..
Overview and Notes to the Interim Condensed Consolidated Financial Statements
1. Overview
1.1 Company overview ……………………………………………………………………………………………………………7………..
1.2 Basis of preparation of financial statements …………………………………………………………………………………7…………………………………..
1.3 Basis of consolidation …………………………………………………………………………………………………………7…………..
1.4 Use of estimates and judgments ………………………………………………………………

In [42]:
# HuggingFaceEmbeddings is imported from langchain_community (the package this
# notebook already uses for Chroma) rather than the deprecated
# `langchain.embeddings` path. Chroma itself is already imported in the first
# cell, so it is not re-imported here.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Specify the model name as a string
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Create Hugging Face embeddings
embedding = HuggingFaceEmbeddings(model_name=model_name)

# Give every chunk a stable id derived from its position. Without explicit ids
# each run of this cell adds a fresh copy of every chunk to the "local-rag"
# collection, so re-running it in the same kernel would fill the collection
# with duplicates and skew retrieval.
chunk_ids = [f"chunk-{i}" for i in range(len(chunks))]

# Create the vector database using Chroma
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    ids=chunk_ids,
    collection_name="local-rag"
)

print("Vector database created successfully")


Vector database created successfully


In [54]:
# Name of the Ollama model that backs the chat LLM.
local_model = "llama3.2"

# Chat model handle used by the retriever and the RAG chain in later cells.
llm = ChatOllama(
    model=local_model,
)


In [55]:
# Prompt that asks the LLM to rephrase the user's question into alternative
# queries, one per line.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Multi-query retriever built on the Chroma store, using QUERY_PROMPT and the
# chat model to generate the query variants.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [56]:
# Answering prompt: the retrieved context goes in {context}, the user's
# question in {question}.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

# Chat prompt built from the template above.
prompt = ChatPromptTemplate.from_template(template)

In [57]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, as
            produced by the retriever.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Create chain
# `prompt` comes from the previous cell and is not rebuilt here.
# The retriever output is piped through _format_docs so that {context} receives
# plain text instead of the repr of a list of Document objects.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [58]:
def chat_with_pdf(question):
    """
    Run `question` through the RAG chain and render the answer as Markdown.

    Returns whatever `display` returns for the rendered answer.
    """
    answer = chain.invoke(question)
    rendered = Markdown(answer)
    return display(rendered)

In [59]:
# Smoke test: ask a broad question and render the chain's answer below.
chat_with_pdf("What is the main idea of this document?")

The main idea of this document appears to be a set of notes and explanations that accompany an audited condensed consolidated interim financial statement for Infosys, likely highlighting changes in accounting policies or specific rules related to provisions, contingencies, derivatives, and other financial matters. The document is intended to provide additional context and clarity on the financial statements, particularly with regards to amendments to Indian Accounting Standards (Ind AS) and other regulatory requirements.