# Sample code that uses an LLM with RAG to read a document (PDF, Excel, Word, CSV, HTML) and answer questions about it

In [None]:
!pip install langchain langchain-openai faiss-cpu python-dotenv

# SETUP & ENVIRONMENT

In [35]:
#  Import necessary libraries
import os
from dotenv import load_dotenv  # For loading your OpenAI key securely
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA


# Load and Read PDF

In [36]:
# Path to the PDF document to index. In this example the Standard Chartered (SCB) 2023 annual report (PDF) was uploaded to Google Drive.
# NOTE(review): a /content/drive/... path presumably requires Google Drive to be mounted in Colab first; no mount call is visible in this notebook - confirm.
pdf_path = "/content/drive/MyDrive/GLEAN POC DATA/SCB 2023 Annual Report/standard-chartered-plc-full-year-2023-report.pdf"

# Load the PDF using PyPDFLoader. If you are using another file type, refer to the commented snippets in the next cell.
loader = PyPDFLoader(pdf_path)
documents = loader.load()
# PyPDFLoader is used here because the example input is a simple PDF. For PDFs with many tables, PDFPlumberLoader is recommended instead. For PDFs made of scanned images, you can use UnstructuredPDFLoader.


In [37]:
##If using DOC/DOCX (Word) use UnstructuredWordDocumentLoader or Docx2txtLoader to extract content
#
#from langchain.document_loaders import UnstructuredWordDocumentLoader
#loader = UnstructuredWordDocumentLoader("file.docx")
#documents = loader.load()
#
##If using XLS/XLSX (Excel) use UnstructuredExcelLoader to extract content
#
#from langchain.document_loaders import UnstructuredExcelLoader
#loader = UnstructuredExcelLoader("file.xlsx")
#documents = loader.load()
#
##If using HTML use UnstructuredHTMLLoader to extract content
#
#from langchain.document_loaders import UnstructuredHTMLLoader
#loader = UnstructuredHTMLLoader("file.html")
#documents = loader.load()
#
##If using CSV use CSVLoader to extract content
#from langchain.document_loaders import CSVLoader
#loader = CSVLoader("file.csv")
#documents = loader.load()
#
##If using JSON use JSONLoader to extract content
#from langchain.document_loaders import JSONLoader
#loader = JSONLoader(
#    file_path="file.json",
#    jq_schema=".records[].summary",
#    text_content=True
#)
#documents = loader.load()
#
##If using Website use the below code to extract content
#
#from langchain.document_loaders import WebBaseLoader
#loader = WebBaseLoader("https://example.com")
#documents = loader.load()

In [38]:
# Load the OpenAI API key.
# - In Google Colab: read it from the notebook's Secrets panel via `userdata`.
# - Anywhere else: fall back to a local .env file / the process environment via load_dotenv().
try:
    from google.colab import userdata  # only importable inside Google Colab
except ImportError:
    # Not running in Colab: load variables from a .env file (if any) into os.environ.
    load_dotenv()
else:
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

openai_api_key = os.environ.get("OPENAI_API_KEY")

# Fail here with a clear message rather than later inside the OpenAI client constructors.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to the Colab Secrets panel, "
        "or define it in a .env file / environment variable."
    )


# Chunk the Text

In [39]:
# Split large documents into small, overlapping chunks.
# Smaller pieces are easier for the retriever to match and for the LLM to process.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # Maximum number of characters per chunk
    chunk_overlap=100     # Overlap helps maintain context across chunks
)

chunks = text_splitter.split_documents(documents)
print(f"✅ Number of chunks created: {len(chunks)}")

# A document with no extractable text (e.g. a PDF of scanned images) produces zero chunks.
# Stop here with an actionable message instead of an IndexError on chunks[0] below.
if not chunks:
    raise ValueError(
        "No text chunks were produced from the document. If the PDF consists of scanned "
        "images, try a loader that can extract text from them (e.g. UnstructuredPDFLoader)."
    )

print(f"📄 First chunk preview:\n{chunks[0].page_content}")

# RecursiveCharacterTextSplitter is used in this example. However, there are alternative splitters you can choose from, depending on your requirements:
#| Splitter Class                          | Description                                           | Use Case                                           |
#| --------------------------------------- | ----------------------------------------------------- | -------------------------------------------------- |
#| `RecursiveCharacterTextSplitter`        | Splits at paragraphs → sentences → words → characters | Best for generic documents (PDFs, policies, books) |
#| `CharacterTextSplitter`                 | Splits at fixed character limits (naive)              | Simple logs, structured text                       |
#| `TokenTextSplitter`                     | Splits by token count (uses tokenizer like tiktoken)  | Precise control for GPT-3.5/4                      |
#| `SentenceTransformersTokenTextSplitter` | Aware of sentence boundaries + tokens                 | Best for multilingual and NLP-heavy documents      |
#| `MarkdownHeaderTextSplitter`            | Splits based on Markdown headers                      | Technical docs, blog posts, notebooks              |
#| `HTMLHeaderTextSplitter`                | Splits based on HTML tags                             | Websites, web-scraped data                         |
#| `Language` Splitters                    | Code-aware (Python, JS, etc.)                         | Splits by function/class — great for code docs     |

✅ Number of chunks created: 5004
📄 First chunk preview:
Annual Report 2023
[[Connecting 
the world’s 
most dynamic 
markets]]


# Generate Embeddings (Convert Text to Math)

In [40]:
# Create the OpenAI embeddings client. No chunks are passed in here; the client is handed
# to the vector store in the next step, which uses it to turn text chunks into numerical vectors.
# These embeddings help match relevant chunks to your question.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)


# Create a Vector Store

In [41]:
# Build a FAISS index (in-memory vector database) from the chunks, using `embeddings` to vectorize them
vectorstore = FAISS.from_documents(chunks, embeddings)
# FAISS, an open-source vector store, is used here. Alternatives you can try include Chroma, Weaviate, Qdrant and Milvus.

# Setup the LLM

In [42]:
# Create the OpenAI chat model (GPT-4 Turbo) with a low temperature (0.3) to keep answers less random.
# Swap in the LLM of your choice here.
llm = ChatOpenAI(
    model="gpt-4-turbo",
    temperature=0.3,
    api_key=openai_api_key
)

# Setup RetrievalQA Chain

In [43]:
# Expose the vector store as a retriever (the "search engine").
# search_kwargs={"k": 3} asks for the 3 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Create a Retrieval-Augmented Generation (RAG) chain that connects the retriever and the LLM.
# return_source_documents=True keeps the retrieved chunks in the result, where they are
# read later as result["source_documents"].
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

# Ask Questions!

In [44]:
# 🧪 Ask questions to your agent
def ask_rag_agent(query: str, preview_chars: int = 200) -> None:
    """Run a question through the RAG chain and print the answer with its sources.

    Uses the module-level ``rag_chain`` built in the previous step.

    Args:
        query: The natural-language question to ask.
        preview_chars: Maximum number of characters of each retrieved chunk to
            print. Defaults to 200.

    Returns:
        None. The query, the answer and the retrieved chunk previews are printed.
    """
    print(f"\n🧑‍🎓 User Query: {query}")
    result = rag_chain.invoke({"query": query})

    print("\n✅ Answer:")
    print(result["result"])

    print("\n📚 Retrieved Contexts:")
    for doc in result["source_documents"]:
        text = doc.page_content.strip()
        if len(text) > preview_chars:
            # Only show the ellipsis when the chunk was actually cut short.
            print("-->", text[:preview_chars], "...")
        else:
            print("-->", text)


In [45]:
# Test: ask a sample question. ask_rag_agent prints the query, the answer and a preview of each retrieved chunk.
ask_rag_agent("What was the profit for SCB in 2023?")



🧑‍🎓 User Query: What was the profit for SCB in 2023?

✅ Answer:
The profit before taxation for Standard Chartered Bank (SCB) in 2023 was $5,093 million.

📚 Retrieved Contexts:
--> revenue growth in 2023.
Additionally, we made significant progress in our advisory 
business with the launch of SC Wealth Select in 14 markets. 
SC Wealth Select aims to bring a portfolio approach to  ...
--> SCB 2021 baseline
7. 3%
11.8%
2021
30%
20%
25%
15%
5%
0%
-5%
-10%
-20%
-25%
Alignment delta (against IMO trajectory %)
22 23 24 25 26 27 28 29 2030 Year
-11.8%
Progress in the year
In 2022, the Group  ...
--> 490
Standard Chartered – Annual Report 2023
Supplementary information Supplementary financial information
Five-year summary
2023 
$million
2022 
$million
2021 
$million
2020 
$million
2019
$million
Op ...


# Happy Learning