In [1]:
!pip install sentence-transformers pypdf langchain-community faiss-cpu langgraph==0.3.2 mistralai pandas langchain-huggingface



In [1]:
import os
import json
from typing import TypedDict, List
import pandas as pd
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from mistralai import Mistral
from langgraph.graph import StateGraph, END
from langchain_huggingface import HuggingFaceEmbeddings

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

In [2]:
os.environ["MISTRAL_API_KEY"] = "T5DdItPzE5WqcQrrOL7RLlpvJHQWoGHY"
api_key = os.environ["MISTRAL_API_KEY"]

In [3]:
# Load PDFs from a folder
loader = PyPDFDirectoryLoader("../../data/RealtedFiles/")
documents = loader.load()

# Split into smaller chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=50
)
docs = splitter.split_documents(documents)
len(docs)

187

In [10]:
for i, doc in enumerate(docs):
    print(doc)
    # file_name = docs[i].metadata.source.split("/")[-1].replace(".pdf", "")
    # print(f"Document {i+1} from file: {}")
    doc.metadata.update({"doc_id": f"RBI_Guideline_{i+1}"})

page_content='RBI/DBR/2015-16/18 
Master Direction DBR.AML.BC.No.81/14.01.001/2015-16        
 February 25, 2016 
(Updated as on August 14, 2025) 
(Updated as on June 12, 2025) 
(Updated as on November 06, 2024) 
(Updated as on January 04, 2024) 
(Updated as on October 17, 2023) 
(Updated as on May 04, 2023) 
(Updated as on April 28, 2023) 
 (Updated as on May 10, 2021) 
(Updated as on April 01, 2021) 
(Updated as on March 23, 2021) 
(Updated as on December 18, 2020) 
(Updated as on April 20, 2020) 
(Updated as on April 01, 2020) 
(Updated as on January 09, 2020) 
(Updated as on August 09, 2019) 
(Updated as on May 29, 2019) 
 
Master Direction - Know Your Customer (KYC) Direction, 2016 
Contents 
INTRODUCTION ..................................................................................................................... 3 
CHAPTER  I ............................................................................................................................. 4 
PRELIMINARY .......

In [11]:
docs[110].metadata

{'source': '..\\..\\data\\RealtedFiles\\RBI_Guideline.pdf',
 'page': 60,
 'doc_id': 'RBI_Guideline_111'}

In [12]:
# === MISTRAL CLIENT ===
model_name = "mistral-small-2506"

client = Mistral(api_key=api_key)

In [14]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [15]:
from re import search
vectorstore = FAISS.from_documents(docs, embeddings)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001A90203FC10>, search_kwargs={'k': 3})

In [16]:
from typing import TypedDict, List

def finance_rag_tool(query: str) -> str:
    """Simple RAG chain: retrieve docs & query model with context"""
    print("RAG tool called")

    # Retrieve relevant docs
    retrieved_docs = retriever.invoke(query)
    if not retrieved_docs:
        print("No docs found!")
        return "I'm sorry, I couldn't find any information related to that in DB."

    context = "\n\n".join(
        [doc.page_content for doc in retrieved_docs]
        )

    # Compose prompt with context + query
    prompt = f"""
    You are an expert RBI Banking Regulatory Assistant with deep knowledge of Indian banking regulations.

    Your task is to answer user questions strictly using the information retrieved from:
    - RBI circulars and master directions
    - RBI policy PDFs
    - Internal banking policy or operational manuals
    - KYC / AML / CFT guidelines
    - Annual reports of Indian banks or financial institutions

    You MUST follow these rules:
    1. Use ONLY the provided context to generate the answer.
    2. If the question is NOT related to Indian banking regulations, RBI guidelines, policy documents, KYC norms, product manuals, or annual reports, clearly respond:
    "I cannot answer this question as it is outside the scope of RBI and Indian banking regulatory documents."
    3. Do NOT use general knowledge or assumptions.
    4. Provide clear, factual, and compliance-oriented answers.
    5. Where applicable, mention relevant RBI circulars, master directions, or sections.
    6. Cite the source documents using the provided metadata (document name, page number, or section).
    7. If the context does not contain sufficient information, state that the information is not available in the retrieved documents.

    --------------------
    Retrieved Context:
    {context}
    --------------------

    User Question:
    {query}

    Answer (with citations):
    """

    response = client.chat.complete(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    rag_tool_response = response.choices[0].message.content
    return rag_tool_response

In [19]:
def query_finance_rag_tool(user_query: str, with_system_prompt=True) -> str:
    # return retriever.invoke(query)
    

    # --- Initialize state ---

    messages = []
    if with_system_prompt:
      messages.append({
          "role": "system",
          "content": "You are a helpful assistant \
          expert in agriculture."
      })
      messages.append(
          {
              "role": "user",
              "content": user_query
          }
      )

    state = {"messages":messages}
    result = retriever.invoke(user_query)

    # Normalize all messages to dicts
    def normalize(msg):
        if isinstance(msg, dict):
            return msg
        return {
            "role": getattr(msg, "role", None),
            "content": getattr(msg, "content", None),
            "name": getattr(msg, "name", None),
            "tool_call_id": getattr(msg, "tool_call_id", None)
        }

    normalized_messages = [normalize(m) for m in result["messages"]]

    # Look for tool output first
    tool_messages = [
        msg for msg in normalized_messages if msg["role"] == "tool"
    ]

    if tool_messages:
        return tool_messages[-1]["content"]

    # Fall back to assistant answer
    assistant_messages = [
        msg for msg in normalized_messages if msg["role"] == "assistant"
    ]

    if assistant_messages:
        return assistant_messages[-1]["content"]

    return "No response was generated"

In [20]:
user_query = "What are RBI?"

print(query_finance_rag_tool(user_query))

TypeError: list indices must be integers or slices, not str