## Data preparation

In [None]:
# Installing libraries
!pip install PyMuPDF langchain

In [None]:
# Importing libraries
import os
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# 1. Create a folder for sample financial PDFs
os.makedirs("financial_reports", exist_ok=True)

In [None]:
# function for reading pdf
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` and each page's ``get_text()``
    result is concatenated in page order, with no separator added.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [None]:
# Extract text from every PDF in the reports folder.
# sorted() makes the processing order (and so the order of `documents`)
# reproducible, because os.listdir returns entries in arbitrary order; the
# lower-cased suffix check also picks up files named e.g. "REPORT.PDF".
documents = []
pdf_folder = "financial_reports"
for pdf_file in sorted(os.listdir(pdf_folder)):
    if pdf_file.lower().endswith(".pdf"):
        path = os.path.join(pdf_folder, pdf_file)  # full path to the PDF
        text = extract_text_from_pdf(path)
        documents.append({"file_name": pdf_file, "text": text})

print(f"Extracted text from {len(documents)} PDFs.")


In [None]:
# Character count of the first extracted document.
# (len(documents[0]) alone only counts the keys of the dict, i.e. always 2.)
len(documents[0]["text"])

In [None]:
# Chunk the text into semantic chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)


In [None]:
# splitting the text

# One record per chunk; chunk_id restarts at 0 for every source file.
all_chunks = [
    {"source_file": doc["file_name"], "chunk_id": chunk_no, "text": piece}
    for doc in documents
    for chunk_no, piece in enumerate(text_splitter.split_text(doc["text"]))
]

print(f"Total semantic chunks created: {len(all_chunks)}")


In [None]:
all_chunks[0]

## embedding and vector db

In [None]:
# importing libraries
from chromadb.config import Settings
import chromadb
from sentence_transformers import SentenceTransformer

In [None]:
# 1. Initialize ChromaDB
# Step 1: Create a persistent client
client = chromadb.PersistentClient(path="chroma_db")

# Create or get a collection (like a table in SQL)
collection = client.get_or_create_collection(name="financial_reports")


In [None]:
# 2. Initialize embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
# 3. Embed and store chunks with unique IDs.
# Chunks are encoded and written in batches, and upsert() is used instead of
# add() so that re-running this cell against the persistent "chroma_db"
# directory overwrites existing entries rather than re-inserting the same IDs.
BATCH_SIZE = 256

for start in range(0, len(all_chunks), BATCH_SIZE):
    batch = all_chunks[start:start + BATCH_SIZE]
    texts = [chunk["text"] for chunk in batch]

    collection.upsert(
        # Unique ID combining file name + chunk number
        ids=[f"{chunk['source_file']}_{chunk['chunk_id']}" for chunk in batch],
        documents=texts,
        metadatas=[
            {"source_file": chunk["source_file"], "chunk_id": chunk["chunk_id"]}
            for chunk in batch
        ],
        # encode() on a list returns one embedding row per text
        embeddings=embed_model.encode(texts).tolist(),
    )


## local llm mistral via ollama

In [None]:
#pip install requests


In [None]:
import requests
import json

# API endpoint
url = "http://localhost:11434/api/generate"

query = "what day is today"
# Payload with prompt
payload = {
    "model": "mistral",
    "prompt": query
}

# Send the request. The context manager releases the streamed connection when
# the cell finishes, the (connect, read) timeout stops the cell from hanging
# forever if the server is unreachable, and raise_for_status() turns an HTTP
# error response into an exception instead of silently printing nothing.
with requests.post(url, json=payload, stream=True, timeout=(5, 300)) as response:
    response.raise_for_status()

    # Read the streamed response: one JSON object per non-empty line
    for line in response.iter_lines():
        if line:
            data = json.loads(line.decode("utf-8"))
            if "response" in data:
                print(data["response"], end="")


## adding embedding model with vector db and user query for semantic search

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb

# 1. Load embedding model (local, free, open-source)
#embed_model

# 2. Connect to Chroma (vector DB)
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_collection("financial_reports")

# 3. Encode user query
query = "What was Google's total revenue in 2023?"
query_embedding = embed_model.encode(query).tolist()  # convert to list for chroma

# 4. Search in vector DB
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3  # top-k chunks
)
results

## Retriever

In [None]:
from langchain.vectorstores import Chroma


class SentenceTransformerAdapter:
    """Expose a SentenceTransformer through the ``embed_documents`` /
    ``embed_query`` methods that LangChain vector stores call on their
    embedding function (a raw SentenceTransformer only offers ``encode``)."""

    def __init__(self, model):
        self._model = model

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self._model.encode(text).tolist()


# Use the same embedding model that was used while inserting docs
embedding_model = SentenceTransformerAdapter(embed_model)

# Load the existing Chroma collection
retriever = Chroma(
    persist_directory="chroma_db",
    collection_name="financial_reports",   # ✅ your collection name
    embedding_function=embedding_model
).as_retriever(search_kwargs={"k": 3})


In [None]:
# pip install langchain_community

## Now RAG pipeline

In [None]:
from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA


In [None]:
# LLM definition
llm = ChatOllama(model="mistral")


In [None]:
# defining the prompt

prompt = ChatPromptTemplate.from_template("""
You are an expert financial assistant. 
Use the following context from Google's annual reports to answer the question.
If the answer is not in the context, say you don't know.

Context:
{context}

Question:
{question}

""")


In [None]:
# defining the LangChain chain for retrieval-augmented answering
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)


In [None]:
# testing the chain
query = "total 2024 revenue in billions?"
result = qa_chain.run(query)


In [None]:
print(result)

In [None]:
# add the source and other details to the answer with python function

# Function to query with provenance
def rag_query_with_sources(question):
    """Answer ``question`` with the LLM and report which chunks were retrieved.

    Uses the notebook-level ``retriever`` to fetch relevant chunks and the
    notebook-level ``llm`` to generate the answer.

    Args:
        question: The user's question as a string.

    Returns:
        A dict with "answer" (the ``content`` of the LLM response) and
        "sources", a list of {"source", "chunk_no", "snippet"} dicts, one per
        retrieved chunk (snippet = first 200 characters of the chunk).
    """
    # Step 1: Retrieve relevant documents (chunks)
    docs = retriever.get_relevant_documents(question)

    # Step 2: Build context with source info.
    # The stored metadata is a chunk index, not a page number, so it is
    # labelled "Chunk"; 300 characters per chunk matches build_llm_prompt.
    context_with_sources = ""
    for doc in docs:
        source_name = doc.metadata.get("source_file", "unknown")
        chunk_id = doc.metadata.get("chunk_id", "unknown")
        snippet = doc.page_content[:300]  # first 300 chars
        context_with_sources += f"[Source: {source_name}, Chunk: {chunk_id}] {snippet}\n\n"

    # Step 3: final prompt (built from plain literals so the text sent to the
    # model carries no indentation from this function body)
    final_prompt = (
        "\nYou are an expert financial assistant. \n"
        "Use the following context to answer the question.\n\n"
        f"Context:\n{context_with_sources}\n\n"
        f"Question:\n{question}\n\n"
        "Answer:\n"
    )

    # Step 4: Query the LLM
    response = llm.invoke(final_prompt)

    # Step 5: Return both answer and sources (structured)
    return {
        "answer": response.content,
        "sources": [{"source": doc.metadata.get("source_file"),
                     "chunk_no": doc.metadata.get("chunk_id"),
                     "snippet": doc.page_content[:200]} for doc in docs]
    }


In [None]:
## testing this function
result = rag_query_with_sources("What was Google's revenue in 2023 compared to 2022?")



In [None]:
# print(result["answer"])
print(result["sources"])

## Adding another dataset in db

In [None]:


import pandas as pd

# Load CSV
csv_file = "books./GOOGL-2013_2023.csv"
df = pd.read_csv(csv_file)

# Inspect first few rows
df.head()


In [None]:
# convert CSV rows into chunks

# One text chunk per CSV row; chunk_id is the row's index label in `df`.
all_stock_chunks = [
    {
        "text": (
            f"Date: {row['Date']}, Open: {row['Open']}, High: {row['High']}, "
            f"Low: {row['Low']}, Close: {row['Close']}, Volume: {row['Volume']}"
        ),
        "source_file": "google_stock_prices.csv",
        "chunk_id": idx,
    }
    for idx, row in df.iterrows()
]


In [None]:
# creating collection
# get_or_create_collection (as already used for "financial_reports") lets this
# cell be re-run against the persistent store, where the collection may
# already exist from an earlier run.
stock_collection = client.get_or_create_collection(name="stock_prices")

# adding chunks in batches; upsert() overwrites rows whose ID is already stored
STOCK_BATCH_SIZE = 256

for start in range(0, len(all_stock_chunks), STOCK_BATCH_SIZE):
    batch = all_stock_chunks[start:start + STOCK_BATCH_SIZE]
    texts = [chunk["text"] for chunk in batch]

    stock_collection.upsert(
        ids=[f"{chunk['source_file']}_{chunk['chunk_id']}" for chunk in batch],
        documents=texts,
        metadatas=[
            {"source_file": chunk["source_file"], "chunk_id": chunk["chunk_id"]}
            for chunk in batch
        ],
        # encode() on a list returns one embedding row per text
        embeddings=embed_model.encode(texts).tolist(),
    )


In [None]:
# List all collections in the Chroma client
collections = client.list_collections()
print("Available collections:")
for col in collections:
    print("-", col.name)


In [None]:
# create retriever for this collection

from langchain.vectorstores import Chroma


class SentenceTransformerAdapter:
    """Expose a SentenceTransformer through the ``embed_documents`` /
    ``embed_query`` methods that LangChain vector stores call on their
    embedding function (a raw SentenceTransformer only offers ``encode``)."""

    def __init__(self, model):
        self._model = model

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text."""
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self._model.encode(text).tolist()


# Use the same embedding model that was used while inserting docs
embedding_model = SentenceTransformerAdapter(embed_model)

# Load the existing Chroma collection
retriever_stock = Chroma(
    persist_directory="chroma_db",
    collection_name="stock_prices",   # ✅ collection name
    embedding_function=embedding_model
).as_retriever(search_kwargs={"k": 3})


## Planner

In [None]:
def simple_planner(query: str) -> str:
    """Choose which collection a query should be routed to, by keyword.

    Returns "annual_reports" if the query mentions a financial-statement
    keyword, otherwise "stock_prices" if it mentions a stock-price keyword,
    otherwise "none". Financial keywords win when both kinds appear.

    Keywords are matched case-insensitively at the start of a word, so
    "revenues" and "highest" still match, while "low" no longer fires inside
    unrelated words such as "cash flow" or "follow".
    """
    import re  # local import keeps this cell self-contained

    # Keywords for annual reports (financial info)
    financial_keywords = ["revenue", "profit", "expenses", "growth", "financial", "earnings", "income"]

    # Keywords for stock prices (CSV dataset)
    stock_keywords = ["stock", "share price", "close price", "open price", "high", "low", "volume"]

    query_lower = query.lower()

    def mentions(keywords):
        # \b anchors each keyword to the beginning of a word in the query
        return any(re.search(r"\b" + re.escape(kw), query_lower) for kw in keywords)

    if mentions(financial_keywords):
        return "annual_reports"
    if mentions(stock_keywords):
        return "stock_prices"

    # Fallback if unsure
    return "none"

In [None]:
def get_retriever_docs(user_query, retriever_reports, retriever_stock):
    """Retrieve documents for ``user_query`` from the collection(s) picked by ``simple_planner``.

    A plan of "annual_reports" queries only ``retriever_reports``; a plan of
    "stock_prices" queries only ``retriever_stock``; any other plan queries
    both and returns the report results followed by the stock results.
    """
    plan = simple_planner(user_query)

    if plan == "annual_reports":
        return retriever_reports.get_relevant_documents(user_query)
    if plan == "stock_prices":
        return retriever_stock.get_relevant_documents(user_query)

    # No clear match: query both collections and concatenate the results.
    report_hits = retriever_reports.get_relevant_documents(user_query)
    stock_hits = retriever_stock.get_relevant_documents(user_query)
    return report_hits + stock_hits


In [None]:
def build_llm_prompt(docs, user_query):
    """Assemble the prompt sent to the LLM: instructions, retrieved context, question.

    Each document contributes one context entry tagged with its ``source_file``
    and ``chunk_id`` metadata ("unknown" when missing), followed by the first
    300 characters of its ``page_content``.
    """
    context_entries = [
        "[Source: {}, Chunk: {}] {}\n\n".format(
            doc.metadata.get("source_file", "unknown"),
            doc.metadata.get("chunk_id", "unknown"),
            doc.page_content[:300],
        )
        for doc in docs
    ]
    context_with_sources = "".join(context_entries)

    # The prompt text is spelled out as literals so its exact whitespace
    # (including the trailing space on the first sentence) is explicit.
    return (
        "\nYou are an expert financial assistant. \n"
        "Use the following context to answer the question.\n\n"
        f"Context:\n{context_with_sources}\n\n"
        f"Question:\n{user_query}\n\n"
        "Answer:\n"
    )

In [None]:
user_query = "Revenue in billion in 2023"
docs = get_retriever_docs(user_query, retriever, retriever_stock)

prompt = build_llm_prompt(docs, user_query)

In [None]:
# using llm mistral for this. with planner retriver and context
response = llm.invoke(prompt)
answer = response.content
print(answer)


## logging to file

In [None]:
import datetime

def log_llm_run(user_query, answer, docs, plan, log_file="llm_logs.txt"):
    """Append a record of one LLM run to ``log_file`` and return the record.

    Args:
        user_query: The question that was asked.
        answer: The answer text that was produced.
        docs: Retrieved documents; each one's ``metadata`` is read for
            "source_file" and "chunk_id" ("unknown" when missing).
        plan: The collection choice that was used for retrieval.
        log_file: Path of the log file to append to.

    Returns:
        The log entry as a string (timestamp header, query, plan, answer,
        one line per source, then blank lines as a separator).
    """
    lines = [
        f"--- LLM Run: {datetime.datetime.now()} ---",
        f"User Query: {user_query}",
        f"Collection Used: {plan}",
        f"Answer: {answer}",
        "Sources Used:",
    ]

    # One line per document's source info
    for doc in docs:
        source_file = doc.metadata.get("source_file", "unknown")
        chunk_id = doc.metadata.get("chunk_id", "unknown")
        lines.append(f"  - Source File: {source_file}, Chunk ID: {chunk_id}")

    # Every line is newline-terminated, followed by two blank lines
    log_entry = "\n".join(lines) + "\n\n\n"

    # UTF-8 is set explicitly: answers may contain characters (currency
    # symbols, dashes, ...) that a platform's default encoding cannot write.
    with open(log_file, "a", encoding="utf-8") as file:
        file.write(log_entry)

    # Return the log string so it can be printed or used immediately
    return log_entry


## Complete run: planner, retriever, LLM synthesizer, and log file

In [None]:
def run(user_query):
    """Run the whole pipeline for one query: plan, retrieve, generate, gate, log.

    The answer comes from the notebook-level ``llm``. When the retrieved
    documents are not high confidence (threshold 0.6) the answer is shown for
    human review, and a rejected answer is replaced by an escalation notice.

    NOTE: ``is_high_confidence`` and ``human_review`` must already be defined
    in the kernel when this function is called.

    Returns:
        Tuple ``(answer, log_entry)``.
    """
    plan = simple_planner(user_query)
    retrieved = get_retriever_docs(user_query, retriever, retriever_stock)
    answer = llm.invoke(build_llm_prompt(retrieved, user_query)).content

    # human_review is only consulted when confidence is low (short-circuit)
    if not is_high_confidence(retrieved, threshold=0.6) and not human_review(answer):
        answer = "[ESCALATED] Awaiting human input."

    return answer, log_llm_run(user_query, answer, retrieved, plan)

In [None]:
# NOTE: run() calls is_high_confidence() and human_review(), which are defined
# in cells further down this notebook; on a fresh kernel those cells must be
# executed before this one, otherwise run() raises NameError.
a = input("Enter your query")
answer,log = run(a)

In [None]:
print(f"Response:{answer}\n\n")
print(f"log file:{log}")

In [None]:
## Building confidence threshold function

def is_high_confidence(docs, threshold=0.6):
    """Return True when the best ``similarity`` score among ``docs`` reaches ``threshold``.

    Each score is read from ``doc.metadata["similarity"]``; a document without
    that key counts as 0. Empty (or falsy) ``docs`` is never high confidence.

    NOTE(review): the chunks stored earlier in this notebook carry only
    ``source_file`` and ``chunk_id`` metadata, so unless something adds a
    ``similarity`` entry every score read here is 0 - confirm where it is set.
    """
    if not docs:
        return False

    scores = [doc.metadata.get("similarity", 0) for doc in docs]
    return max(scores) >= threshold

In [None]:

## building for human review 

def human_review(answer):
    """Show ``answer`` to the operator and ask for approval on stdin.

    Returns True only when the reply, stripped and lower-cased, is exactly
    "y"; any other reply returns False. The outcome is also printed.
    """
    print("\n--- HUMAN REVIEW REQUIRED ---")
    print("Suggested answer:\n", answer)

    reply = input("\nDo you approve this answer? (y/n): ")
    approved = reply.strip().lower() == "y"

    print("Answer approved." if approved else "Answer escalated!")
    return approved


In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# The token is read from the environment: no visible cell defines HF_API_KEY,
# and credentials should not be hard-coded in the notebook.
HF_API_KEY = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# NOTE: this rebinds the notebook-level `llm` that run() and
# rag_query_with_sources() use.
llm = HuggingFaceEndpoint(
    repo_id="google/flan-t5-large",
    huggingfacehub_api_token=HF_API_KEY
)

prompt = "Summarize the following text: Google made $XYZ revenue in 2023."

# invoke() is the supported entry point; calling the object directly is deprecated
answer = llm.invoke(prompt)
print(answer)


In [None]:
pip install -U langchain langchain-huggingface huggingface_hub transformers sentence-transformers


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

prompt = "Summarize: Google revenue in 2023."

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=200)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(answer)


In [None]:

from langchain_huggingface import HuggingFaceEndpoint
# The token is read from the environment: no visible cell defines HF_API_KEY,
# and credentials should not be hard-coded in the notebook.
HF_API_KEY = os.environ["HUGGINGFACEHUB_API_TOKEN"]

llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/google/flan-t5-large",
    huggingfacehub_api_token=HF_API_KEY)


prompt = "Summarize: Google revenue in 2023."

# A single call: the second, identical call that used to follow went through
# the same HuggingFaceEndpoint object and only repeated the request.
answer = llm.invoke(prompt)
print(answer)


In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Hugging Face API key, read from the environment (no visible cell defines
# api_key, and the key should not be hard-coded in the notebook)
api_key = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Create the Hugging Face endpoint with the API key passed directly
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-Nemo-Instruct-2407", huggingfacehub_api_token=api_key)

# Generate text (invoke() replaces the deprecated predict())
output = llm.invoke("Write a short poem on India")
print(output)


In [None]:
from langchain.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-Nemo-Instruct-2407",
    huggingfacehub_api_token=api_key,
    task="text-generation"
)

print(llm("Write a short poem on India"))


In [None]:
pip install huggingface_hub==0.13.4


In [None]:
from langchain_community.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407",
    huggingfacehub_api_token=api_key,
    task="text-generation",
)

print(llm("Write a short poem on India"))


In [None]:
!pip install huggingface_hub==0.15.1



In [None]:
from langchain_community.llms import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Instruct-2407",
    huggingfacehub_api_token=api_key,
    task="text-generation",
)

print(llm("Write a short poem on India"))


In [None]:
pip uninstall huggingface_hub langchain langchain-huggingface -y


In [None]:
!pip install huggingface_hub==0.15.1
!pip install langchain==0.0.174
!pip install langchain_huggingface==0.3.1


In [None]:
pip install langchain-huggingface
