<a href="https://colab.research.google.com/github/dineshsereno-git/desktop-tutorial/blob/main/RAG_PDF_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
from datetime import datetime

import yaml

# Load the notebook configuration from a JSON or YAML file.
def load_config(config_path="config.json"):
    """Load settings from the first configuration file that exists.

    The explicitly requested ``config_path`` is tried first, followed by
    ``config.json`` and then ``config.yaml`` in the current working directory.
    A file whose name ends in ``.json`` is parsed as JSON; any other file is
    parsed as YAML.

    Args:
        config_path: Preferred location of the configuration file.

    Returns:
        The parsed configuration. ``{"vector_db": "chromadb"}`` is returned
        when none of the candidate files exists, and ``{}`` when the file
        parses to nothing (for example an empty YAML document).
    """
    # Build the search order without duplicates, keeping the caller's path first.
    candidates = []
    for path in (config_path, "config.json", "config.yaml"):
        if path and path not in candidates:
            candidates.append(path)

    found = next((path for path in candidates if os.path.isfile(path)), None)
    if found is None:
        print("Config file not found. Using defaults.")
        return {"vector_db": "chromadb"}  # Default settings

    with open(found, "r", encoding="utf-8") as file:
        if found.lower().endswith(".json"):
            loaded = json.load(file)
        else:
            loaded = yaml.safe_load(file)

    # Callers use ``.get`` on the result, so never hand back None.
    return {} if loaded is None else loaded

# Load config (config.json first, then config.yaml; built-in defaults otherwise)
config = load_config()

# Provider names that need an API key.
# NOTE(review): the checks below compare the configured values against these
# provider names, so a model id such as "gpt-3.5-turbo" never matches —
# confirm whether model ids should be mapped to their provider first.
KEY_REQUIRED = {"openai", "pinecone", "weaviate"}

# Assign config values with defaults
vector_db = config.get("vector_db", "chromadb")
vector_db_key = config.get("vector_db_key", None)
embedding_model = config.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2")
embedding_model_key = config.get("embedding_model_key", None)
llm_model = config.get("llm_model", "gpt-3.5-turbo")
llm_model_key = config.get("llm_model_key", None)

data_source = config.get("data_source", "pdf")
pdf_folder = config.get("pdf_folder", "/content/drive/MyDrive/Colab Notebooks/data/pdfs")
text_folder = config.get("text_folder", "data/texts")
url_folder = config.get("url_folder", "data/urls")
history_output_folder = config.get("history_output_folder", "output/history")

# Warn about every component that is flagged as needing a key but has none.
# The embedding model is included because the summary below labels it with
# "(Key Required)" using the same KEY_REQUIRED test.
if vector_db in KEY_REQUIRED and not vector_db_key:
    print(f"⚠️ Warning: {vector_db} requires an API key, but none provided!")

if embedding_model in KEY_REQUIRED and not embedding_model_key:
    print(f"⚠️ Warning: {embedding_model} requires an API key, but none provided!")

if llm_model in KEY_REQUIRED and not llm_model_key:
    print(f"⚠️ Warning: {llm_model} requires an API key, but none provided!")

# Print loaded settings
print("\n--- Loaded Configuration ---")
print(f"Vector Database: {vector_db} {'(Key Required)' if vector_db in KEY_REQUIRED else ''}")
print(f"Embedding Model: {embedding_model} {'(Key Required)' if embedding_model in KEY_REQUIRED else ''}")
print(f"LLM Model: {llm_model} {'(Key Required)' if llm_model in KEY_REQUIRED else ''}")
print(f"Data Source: {data_source}")
print(f"PDF Folder: {pdf_folder}")
print(f"Text Folder: {text_folder}")
print(f"URL Folder: {url_folder}")
print(f"History Output Folder: {history_output_folder}")

# Timestamp of this run, formatted as YYYYMMDD_HHMMSS
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("\n ", timestamp)

# STEP 1 (done): gather the configuration details —
# config.json is loaded, falling back to config.yaml.

Config file not found. Using defaults.

--- Loaded Configuration ---
Vector Database: chromadb 
Embedding Model: sentence-transformers/all-MiniLM-L6-v2 
LLM Model: gpt-3.5-turbo 
Data Source: pdf
PDF Folder: /content/drive/MyDrive/Colab Notebooks/data/pdfs
Text Folder: data/texts
URL Folder: data/urls
History Output Folder: output/history

  20250406_030549


In [None]:
!pip install chromadb==0.4.15
#!pip install chromadb   will be latest ChromaDB version: 1.0.0

import chromadb
print(f"ChromaDB version: {chromadb.__version__}")



Collecting chromadb==0.4.15
  Downloading chromadb-0.4.15-py3-none-any.whl.metadata (7.2 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.4.15)
  Downloading chroma_hnswlib-0.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb==0.4.15)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb==0.4.15)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb==0.4.15)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb==0.4.15)
  Downloading pulsar_client-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting onnxruntime>=1.14.1 (from chromadb==0.4.15)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-e

In [None]:



import chromadb

def clearFlushChromaDB(persist_directory="chromadb_store", collection_indices=(0, 1, 2)):
    """Delete collections from the persistent ChromaDB store.

    Args:
        persist_directory: Folder passed to ``chromadb.PersistentClient``.
        collection_indices: Positions, within the listing taken at the start
            of the call, of the collections to delete. Pass ``None`` to delete
            every collection.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Initialize the Chroma client
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    # Step 1: Snapshot the existing collections once. Re-listing after every
    # deletion shifts the remaining collections down, so later indices would
    # point at the wrong collection or fall out of range.
    collections = chroma_client.list_collections()
    print("Existing Collections:", collections)

    # Listing entries are treated as either objects exposing ``.name`` (as
    # queryChromaDB uses them) or plain name strings.
    collection_names = [getattr(entry, "name", entry) for entry in collections]

    # Step 2: Resolve which positions to delete (None means all of them)
    if collection_indices is None:
        collection_indices = range(len(collection_names))

    # Step 3: Delete the selected collections (each index at most once)
    for idx in dict.fromkeys(collection_indices):
        if 0 <= idx < len(collection_names):
            collection_name = collection_names[idx]
            print(f"Attempting to delete collection: {collection_name}")
            chroma_client.delete_collection(collection_name)
            print(f"Deleted collection: {collection_name}")
        else:
            print(f"Collection index {idx} is out of range. No deletion performed.")

    # Show what is left in the store
    remaining_collections = chroma_client.list_collections()
    print("Remaining Collections:", remaining_collections)

    return


#clearFlushChromaDB()  # Uncomment to clear the ChromaDB store (collections and their vector embeddings) so it can be rebuilt from scratch



Existing Collections: []
Collection index 0 is out of range. No deletion performed.
Collection index 1 is out of range. No deletion performed.
Collection index 2 is out of range. No deletion performed.
Remaining Collections: []


In [None]:
# Optional installs, kept here for reference (pypdf is included because it is
# often a dependency of PyPDFLoader). Run them in their own cell if needed:
#   !pip install langchain pypdf
#   !pip install -U langchain-community
#   !pip install pypdf

# The notebook reads its PDFs from Google Drive, so the drive has to be
# mounted first; Colab asks for permission when this runs.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:

import os
from langchain.document_loaders import PyPDFLoader

def load_pdf_documents(pdf_folder="/content/drive/MyDrive/Colab Notebooks/data/pdfs"):
    """Extract the text of every PDF file found directly inside ``pdf_folder``.

    Each file whose name ends in ``.pdf`` (case-insensitive) is read with
    ``PyPDFLoader``; the ``page_content`` of the returned documents is joined
    with newlines into one string per file.

    Args:
        pdf_folder: Directory to scan. Sub-directories are not descended into.

    Returns:
        dict mapping each PDF filename to its full extracted text, in sorted
        filename order.

    Raises:
        OSError: propagated from ``os.listdir`` when ``pdf_folder`` cannot be
            listed (for example when it does not exist).
    """
    pdf_documents = {}

    # os.listdir returns entries in arbitrary order; sorting makes every run
    # process (and later store) the files in the same order.
    for filename in sorted(os.listdir(pdf_folder)):
        pdf_path = os.path.join(pdf_folder, filename)

        # Skip non-PDF entries, and directories that merely end in ".pdf".
        if not filename.lower().endswith(".pdf") or not os.path.isfile(pdf_path):
            continue

        documents = PyPDFLoader(pdf_path).load()  # Returns a list of Document objects

        # Store plain text, not Document objects
        pdf_documents[filename] = "\n".join(doc.page_content for doc in documents)
        print(f"Loaded {len(documents)} pages from {filename}")

    print(f"✅ Total PDFs processed: {len(pdf_documents)}")
    return pdf_documents  # Dictionary {filename: "full text content"}

# Step 2: Extract text from the PDFs in the folder chosen by the configuration
# (`pdf_folder` is set in the config cell and defaults to the Drive PDF folder).
extracted_texts = load_pdf_documents(pdf_folder)




Loaded 3 pages from CMO-Pink-Sheet-March-2025.pdf
Loaded 216 pages from mcs2025.pdf
✅ Total PDFs processed: 2


Mounted at /content/drive


In [None]:
import os

# Sanity check: where the notebook is running and what that directory contains.
print("Current working directory:", os.getcwd())
print("Contents of current working directory:", os.listdir("."))


Current working directory: /content
Contents of 'data/pdfs': ['.config', 'chromadb_store', 'drive', 'sample_data']


In [None]:
#!pip install langchain

# CHUNK IT

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(extracted_texts, chunk_size=1000, chunk_overlap=200):
    """Break each document's text into overlapping pieces ready for embedding.

    Args:
        extracted_texts: dict mapping a filename to its full text.
        chunk_size: Size limit handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        dict mapping each filename to the list of chunks produced for it.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # One splitter instance is reused for every document.
    chunked_data = {
        filename: splitter.split_text(text)
        for filename, text in extracted_texts.items()
    }

    print(f"✅ Chunking completed. Processed {len(extracted_texts)} files.")
    return chunked_data  # Dictionary {filename: [chunks]}


# Step 3: Chunk the text extracted earlier (`extracted_texts` must already exist)
chunked_data = chunk_text(extracted_texts)
print("Total PDFs processed: {}".format(len(chunked_data)))

# STEP 3 THE DATA IS CHUNKED USING RecursiveCharacterTextSplitter, THIS IS CLEAR AND DONE
# - Splits the extracted text into smaller chunks for better processing.
# - Uses `RecursiveCharacterTextSplitter` from LangChain for efficient chunking.
# - Allows setting chunk size (default: 1000 characters) and overlap (default: 200 characters).
# - Processes each extracted PDF text and stores chunks in a dictionary.



✅ Chunking completed. Processed 2 files.
Total PDFs processed: 2


In [None]:
#!pip install chromadb langchain langchain_huggingface
# !pip install -U langchain-community
# CHROMADB EMBEDDING
#!pip install langchain_huggingface


import chromadb
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

from langchain_huggingface import HuggingFaceEmbeddings  # ✅ Correct import



def store_embeddings_in_chroma(
    chunked_data,
    persist_directory="chromadb_store",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=500,
):
    """Embed chunked text and store it in ChromaDB, one collection per file.

    Args:
        chunked_data: dict mapping a filename to its list of text chunks.
        persist_directory: Folder passed to ``chromadb.PersistentClient``.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.
        batch_size: Maximum number of chunks written per ``upsert`` call.

    Returns:
        The ``chromadb.PersistentClient`` the embeddings were written to.
    """
    # Named `embedder` so it does not shadow the notebook-level
    # `embedding_model` configuration string.
    embedder = HuggingFaceEmbeddings(model_name=model_name)

    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    batch_size = max(1, int(batch_size))

    for filename, chunks in chunked_data.items():
        if not chunks:
            # Nothing to embed; avoid creating an empty collection.
            print(f"⚠️ No chunks for {filename}; skipping.")
            continue

        # Strip a trailing ".pdf" regardless of case to get the collection name.
        collection_name = filename[:-4] if filename.lower().endswith(".pdf") else filename
        collection = chroma_client.get_or_create_collection(name=collection_name)
        embeddings = embedder.embed_documents(chunks)

        # Write in batches instead of one call per chunk. upsert (rather than
        # add) keeps the ids stable when this cell is run again on the same store.
        for start in range(0, len(chunks), batch_size):
            stop = min(start + batch_size, len(chunks))
            collection.upsert(
                ids=[f"{filename}_{i}" for i in range(start, stop)],
                metadatas=[{"source": filename, "chunk_index": i} for i in range(start, stop)],
                documents=chunks[start:stop],
                embeddings=embeddings[start:stop],
            )

    print(f"✅ Embeddings stored in ChromaDB at '{persist_directory}'.")
    return chroma_client


# Marker printed once the embedding helper above has been defined.
print("Functions for embedding >>")
# What the imports in this cell are for:
# - `chromadb` → Manages the local vector database for efficient similarity search.
# - `langchain.vectorstores.Chroma` → Provides a LangChain wrapper for ChromaDB
#   (imported in this cell; no use of it is visible in the cells shown here).
# - `langchain_huggingface.HuggingFaceEmbeddings` → Embeds text using a transformer model.
# - `PersistentClient` from `chromadb` → Ensures embeddings are stored persistently.


Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==1

In [None]:
import time
# Measure the embedding step with perf_counter: it is meant for timing
# intervals, whereas time.time() follows the system clock, which can be
# adjusted while the cell is running.
start_time = time.perf_counter()  # Record start


chroma_client = store_embeddings_in_chroma(chunked_data)
collections = chroma_client.list_collections()
print(f"🗂️ Total collections in ChromaDB: {len(collections)}")



end_time = time.perf_counter()  # Record end
elapsed = end_time - start_time
print(f"Time taken to store embeddings in ChromaDB: {elapsed:.2f} seconds")



# STEP 4: THE DATA IS EMBEDDED WITH HUGGING FACE AND STORED IN CHROMADB. THIS IS CLEAR AND DONE.

# - Initializes the `HuggingFaceEmbeddings` model to convert text chunks into vector embeddings.
# - Uses `chromadb.PersistentClient` to create or retrieve a persistent ChromaDB collection.
# - Iterates over chunked text data and:
#   - Embeds each chunk using the transformer model.
#   - Stores embeddings in ChromaDB with metadata (source file, chunk index).
# - Logs confirmation once embeddings are successfully stored.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embeddings stored in ChromaDB at 'chromadb_store'.
🗂️ Total collections in ChromaDB: 2
Time taken to store embeddings in ChromaDB: 188.67 seconds


In [None]:
import chromadb
from langchain_huggingface import HuggingFaceEmbeddings

def queryChromaDB(user_query, persist_directory="chromadb_store", top_k=3):
    """Search every ChromaDB collection for the chunks closest to ``user_query``.

    Args:
        user_query: Natural-language question to embed and search for.
        persist_directory: Folder passed to ``chromadb.PersistentClient``.
        top_k: Number of results requested from each collection, and the
            number of overall best matches returned.

    Returns:
        A list of up to ``top_k`` ``(text, metadata, distance)`` tuples sorted
        by ascending distance, or ``None`` when the store has no collections.
    """
    # Initialize ChromaDB client and bail out early when there is nothing to
    # search, before paying for loading the embedding model.
    chroma_client = chromadb.PersistentClient(path=persist_directory)
    collections = chroma_client.list_collections()

    if not collections:
        print("⚠️ No collections found in ChromaDB. Ensure embeddings are stored first.")
        return None

    # Embed the query with the same model name used when storing the chunks
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    query_embedding = embedding_model.embed_query(user_query)

    # Search across all collections
    results = []
    for entry in collections:
        # Accept both objects exposing ``.name`` and plain name strings.
        collection_name = getattr(entry, "name", entry)
        collection = chroma_client.get_collection(name=collection_name)
        search_results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

        # Index [0] selects the result lists for the single query embedding.
        results.extend(
            zip(
                search_results["documents"][0],
                search_results["metadatas"][0],
                search_results["distances"][0],
            )
        )

    # Sort results by similarity (lower distance is better)
    results.sort(key=lambda item: item[2])
    top_results = results[:top_k]

    # Display top results
    print("\n🔍 Query Results:")
    for rank, (text, metadata, distance) in enumerate(top_results, start=1):
        print(f"\nResult {rank}:\nSource: {metadata['source']} (Chunk {metadata['chunk_index']})\nDistance: {distance:.4f}\n{text}\n")

    return top_results  # Return top matches


# Run a sample query against the stored embeddings and show the raw tuples returned
user_query = "Tell me something unique about commodities"
query_results = queryChromaDB(user_query, "chromadb_store", 3)
print(query_results)


🔍 Query Results:

Result 1:
Source: mcs2025.pdf (Chunk 5)
Distance: 0.9069
U.S. Geological Survey, 2025, Mineral commodity summaries 2025 (ver. 1.2, March 2025): U.S. Geological Survey, 212 p., 
https://doi.org/10.3133/mcs2025. 
Associated data for this publication: 
U.S. Geological Survey, 2025, Data release for mineral commodity summaries 2025: U.S. Geological Survey data release, 
https://doi.org/10.5066/P13XCP3R. 
ISBN  978-1-4113-4595-9
CONTENTS 
General: Page Page 
Introduction .................................................................... 3 
Figure 1—The Role of Nonfuel Mineral Commodities in 
the U.S. Economy ....................................................... 4 
Significant Events, Trends, and Issues .......................... 5 
Figure 2—2024 U.S. Net Import Reliance ..................... 7 
Figure 3—Leading Import Sources (2020–23) of  
Nonfuel Mineral Commodities .................................... 8 
Table 1—U.S. Mineral Industry Trends ........................

In [None]:
import textwrap

def format_query_results_for_llm(query_results, user_query, max_chars=1000, chunk_width=300):
    """
    Trims and formats the ChromaDB query results into a prompt for an LLM.

    Each result is whitespace-collapsed and shortened to ``chunk_width``
    characters. When shortening dropped text, the excerpt is cut back to the
    last sentence boundary (a period followed by a space, or a trailing
    period); if there is none, the ``...`` placeholder is kept. Results are
    added until their combined length would exceed ``max_chars``.

    Parameters:
    - query_results: List of (text, metadata, distance) tuples from ChromaDB.
      ``None`` or an empty list yields a prompt with no retrieved entries.
    - user_query: The original user query string.
    - max_chars: Maximum combined length of the formatted entries.
    - chunk_width: Maximum length of each excerpt before sentence trimming.

    Returns:
    - formatted_prompt: The final structured prompt for the LLM.
    """
    placeholder = "..."
    formatted_text = []
    total_length = 0

    # queryChromaDB returns None when the store has no collections.
    for text, metadata, _distance in (query_results or []):
        source = metadata.get("source", "Unknown Source")
        chunk_index = metadata.get("chunk_index", "N/A")

        # textwrap.shorten collapses whitespace first, so compare against the
        # collapsed text to tell whether anything was actually dropped.
        collapsed_text = " ".join(text.split())
        trimmed_text = textwrap.shorten(text, width=chunk_width, placeholder=placeholder)

        if trimmed_text != collapsed_text and trimmed_text.endswith(placeholder):
            # Look for the sentence end in the text *before* the placeholder;
            # searching the whole string would always find the dots of "...".
            # Periods inside numbers or URLs are ignored; abbreviations such
            # as "U.S. " still count as a boundary.
            body = trimmed_text[: -len(placeholder)]
            last_period = len(body) - 1 if body.endswith(".") else body.rfind(". ")
            if last_period != -1:
                trimmed_text = body[: last_period + 1]

        entry = f"🔹 **Source:** {source} (Chunk {chunk_index})\n   {trimmed_text}"
        total_length += len(entry)

        if total_length > max_chars:
            break  # Stop if we exceed the limit

        formatted_text.append(entry)

    # Construct the final structured prompt
    formatted_prompt = (
        "You are an expert in finance and commodities. Based on the following retrieved information, "
        "provide a unique insight about commodities.\n\n"
        f"📌 **User Query:** \"{user_query}\"\n\n"
        "🔎 **Retrieved Information:**\n"
        + "\n\n".join(formatted_text) +
        "\n\n📝 **Task:** Summarize key insights from the retrieved information and provide a unique perspective about commodities."
    )

    return formatted_prompt

# Example: turn the query results from the previous cell into an LLM prompt
user_query = "Tell me something unique about commodities"
formatted_prompt = format_query_results_for_llm(query_results=query_results, user_query=user_query)
print(formatted_prompt)


You are an expert in finance and commodities. Based on the following retrieved information, provide a unique insight about commodities.

📌 **User Query:** "Tell me something unique about commodities"

🔎 **Retrieved Information:**
🔹 **Source:** mcs2025.pdf (Chunk 5)
   U.S. Geological Survey, 2025, Mineral commodity summaries 2025 (ver. 1.2, March 2025): U.S. Geological Survey, 212 p., https://doi.org/10.3133/mcs2025. Associated data for this publication: U.S. Geological Survey, 2025, Data release for mineral commodity summaries 2025: U.S. Geological Survey...

🔹 **Source:** CMO-Pink-Sheet-March-2025.pdf (Chunk 8)
   Sugar, U.S. $/kg b/ 0.79 0.89 0.84 0.95 0.89 0.84 0.81 0.83 0.81 0.80 0.82 Sugar, World $/kg b/ 0.41 0.52 0.45 0.54 0.49 0.43 0.43 0.45 0.44 0.40 0.42 continued on next page Annual Averages Quarterly Averages March 4, 2025 Monthly Averages - 1 - http://www.worldbank.org/commodities World Bank...

📝 **Task:** Summarize key insights from the retrieved information and provide 

In [None]:
!pip install transformers accelerate




In [None]:
import pickle

# Expects `formatted_prompt` to hold the output of format_query_results_for_llm.

# Write the prompt to disk so it can be reused later
output_file = "formatted_prompt.pkl"
with open(output_file, "wb") as f:
    pickle.dump(formatted_prompt, f)

print(f"Formatted prompt saved to: {output_file}")

# --- Later in your code or in a different Colab session ---

# Read the prompt back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself.
input_file = "formatted_prompt.pkl"
loaded_prompt = None
try:
    with open(input_file, "rb") as f:
        loaded_prompt = pickle.load(f)
    print(f"Formatted prompt loaded from: {input_file}")
except FileNotFoundError:
    print(f"Error: File not found at {input_file}")

# `loaded_prompt` can now stand in for the formatted prompt, for example:
# if loaded_prompt:
#     response = llm_pipeline(loaded_prompt, max_length=512, temperature=0.7, do_sample=True)
#     print("🤖 LLM Response:\n", response[0]["generated_text"])

Formatted prompt saved to: formatted_prompt.pkl
Formatted prompt loaded from: formatted_prompt.pkl


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# A smaller open-source model, chosen to be suitable for Colab
MODEL_NAME = "google/flan-t5-base"

# Model weights are placed on the CPU (change device_map when a GPU is used);
# the matching tokenizer is loaded alongside.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="cpu", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# T5-style models are served through the 'text2text-generation' pipeline task
llm_pipeline = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer)

# Rebuild the prompt from `query_results` and `user_query`, which must already
# exist from the earlier query cell
formatted_prompt = format_query_results_for_llm(query_results, user_query)

# Sample a response from the model
response = llm_pipeline(
    formatted_prompt,
    max_length=512,
    temperature=0.7,
    do_sample=True,
)

# Show the generated text of the first (only) returned sequence
print("🤖 LLM Response:\n", response[0]["generated_text"])

Device set to use cpu


🤖 LLM Response:
 The World Bank's  Global Value Indexes for the month of March is a reliable measure of the value of U.S. commodity and of the commodity markets from the year of its creation in the 1990s to the present.
