<a href="https://colab.research.google.com/github/frans-nekongo/docs/blob/main/RAG_OldMutual_gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the system-level packages needed for OCR.
# NOTE(review): presumably poppler-utils backs pdf2image and tesseract-ocr backs
# pytesseract - confirm against those libraries' install notes.
!apt-get update && apt-get install -y poppler-utils tesseract-ocr

# Install Python libraries
# pytesseract, pillow, pdf2image: OCR of rasterised PDF pages
# pypdf: PDF text extraction (NOTE(review): not imported by any cell visible in this notebook - confirm it is still needed)
# google-generativeai: legacy Gemini SDK, imported below as `google.generativeai`;
#   the embedding/upload cell uses `google-genai` (`from google import genai`) instead
# supabase: client for the Supabase database; pinned to 2.6.0 here, but a later
#   cell runs `pip install -U ... supabase`, which overrides this pin
# tqdm: progress bars
# langchain-text-splitters: dedicated package providing RecursiveCharacterTextSplitter
# langchain, langchain-community: NOTE(review): only `langchain_core` is imported
#   directly in the visible cells - presumably it arrives as a dependency; confirm
!pip install -q pytesseract pillow pdf2image pypdf tqdm \
               google-generativeai supabase==2.6.0 \
               langchain-community langchain langchain-text-splitters

# === Part 0: Mount Google Drive and Define Data Directory ===
import os

print("Mounting Google Drive...")
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted.")

# Root folder on Google Drive that holds the input files for this notebook
DATA_ROOT_DIR = "/content/drive/MyDrive/RAGBook"
print(f"Using data directory: {DATA_ROOT_DIR}")

# Show what is in the folder, or warn when the folder is missing
if os.path.exists(DATA_ROOT_DIR):
    print(f"Contents of {DATA_ROOT_DIR}:")
    for entry_name in os.listdir(DATA_ROOT_DIR):
        print(f"- {entry_name}")
else:
    print(f"Warning: The specified data directory does not exist: {DATA_ROOT_DIR}. Please create it.")

# === Part 1: Imports ===
import json
import google.generativeai as genai
from supabase import create_client
# Updated import to use the dedicated package
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pdf2image import convert_from_path
import pytesseract
from tqdm import tqdm

# === Part 3: Load and Process the Book (with OCR integration) ===

def extract_text_from_pdf_with_ocr(pdf_path, dpi=300):
    """
    OCR every page of a PDF and wrap each page's text in a LangChain Document.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution passed to `convert_from_path` when rendering pages.

    Returns:
        A list with one Document per page. Each Document's metadata holds
        "source" (the given pdf_path) and "page" (0-based page index). A page
        whose OCR fails is kept as a Document with empty content. If the PDF
        cannot be converted, or anything else fails, an empty list is returned.
    """
    print(f"Extracting text from {pdf_path} with OCR...")
    try:
        page_images = convert_from_path(pdf_path, dpi=dpi)
        print(f"Converted {len(page_images)} pages to images.")

        page_docs = []
        for page_index, page_image in enumerate(page_images):
            page_number = page_index + 1  # human-readable number, used only in log lines
            print(f"Processing page {page_number}...")
            try:
                page_doc = Document(
                    page_content=pytesseract.image_to_string(page_image),
                    metadata={"source": pdf_path, "page": page_index},
                )
            except Exception as ocr_error:
                # Keep a placeholder so the page sequence has no gaps.
                print(f"Error processing page {page_number} with OCR: {ocr_error}")
                page_doc = Document(
                    page_content="",
                    metadata={"source": pdf_path, "page": page_index},
                )
            page_docs.append(page_doc)
    except Exception as e:
        print(f"Error converting PDF to images or during OCR: {e}")
        return []

    print("Text extraction complete.")
    return page_docs

# Locate the source PDF inside the Drive data directory
pdf_filename = 'OMP.pdf'
pdf_path = os.path.join(DATA_ROOT_DIR, pdf_filename)

# Run OCR when the file is present; `documents` stays empty otherwise
documents = []
if not os.path.exists(pdf_path):
    print(f"PDF file not found at {pdf_path}")
else:
    documents = extract_text_from_pdf_with_ocr(pdf_path)
    if documents:
        print(f"Successfully extracted text from {len(documents)} pages.")
    else:
        print(f"Could not extract text from {pdf_path}. Please check the file and try again.")

# Break the per-page documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
splits = text_splitter.split_documents(documents)
print(f"Created {len(splits)} text chunks after splitting.")

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 14.2 kB/129 k                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 14.2 kB/129 k0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 31.5 kB/129 k                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 31.5 kB/129 k0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to cloud.r-pro0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to cloud.r-pro                                                                               Hit:4 https://ppa.l

In [None]:
# Install the new Google GenAI SDK (`google-genai`, imported in the next cell as
# `from google import genai`) and upgrade Supabase to resolve dependency conflicts.
# Neither package is pinned here, so the installed versions depend on when this
# cell runs; the recorded output of this cell shows supabase resolving to 2.24.0.
# NOTE(review): the first cell pinned supabase==2.6.0 and already imported it, so
# a runtime restart may be needed for the upgraded version to take effect - confirm.
!pip install -U google-genai supabase

Collecting supabase
  Using cached supabase-2.24.0-py3-none-any.whl.metadata (4.6 kB)
Collecting httpx<1.0.0,>=0.28.1 (from google-genai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting websockets<15.1.0,>=13.0.0 (from google-genai)
  Using cached websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting realtime==2.24.0 (from supabase)
  Using cached realtime-2.24.0-py3-none-any.whl.metadata (7.0 kB)
Collecting storage3==2.24.0 (from supabase)
  Using cached storage3-2.24.0-py3-none-any.whl.metadata (2.1 kB)
Collecting postgrest==2.24.0 (from supabase)
  Using cached postgrest-2.24.0-py3-none-any.whl.metadata (3.4 kB)
Using cached supabase-2.24.0-py3-none-any.whl (16 kB)
Using cached postgrest-2.24.0-py3-none-any.whl (21 kB)
Using cached realtime-2.24.0-py3-none-any.whl (22 kB)
Using cached storage3-2.24.0-py3-none-any.whl (19 kB)
Using cached httpx-0.28.1-py3-none-any.whl 

In [None]:
# === Part 4: Configure Google Gemini and Supabase for Embeddings and Upload ===

from google.colab import userdata
import json
from tqdm import tqdm
# Import the new SDK
from google import genai
from google.genai import types
from supabase import create_client

# Read the three credentials from Colab Secrets (all are fetched before any check runs)
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
SUPABASE_URL_SECRET = userdata.get('SUPABASE_URL')
SUPABASE_SERVICE_ROLE_KEY_SECRET = userdata.get('SUPABASE_SERVICE_ROLE_KEY')

# Stop at the first empty credential, checking in the order they were read
for _secret_name, _secret_value in (
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    ("SUPABASE_URL", SUPABASE_URL_SECRET),
    ("SUPABASE_SERVICE_ROLE_KEY", SUPABASE_SERVICE_ROLE_KEY_SECRET),
):
    if not _secret_value:
        raise ValueError(f"{_secret_name} not found in Colab Secrets. Please add it.")
del _secret_name, _secret_value  # do not leave loop temporaries in the notebook namespace

# Gemini client from the new GenAI SDK
client = genai.Client(api_key=GOOGLE_API_KEY)

# Supabase client, authenticated with the service-role key
supabase = create_client(SUPABASE_URL_SECRET, SUPABASE_SERVICE_ROLE_KEY_SECRET)

# Embedding settings
EMBED_MODEL = "gemini-embedding-001"  # model name sent to embed_content
EMBED_DIMS = 3072                     # requested output_dimensionality
BATCH_SIZE = 100                      # records embedded and inserted per request

# Supabase table that receives the rows
TABLE_NAME = "oldMutualDocs"

def build_records(chunks, doc_id):
    """
    Turn split documents into plain dict records ready for embedding and upload.

    Each chunk's metadata is copied before it is adjusted, so the input chunks
    are never modified. The previous in-place update shifted every page number
    by one more each time this cell was re-run on the same `splits`.

    Args:
        chunks: Iterable of objects exposing `page_content` and `metadata`
            (a dict or None), such as the Documents produced by the text splitter.
        doc_id: Identifier stored in every record's "doc_id" field.

    Returns:
        A list of dicts with keys "doc_id", "content" and "metadata". In the
        metadata, an integer "page" is converted from 0-based to 1-based; a
        missing or non-integer "page" becomes the string "N/A".
    """
    built = []
    for chunk in chunks:
        # Shallow copy is enough: only the top-level "page" key is replaced.
        meta = dict(chunk.metadata or {})
        page = meta.get("page")
        meta["page"] = page + 1 if isinstance(page, int) else "N/A"
        built.append({
            "doc_id": doc_id,
            "content": chunk.page_content,
            "metadata": meta
        })
    return built


# Prepare records for insertion from LangChain Document 'splits'
print("Preparing records for insertion into Supabase...")
records = []
# Ensure 'splits' is available from previous cells
if 'splits' not in locals():
    print("Error: 'splits' variable not found. Please run the previous cells to process the PDF.")
else:
    records = build_records(
        splits,
        pdf_filename if 'pdf_filename' in locals() else "unknown.pdf"
    )

# Batch-embed with Gemini and insert into Supabase.
# Each batch is embedded in one request and inserted in one request. A batch that
# fails at either step is reported and skipped; later batches still run.
print(f"Processing {len(records)} records in batches of {BATCH_SIZE} for embedding and upload to '{TABLE_NAME}'...")
inserted_count = 0
failed_batches = []  # 0-based indices of batches that were not inserted

for i in tqdm(range(0, len(records), BATCH_SIZE), desc="Uploading to Supabase"):
    batch_index = i // BATCH_SIZE
    batch_records = records[i:i+BATCH_SIZE]
    texts_to_embed = [r["content"] for r in batch_records]

    try:
        # Generate embeddings using the new Client SDK
        embed_response = client.models.embed_content(
            model=EMBED_MODEL,
            contents=texts_to_embed,
            config=types.EmbedContentConfig(
                output_dimensionality=EMBED_DIMS,
                task_type="RETRIEVAL_DOCUMENT"
            )
        )
        # One embedding object per input text; `.values` holds the vector
        embeddings_list = [e.values for e in embed_response.embeddings]
    except Exception as e:
        print(f"\nError getting embeddings for batch {batch_index}: {e}")
        failed_batches.append(batch_index)
        continue

    # zip() below would silently drop rows on a count mismatch, so refuse the batch instead.
    if len(embeddings_list) != len(batch_records):
        print(f"\nEmbedding count mismatch for batch {batch_index}: "
              f"got {len(embeddings_list)} embeddings for {len(batch_records)} texts. Skipping batch.")
        failed_batches.append(batch_index)
        continue

    # Dimension check, first batch only (warning, not fatal)
    if i == 0 and embeddings_list and len(embeddings_list[0]) != EMBED_DIMS:
        print(f"Warning: Model returned {len(embeddings_list[0])} dimensions, expected {EMBED_DIMS}.")

    # Prepare rows for Supabase insertion
    # NOTE(review): metadata is sent as a JSON-encoded string. If the column is
    # json/jsonb this may store a string scalar rather than an object - confirm
    # against the table schema before changing.
    rows_to_insert = [
        {
            "doc_id": r_data["doc_id"],
            "content": r_data["content"],
            "embedding": emb_vec,
            "metadata": json.dumps(r_data["metadata"])
        }
        for r_data, emb_vec in zip(batch_records, embeddings_list)
    ]

    # Insert batch into Supabase
    try:
        insert_response = supabase.table(TABLE_NAME).insert(rows_to_insert).execute()
    except Exception as e:
        print(f"\nError inserting batch {batch_index} into Supabase: {e}")
        failed_batches.append(batch_index)
        continue

    # Also honour an `error` attribute when the response object exposes one.
    insert_error = getattr(insert_response, 'error', None)
    if insert_error:
        print(f"\nSupabase insert error for batch {batch_index}: {insert_error}")
        failed_batches.append(batch_index)
    else:
        inserted_count += len(rows_to_insert)

print(f"\nData upload complete. Successfully inserted {inserted_count} records into Supabase table '{TABLE_NAME}'.")
if failed_batches:
    print(f"Warning: {len(failed_batches)} batch(es) were not inserted (batch indices: {failed_batches}). See the errors above.")

Preparing records for insertion into Supabase...
Processing 178 records in batches of 100 for embedding and upload to 'oldMutualDocs'...


Uploading to Supabase: 100%|██████████| 2/2 [00:05<00:00,  2.84s/it]


Data upload complete. Successfully inserted 178 records into Supabase table 'oldMutualDocs'.



