## 📄 Hybrid RAG Pipeline: CUAD + Uploaded PDF

In [1]:
# Install necessary packages
!pip install -q pymupdf faiss-cpu sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
! unzip /content/data/full_contract_txt.zip

Archive:  /content/data/full_contract_txt.zip
   creating: full_contract_txt/
  inflating: full_contract_txt/2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt  
  inflating: full_contract_txt/ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEMENT.txt  
  inflating: full_contract_txt/ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.txt  
  inflating: full_contract_txt/ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGREEMENT.txt  
  inflating: full_contract_txt/ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt  
  inflating: full_contract_txt/ADAPTIMMUNETHERAPEUTICSPLC_04_06_2017-EX-10.11-STRATEGIC ALLIANCE AGREEMENT.txt  
  inflating: full_contract_txt/ADIANUTRITION,INC_04_01_2005-EX-10.D2-RESELLER AGREEMENT.txt  
  inflating: full_contract_txt/ADMA BioManufacturing, LLC -  Amendment #3 to Manufacturing Agreement .txt  
  inflating: full_contract_txt/ADUROBIOTECH,INC_06_02_2020-EX-10.7-CONSULTING AGREEMENT(1

In [3]:
# Imports and configuration
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
from ollama import Client

# Embedding model and Ollama client
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
ollama_client = Client()

# Chunking strategy shared by every document that gets indexed
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Session state: which source is queried ("cuad" or "pdf") plus, for each
# source, its FAISS index and the list of text chunks backing that index.
session_state = dict(
    active_index="cuad",
    cuad_index=None,
    pdf_index=None,
    cuad_chunks=[],
    pdf_chunks=[],
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Preload CUAD index from multiple .txt files
def build_cuad_index(txt_dir="full_contract_txt"):
    """Chunk every ``*.txt`` file in ``txt_dir`` and index the chunks with FAISS.

    Args:
        txt_dir: Directory containing the contract text files.

    Returns:
        ``(index, chunks)`` where ``index`` is a ``faiss.IndexFlatL2`` and
        ``chunks`` is the list of chunk strings; row ``i`` of the index is the
        embedding of ``chunks[i]``. Returns ``(None, [])`` after printing an
        error when the directory is missing or yields no chunks.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    txt_dir_path = Path(txt_dir)
    if not txt_dir_path.exists():
        print(f"Error: Directory '{txt_dir}' not found.")
        return None, []

    all_texts = []
    # glob() yields files in filesystem order; sorting makes the chunk order
    # (and therefore the FAISS row ids) reproducible across runs and machines.
    for file in sorted(txt_dir_path.glob("*.txt")):
        text = file.read_text(encoding="utf-8")
        all_texts.extend(splitter.split_text(text))

    if not all_texts:
        print("Error: No text content found in the specified directory.")
        return None, []

    print(f"Processing {len(all_texts)} text chunks.")
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(
        embedding_model.encode(all_texts, show_progress_bar=True),
        dtype=np.float32,
    )
    print(f"Embeddings shape: {embeddings.shape}")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, all_texts

# Build or load
import pickle  # imported up front: both branches below use it

if os.path.exists("cuad_index.faiss") and os.path.exists("cuad_chunks.pkl"):
    print("🔄 Loading saved CUAD index...")
    session_state["cuad_index"] = faiss.read_index("cuad_index.faiss")
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cuad_chunks.pkl that this notebook wrote itself.
    with open("cuad_chunks.pkl", "rb") as f:
        session_state["cuad_chunks"] = pickle.load(f)
    print("✅ Loaded saved CUAD index.")
else:
    print("⚙️ Building CUAD index from .txt files...")
    session_state["cuad_index"], session_state["cuad_chunks"] = build_cuad_index()
    # build_cuad_index() reports failure by returning (None, []); stop here
    # with a clear message instead of handing None to faiss.write_index.
    if session_state["cuad_index"] is None:
        raise RuntimeError("CUAD index could not be built; see the error printed above.")
    faiss.write_index(session_state["cuad_index"], "cuad_index.faiss")
    with open("cuad_chunks.pkl", "wb") as f:
        pickle.dump(session_state["cuad_chunks"], f)
    print("✅ Built and saved CUAD index.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Processing 27018 text chunks.


Batches:   0%|          | 0/845 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Embeddings shape: (27018, 384)
✅ CUAD index ready.


In [7]:
# Persist the CUAD FAISS index and its chunk list to the working directory
import pickle

faiss.write_index(session_state["cuad_index"], "cuad_index.faiss")
Path("cuad_chunks.pkl").write_bytes(pickle.dumps(session_state["cuad_chunks"]))

print("✅ CUAD index and chunks saved.")

✅ CUAD index and chunks saved.


In [8]:
# Process a new uploaded PDF (simulate with a sample file)
def process_pdf_to_index(pdf_path="data/sample_contract.pdf"):
    """Extract the text of a PDF, chunk it, and index the chunks with FAISS.

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        ``(index, chunks)`` where ``index`` is a ``faiss.IndexFlatL2`` and
        ``chunks`` is the list of chunk strings; row ``i`` of the index is the
        embedding of ``chunks[i]``.

    Raises:
        ValueError: If no text chunks could be produced from the PDF
            (for example, when the pages carry no extractable text).
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)

    chunks = splitter.split_text(text)
    if not chunks:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when reading embeddings.shape[1].
        raise ValueError(f"No extractable text found in PDF: {pdf_path}")

    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(
        embedding_model.encode(chunks, show_progress_bar=True),
        dtype=np.float32,
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

# Index the sample PDF, then make it the active retrieval source
pdf_index, pdf_chunks = process_pdf_to_index()
session_state.update(pdf_index=pdf_index, pdf_chunks=pdf_chunks, active_index="pdf")
print("✅ PDF index created and set as active.")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ PDF index created and set as active.


  return forward_call(*args, **kwargs)


In [None]:
from google.colab import userdata
import google.generativeai as genai
import os

def retrieve_top_k(query, model, index, texts, k=3):
    """Return the chunks in ``texts`` that are nearest to ``query``.

    Args:
        query: Question string to embed.
        model: Object whose ``encode(list_of_str)`` returns one embedding per string.
        index: FAISS-style index whose ``search(vectors, k)`` returns
            ``(distances, indices)``.
        texts: Chunk strings; position ``i`` must correspond to index row ``i``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunk strings, in the order the index returned them.
        Empty when ``texts`` is empty or ``k`` is not positive.
    """
    if k <= 0 or not texts:
        return []
    # Never ask for more neighbours than there are chunks.
    k = min(k, len(texts))
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    # FAISS pads missing results with -1; indexing texts[-1] would silently
    # return the LAST chunk, so drop any id outside the valid range.
    return [texts[i] for i in indices[0] if 0 <= i < len(texts)]

def generate_answer(question, top_chunks, model_name="gemini-2.5-pro"):
    """Ask a Gemini model to answer ``question`` from the retrieved chunks.

    Args:
        question: The user's question.
        top_chunks: Retrieved context strings; they are joined with blank
            lines and placed in the prompt.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        The ``text`` attribute of the model response.
    """
    context = "\n\n".join(top_chunks)
    prompt = f"""You are a legal contract assistant. Answer the question below using only the given context.

Context:
{context}

Question: {question}

Answer:"""
    # The API key is read from the Colab secret store, never from the notebook.
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

    llm = genai.GenerativeModel(model_name)

    response = llm.generate_content(prompt)
    return response.text

# Ask a question against whichever index is currently active
question = "What is the governing law of this agreement?"

# Any value other than "pdf" falls back to the CUAD corpus.
source_key = "pdf" if session_state["active_index"] == "pdf" else "cuad"
top_chunks = retrieve_top_k(
    question,
    embedding_model,
    session_state[f"{source_key}_index"],
    session_state[f"{source_key}_chunks"],
)

answer = generate_answer(question, top_chunks)
print("🧠 Answer:", answer)

  return forward_call(*args, **kwargs)


🧠 Answer: Based on the context provided, Section 23, "Governing Law," states: "This Agreement will be governed in all respects by the laws of the State of Delaware, including its conflict with law provisions."
