## Step 2: Preprocessing — Convert PDF to Text

In [None]:
# Install necessary PDF parser
!pip install -q pymupdf

import fitz  # PyMuPDF
import os

def pdf_to_text(pdf_path, txt_output_path):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read (opened with ``fitz.open``).
        txt_output_path: Path of the text file to write. It is overwritten if
            it already exists; its parent directory must already exist.

    Returns:
        The text of all pages concatenated in page order.
    """
    # The context manager closes the document even if text extraction fails,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with repeated `+=`.
        text = "".join(page.get_text() for page in doc)
    with open(txt_output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return text

# Demo: convert the sample contract and preview the first 1000 characters.
os.makedirs("data/text", exist_ok=True)  # output folder for extracted text
pdf_text = pdf_to_text(
    "data/sample_contract.pdf",
    "data/text/sample_contract.txt",
)
preview = pdf_text[:1000]
print(preview)


## Step 3: Chunking Text

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Plain-text contract to split, and the splitter settings (sizes in characters).
CONTRACT_TXT_PATH = "/content/2ThemartComInc_19990826_10-12G_EX-10.10_6700288_EX-10.10_Co-Branding Agreement_ Agency Agreement.txt"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Read the whole contract into memory.
with open(CONTRACT_TXT_PATH, mode="r", encoding="utf-8") as contract_file:
    full_text = contract_file.read()

# Split the contract into overlapping chunks for embedding.
splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_text(full_text)

# Sanity check: how many chunks were produced, and what the first looks like.
print(f"Total chunks: {len(chunks)}")
print(chunks[0])




Total chunks: 39
CO-BRANDING AND ADVERTISING AGREEMENT

THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart").

1. DEFINITIONS.

(a) "CONTENT" means all content or information, in any medium, provided by a party to the other party for use in conjunction with the performance of its obligations hereunder, including without limitation any text, music, sound, photographs, video, graphics, data or software. Content provided by 2TheMart is referred to herein as "2TheMart Content" and Content provided by i-Escrow is referred to herein as "i-Escrow Content."


## Step 4: Embedding Chunks

In [None]:
!pip install -q sentence-transformers faiss-cpu

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

# Embed every chunk with the sentence-transformers model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, show_progress_bar=True)

# Create a flat L2 index whose width matches the embedding vectors,
# then add all chunk vectors to it.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)

# Persist the index to disk so it can be reloaded later.
os.makedirs("data/index", exist_ok=True)
faiss.write_index(index, "data/index/legal_faiss.index")
print("FAISS index created and saved.")


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


FAISS index created and saved.


## Step 5: Implement Retrieval-Augmented Generation (RAG)

In [None]:
!pip install -q openai

In [None]:

from google.colab import userdata
import google.generativeai as genai
import os

# Authenticate the Gemini client. The key is read from the Colab `userdata`
# secret named GOOGLE_API_KEY, so no key ever needs to be pasted into the notebook.
genai.configure(
    api_key=userdata.get('GOOGLE_API_KEY'),
)

# Gemini model used to generate the final answers.
llm = genai.GenerativeModel("gemini-2.5-pro")

def retrieve_top_k(query, model, index, texts, k=3):
    """Return the entries of ``texts`` whose vectors are closest to ``query``.

    Args:
        query: Question string to embed.
        model: Embedding model exposing ``encode(list_of_str)``.
        index: Vector index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        texts: Sequence of chunks; position ``i`` must correspond to vector
            ``i`` stored in ``index``.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` chunks, in the order returned by the index.
    """
    # FAISS only accepts float32 query matrices.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # Without this filter, texts[-1] would silently return the last chunk as a hit.
    return [texts[i] for i in indices[0] if i >= 0]

def generate_answer_gemini(question, top_chunks):
    """Ask the module-level Gemini model ``llm`` to answer ``question``.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        top_chunks: Iterable of context strings; they are joined with a blank
            line between them and placed under the ``Context:`` heading.

    Returns:
        The ``text`` attribute of the response from ``llm.generate_content``.
    """
    context = "\n\n".join(top_chunks)
    # Assemble the prompt piece by piece: instruction, context, question, answer cue.
    prompt = (
        "You are a legal contract assistant. "
        "Answer the question below using only the given context.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        "Answer:"
    )
    return llm.generate_content(prompt).text

In [None]:
# Run the full pipeline for one question: retrieve the closest chunks,
# then have Gemini answer from those chunks only.
question = "which category does this contract belongs to?"
top_chunks = retrieve_top_k(query=question, model=model, index=index, texts=chunks)
answer = generate_answer_gemini(question=question, top_chunks=top_chunks)

print("✅ Gemini's Answer:", answer)

✅ Gemini's Answer: Based on the context provided, this contract belongs to the category of an **Independent Contractor Agreement**.

Section 12.5 explicitly states: "The parties are independent contractors, and no agency, partnership, joint venture, employee- employer or franchisor-franchisee relationship is intended or created by this Agreement."
