In [1]:
import pandas as pd
import numpy as np
import textwrap
import re
# Core LangChain/HuggingFace RAG Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# SentenceTransformer is replaced by HuggingFaceEmbeddings(model_name=...)
# faiss is replaced by FAISS from langchain_community
# We will still need numpy for the internal dataframe processing
# We will use the already configured SentenceTransformer model name
# Model identifier handed to HuggingFaceEmbeddings below.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Passed as search_kwargs={"k": K} when each FAISS retriever is created.
K = 3 # Number of top results to retrieve

# Initialize the shared embedding model once; both retriever builders read this
# module-level object when they call FAISS.from_documents.
embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# --- RAG 1: Ideal Clauses (The 'Gold Standard' for Comparison) ---
def build_ideal_clauses_retriever(data_directory="./TA_data"):
    """
    Build the RAG 1 retriever over the reference ("ideal") tenancy agreement PDFs.

    Every PDF matched by ``**/*.pdf`` under ``data_directory`` is loaded with
    PyPDFLoader, split into chunks of at most 1000 characters (200 overlap)
    using clause-aware separators, embedded with the module-level ``embeddings``
    model and stored in a FAISS vector store.

    Args:
        data_directory: Folder that is searched for the reference PDFs.

    Returns:
        A retriever created from the FAISS vector store with
        ``search_kwargs={"k": K}``.

    Raises:
        ValueError: If no text chunks could be produced from ``data_directory``.
    """
    print("\n--- BUILDING RAG 1: IDEAL CLAUSES RETRIEVER ---")

    # 1. Load Documents
    loader = DirectoryLoader(
        path=data_directory,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True
    )
    all_documents = loader.load()
    print(f"Loaded {len(all_documents)} document pages.")

    # 2. Chunk Documents (using the custom, structure-aware splitter).
    # Separators are tried in order, so the most structural boundaries come first.
    custom_separators = [
        "\n\n",                          # blank line / paragraph break
        r"\n\s*[A-Z]+\s+\d*\s*\.",       # upper-case heading such as "ARTICLE 1."
        r"\n\s*\d+\.\d*\s*",             # numbered clause such as "1." or "1.2"
        r"\n\s*\([a-zA-Z0-9]+\)\s*",     # bracketed sub-clause such as "(a)" or "(1)"
        "\n", " ", ""
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=custom_separators,
        is_separator_regex=True
    )
    all_chunks = text_splitter.split_documents(all_documents)
    print(f"Split into {len(all_chunks)} chunks.")

    # Fail early with an actionable message instead of letting the vector store
    # construction fail on an empty input list.
    if not all_chunks:
        raise ValueError(
            f"No text chunks were produced from '{data_directory}'. "
            "Check that the directory exists and contains readable PDF files."
        )

    print("\n--- IDEAL CLAUSES CHUNK PREVIEW (First 3) ---")
    for i, chunk in enumerate(all_chunks[:3]):
        # Build the preview outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        preview = chunk.page_content[:200].replace("\n", " ")
        print(f"Chunk {i+1} (Length: {len(chunk.page_content)}):")
        # Print the first 200 characters to keep the output manageable
        print(f"  {preview}...")
        print(f"  Metadata: {chunk.metadata}")
        print("-" * 20)

    # 3. Create Vector Store and Retriever
    vectorstore = FAISS.from_documents(all_chunks, embeddings)
    print("FAISS index for Ideal Clauses created successfully.")

    # Return the retriever instance
    return vectorstore.as_retriever(search_kwargs={"k": K})

In [6]:
from langchain_core.documents import Document # Import required for conversion

# --- RAG 2: General Q&A (Excel Source) ---
def build_general_qa_retriever(file_path):
    """
    Build the RAG 2 retriever over the general Q&A spreadsheet.

    The Excel file is read with its second row as the header (``header=1``).
    Each row becomes one Document whose text lists the question, answer,
    legal commentary and regulation columns, and the Documents are embedded
    with the module-level ``embeddings`` model into a FAISS vector store.

    If ``file_path`` does not exist, a message is printed and a single built-in
    dummy row is used instead.

    Args:
        file_path: Path to the Excel workbook holding the Q&A rows.

    Returns:
        A retriever created from the FAISS vector store with
        ``search_kwargs={"k": K}``.

    Raises:
        ValueError: If no row contains any of the expected columns' values.
    """
    print("\n--- BUILDING RAG 2: GENERAL Q&A RETRIEVER ---")

    # 1. Load and process data (using the existing logic)
    try:
        df = pd.read_excel(file_path, header=1)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Using dummy data.")
        df = pd.DataFrame({
            'Question': ["How do I know if my tenancy agreement is valid?"],
            'Answer / Explanation': ["Must be in writing, signed by both parties, and include essential terms."],
            'Legal Commentary': ["Only agreements containing essential terms are enforceable."],
            'Government Regulation / Explanation': ["N/A"]
        })

    # (label written into the document text, spreadsheet column it is read from)
    field_columns = [
        ("Question", 'Question'),
        ("Answer/Explanation", 'Answer / Explanation'),
        ("Legal Context", 'Legal Commentary'),
        ("Regulation/Source", 'Government Regulation / Explanation'),
    ]

    # 2. Convert Data Rows into LangChain Documents
    documents = []
    for index, row in df.iterrows():
        values = [_qa_field_text(row, column) for _, column in field_columns]

        # A row with no usable field would only add a placeholder-only
        # document to the index, so it is skipped.
        if all(value == 'N/A' for value in values):
            continue

        # Join explicit lines instead of dedenting an indented f-string:
        # dedent stops removing the indentation as soon as an interpolated
        # cell value itself contains a line break.
        content = "\n".join(
            f"{label}: {value}"
            for (label, _), value in zip(field_columns, values)
        ).strip()

        # Create a LangChain Document
        doc = Document(
            page_content=content,
            metadata={"source": file_path, "row_index": index}
        )
        documents.append(doc)

    print(f"Created {len(documents)} General Q&A Documents.")

    if not documents:
        raise ValueError(
            f"No usable Q&A rows found in '{file_path}'. Expected columns: "
            + ", ".join(repr(column) for _, column in field_columns)
        )

    print("\n--- GENERAL Q&A CHUNK PREVIEW (First 3) ---")
    for i, doc in enumerate(documents[:3]):
        print(f"Chunk {i+1}:")
        print(doc.page_content)
        print(f"  Metadata: {doc.metadata}")
        print("-" * 20)

    # 3. Create Vector Store and Retriever
    vectorstore = FAISS.from_documents(documents, embeddings)
    print("FAISS index for General Q&A created successfully.")

    return vectorstore.as_retriever(search_kwargs={"k": K})


def _qa_field_text(row, column):
    """Return the cell at ``column`` as text, or 'N/A' when it is absent or blank (NaN/None)."""
    value = row.get(column, 'N/A')
    # Empty spreadsheet cells come back as NaN; without this check the literal
    # text "nan" would be embedded instead of the intended placeholder.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return 'N/A'
    return str(value)

In [None]:
# Testing RAG 1 and 2

# --- Configuration ---
FILE_NAME = 'Database Requirements.xlsx'
DATA_DIRECTORY = "./TA_data" 

# The shared `embeddings` model is created once in the setup cell (next to
# MODEL_NAME) and read by both builder functions, so it is deliberately not
# re-created here: loading the model a second time only costs time and memory.


def _print_retrieved_docs(docs):
    """Print the text length and the 'source' metadata of each retrieved document."""
    for doc in docs:
        print(f"  - Length: {len(doc.page_content)}. Source: {doc.metadata.get('source', 'Unknown')}")


if __name__ == '__main__':
    # 1. BUILD THE TWO RAG SYSTEMS
    ideal_clauses_retriever = build_ideal_clauses_retriever(DATA_DIRECTORY)
    general_qa_retriever = build_general_qa_retriever(FILE_NAME)

    print("\n✅ BOTH RAG SYSTEMS ARE READY.")
    print("--------------------------------------------------")

    # --- SIMULATE THE PIPELINE EXECUTION ---

    # --- PHASE 1: CLAUSE IDENTIFICATION & COMPARISON (RAG 1) ---
    print("\n--- PHASE 1: COMPARING USER AGREEMENT CLAUSES ---")

    # 1.1 Simulate a single clause from the user's uploaded document
    # In the full app, you would run the text_splitter on the user upload here
    user_clause_to_check = "(b) SECURITY DEPOSIT: The Tenant shall pay the Landlord a security deposit equal to $3,000. This deposit will not accrue interest."

    # 1.2 Use RAG 1 to retrieve the Ideal Clause context
    comparison_context = ideal_clauses_retriever.invoke(user_clause_to_check)

    # 1.3 LLM Call for Feedback (Simulated)
    print(f"\nUser Clause: {user_clause_to_check}")
    print("\n[SIMULATED LLM COMPARISON & FEEDBACK GENERATION]")
    print("Retrieved Ideal Context for LLM:")
    _print_retrieved_docs(comparison_context)

    # *A real LLM would now compare the user clause to this context and generate a report.*

    # --- PHASE 2: POST-FEEDBACK Q&A (RAG 2) ---
    print("\n\n--- PHASE 2: GENERAL Q&A ---")

    # 2.1 User enters a follow-up question
    user_qa_query = "What rules apply to my landlord entering my rental?"

    # 2.2 Use RAG 2 to retrieve the General Q&A context
    qa_context = general_qa_retriever.invoke(user_qa_query)

    # 2.3 LLM Call for Answer (Simulated)
    print(f"\nUser Q&A Query: {user_qa_query}")
    print("\n[SIMULATED LLM ANSWER GENERATION]")
    print("Retrieved General Q&A Context for LLM:")
    _print_retrieved_docs(qa_context)

    # *A real LLM would now synthesize these into a single, cohesive answer.*


--- BUILDING RAG 1: IDEAL CLAUSES RETRIEVER ---


  0%|          | 0/7 [00:00<?, ?it/s]Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
 57%|█████▋    | 4/7 [00:00<00:00, 22.32it/s]Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 5 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object

Loaded 26 document pages.
Split into 108 chunks.

--- IDEAL CLAUSES CHUNK PREVIEW (First 3) ---
Chunk 1 (Length: 903):
  RENTAL AGREEMENT PROBLEMATIC RENTAL TERMS LANDLORD: ABC Property Company TENANT: John Doe RENTAL AMOUNT: S$3,500 per month SECURITY DEPOSIT: S$17,500 (5 months rent) - EXCESSIVE! LEASE PERIOD: 12 mont...
  Metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-08-31T22:29:15+08:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-08-31T22:29:15+08:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'TA_data\\TA1.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}
--------------------
Chunk 2 (Length: 879):
  RESIDENTIAL TENANCY AGREEMENT (Singapore) 1. PARTIES TO THE AGREEMENT LANDLORD: Mr. Tan Ah Beng NRIC: S1234567A Address: 123 Orchard Road, #15-01, Singapore 238123 Contact: +65 9123 4567 TENANT: Ms. S...
  Metadata: {'producer': 'ReportLab PDF Library - 

In [None]:
# Step 1: Ingestion and Clause identification. 
## User TA clauses: Split the user-uploaded TA into meaningful clauses

# Step 2: Clause Comparison. RAG for Tenancy Agreement
## Vector store: RAG 1 - Ideal Clauses from reference TAs
## Compare and retrieve: Content of user clause compared to most relevant ideal clauses
## LLM evaluation: Use LLM to compare user clause and ideal clauses 

# Step 3: Post-Feedback Q and A. RAG for Q and A
## Vector store: RAG 2 - Q and A from Database Requirements.xlsx
## User interaction

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document # Needed to wrap the user text
from langchain_community.document_loaders import PyPDFLoader
import os # To handle file paths

def load_and_extract_pdf_text(file_path: str) -> str:
    """
    Checks that a file is a PDF, loads it, and extracts all text content.

    Pages are joined with a blank line and the whitespace is normalised so
    that line breaks and paragraph breaks are kept.

    Args:
        file_path: The local path to the user's uploaded file.

    Returns:
        A single string containing all text from the PDF.

    Raises:
        ValueError: If the path is not an existing file or does not end in '.pdf'.
        RuntimeError: If the PDF loader fails while reading the file; the
            original exception is attached as the cause.
    """
    print(f"\n--- Loading and Extracting Text from: {file_path} ---")

    # 1. Basic File Check (a directory is rejected as well)
    if not os.path.isfile(file_path):
        raise ValueError(f"Error: File not found at path: {file_path}")

    # 2. PDF Extension Check (Simple approach)
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Error: File is not a PDF ('.pdf' extension required).")

    # 3. Load all pages as a list of Document objects. Only the loader call is
    # wrapped, so unrelated errors are not relabelled as extraction failures.
    try:
        pages = PyPDFLoader(file_path).load()
    except Exception as e:
        raise RuntimeError(f"An error occurred during PDF text extraction: {e}") from e
    print(f"Successfully loaded {len(pages)} pages.")

    # 4. Concatenate all page content into a single string and tidy whitespace
    full_text = _normalize_extracted_text(
        "\n\n".join(page.page_content for page in pages)
    )

    print("Text extraction complete.")
    return full_text


def _normalize_extracted_text(text: str) -> str:
    """Collapse redundant whitespace while keeping line and paragraph breaks."""
    # Use "\n" as the only line terminator so the rules below see every break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse runs of spaces/tabs only. Matching "\s" here would also swallow
    # newlines and flatten every paragraph and clause boundary into one space.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Two or more line breaks (optionally separated by blanks) become exactly
    # one paragraph break.
    text = re.sub(r'(?:\n[^\S\n]*){2,}', '\n\n', text)
    return text

def split_user_document(user_uploaded_text: str, source_name: str = "User TA") -> list[Document]:
    """
    Splits the raw text of the user-uploaded tenancy agreement into clause-level chunks.

    The text is wrapped in a single Document and split with a recursive
    splitter (chunks of at most 1000 characters, 200 overlap) whose separators
    favour paragraph breaks and clause numbering over plain line breaks.

    Args:
        user_uploaded_text: The raw string content of the user's document.
        source_name: A metadata tag to identify the source (e.g., the filename).

    Returns:
        A list of LangChain Document objects, one for each clause/chunk, each
        carrying ``{"source": source_name}`` as metadata.
    """
    print(f"\n--- SPLITTING USER DOCUMENT: {source_name} ---")

    # Separators are tried in order, so the most structural boundaries come first.
    custom_separators = [
        "\n\n",                          # blank line / paragraph break
        r"\n\s*[A-Z]+\s+\d*\s*\.",       # upper-case heading such as "ARTICLE 1."
        r"\n\s*\d+\.\d*\s*",             # numbered clause such as "1." or "1.2"
        r"\n\s*\([a-zA-Z0-9]+\)\s*",     # bracketed sub-clause such as "(a)" or "(1)"
        "\n", " ", ""
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=custom_separators,
        is_separator_regex=True
    )

    # 1. Convert the single raw string into a list of Documents (one initial document)
    initial_document = [
        Document(page_content=user_uploaded_text, metadata={"source": source_name})
    ]

    # 2. Split the document based on the clause structure
    user_chunks = text_splitter.split_documents(initial_document)

    print(f"User TA split into {len(user_chunks)} clause-level chunks.")

    # Optional: Print a preview of the first few chunks
    for i, chunk in enumerate(user_chunks[:3]):
        # Build the preview outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        preview = chunk.page_content[:150].replace("\n", " ")
        print(f"  Chunk {i+1} (Length: {len(chunk.page_content)}): {preview}...")

    return user_chunks