In [21]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import textwrap
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import re

In [4]:
# Read the Q&A workbook; header=1 takes the column names from the sheet's second row
try:
    df = pd.read_excel('Database Requirements.xlsx', header=1)
    print("Data imported successfully!")
    print(df.head())  # Preview the first 5 rows
except FileNotFoundError:
    # The workbook is not at the given relative path
    print("Error: The file was not found. Please check the file path.")
except Exception as load_error:
    # Any other failure is reported rather than raised
    print(f"An error occurred: {load_error}")

Data imported successfully!
   No.                                           Question  \
0    1    How do I know if my tenancy agreement is valid?   
1    2  What should I do before signing a tenancy agre...   
2    3  Can my landlord enter my property without notice?   
3    4  What are the consequences of not paying rent o...   
4    5    How can I terminate my tenancy agreement early?   

                                Answer / Explanation Relevant Clause ID(s)  \
0  Must be in writing, signed by both parties, an...                  C001   
1  Review all terms carefully; ensure clarity; co...                  C001   
2  Landlord must give reasonable notice (usually ...                  C002   
3  Late payment may result in penalties, eviction...                  C003   
4  Early termination allowed only if clause exist...                  C004   

                                    Legal Commentary  \
0  Only agreements containing essential terms are...   
1  Legal review reduces 

In [5]:
# --- Configuration ---
# Module-level settings for the Q&A retrieval pipeline defined below.
FILE_NAME = 'Database Requirements.xlsx' # Excel workbook that holds the Q&A rows
MODEL_NAME = 'all-MiniLM-L6-v2' # A fast and effective embedding model
K = 3 # Number of top results to retrieve

# --- 1. Data Preparation and Chunking ---
def load_and_chunk_data(file_path):
    """Load the Q&A workbook and turn every row into one text chunk.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        list[str]: One chunk per row. Each chunk holds four labelled lines
        (question, answer/explanation, legal context, regulation/source)
        joined by newlines, with no leading indentation.

    Raises:
        KeyError: If the sheet has rows but lacks one of the four columns
            read below.

    If ``file_path`` does not exist, an error message is printed and a
    single hard-coded example row is used instead, so the demo still runs.
    """
    try:
        # header=1 takes the column names from the sheet's second row
        # (the pandas default, header=0, would use the first row).
        df = pd.read_excel(file_path, header=1)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please create a dummy file or ensure the path is correct.")
        # Create a dummy DataFrame if the file isn't found for demonstration
        df = pd.DataFrame({
            'Question': ["How do I know if my tenancy agreement is valid?"],
            'Answer / Explanation': ["Must be in writing, signed by both parties, and include essential terms."],
            'Legal Commentary': ["Only agreements containing essential terms are enforceable."],
            'Government Regulation / Explanation': ["N/A"]
        })

    chunks = []
    for _, row in df.iterrows():
        # Build the chunk from explicit lines instead of dedenting an indented
        # f-string: textwrap.dedent() removes nothing when an interpolated cell
        # value contains an un-indented line break, which used to leave the
        # source-code indentation inside the chunk text.
        lines = [
            f"Question: {row['Question']}",
            f"Answer/Explanation: {row['Answer / Explanation']}",
            f"Legal Context: {row['Legal Commentary']}",
            f"Regulation/Source: {row['Government Regulation / Explanation']}",
        ]
        chunks.append("\n".join(lines).strip())

    return chunks

# --- 2. Indexing and Embedding ---
def create_faiss_index(chunks, model):
    """Embed ``chunks`` with ``model`` and store the vectors in a flat L2 Faiss index.

    Args:
        chunks: Texts to embed; handed unchanged to ``model.encode``.
        model: Embedding model. Its ``encode`` result must expose ``.shape``
            with the vector dimension at position 1.

    Returns:
        The ``faiss.IndexFlatL2`` holding one vector per chunk.
    """
    print("Generating embeddings and creating Faiss index...")
    vectors = model.encode(chunks)

    # The second axis of the encoder output gives the index dimensionality
    dimension = vectors.shape[1]

    # Vectors are cast to float32 before they are handed to the index
    float_vectors = np.array(vectors).astype('float32')

    # Flat index that ranks neighbours by L2 (Euclidean) distance
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(float_vectors)

    print(f"Index created with {flat_index.ntotal} documents.")
    return flat_index

# --- 3. Retrieval and Generation (Simulated) ---
def retrieve_context(query, index, chunks, model, k):
    """Return the text chunks whose embeddings are closest to ``query``.

    Args:
        query: User question; it is embedded with ``model.encode``.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``, built over the embeddings of ``chunks``.
        chunks: Texts, positioned so that ``chunks[i]`` matches index id ``i``.
        model: Embedding model exposing ``encode``.
        k: Maximum number of chunks to retrieve.

    Returns:
        list[str]: The matching chunks in the order ``index.search`` returned
        them. Fewer than ``k`` chunks are returned when the index cannot
        supply ``k`` neighbours.
    """
    # Encode the user query
    query_vector = model.encode([query]).astype('float32')

    # Only the neighbour ids are needed; the distances are discarded
    _distances, neighbour_ids = index.search(query_vector, k)

    # Faiss pads the id row with -1 when it finds fewer than k neighbours.
    # chunks[-1] would silently return the last chunk as a bogus extra hit,
    # so negative ids are dropped.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

def generate_response_rag(query, retrieved_context):
    """
    Stand-in for the LLM generation step of the pipeline.

    No model is called here: the query and each retrieved chunk are printed
    so the retrieval result can be inspected. Nothing is returned.
    """
    divider = "-" * 30

    print("\n--- RAG Output (Simulated LLM) ---")
    print(f"User Query: {query}")
    print(divider)

    # List the chunks that would be supplied to an LLM, numbered from 1
    print("Retrieved Context Chunks (LLM Input):")
    for i, passage in enumerate(retrieved_context):
        print(f"\n[Chunk {i+1}]")
        print(passage)

    print("\n\n*Note: A real LLM would now synthesize these chunks into a single, cohesive answer.*")

In [6]:
# --- Configuration ---
# NOTE: these three constants repeat the values assigned in the previous cell's
# configuration block; keep both copies in sync when changing one of them.
FILE_NAME = 'Database Requirements.xlsx'
MODEL_NAME = 'all-MiniLM-L6-v2' # A fast and effective embedding model
K = 3 # Number of top results to retrieve

# --- Main RAG Pipeline Execution ---
# Runs the Q&A pipeline end to end: load rows -> embed and index -> retrieve -> print.
if __name__ == '__main__':
    # 1. Load Data and create Chunks (one text chunk per spreadsheet row)
    data_chunks = load_and_chunk_data(FILE_NAME)

    # 2. Initialize Model and Create Index
    # The same model instance embeds the chunks here and the query in step 3.1.
    embedding_model = SentenceTransformer(MODEL_NAME)
    faiss_index = create_faiss_index(data_chunks, embedding_model)

    # 3. Test the RAG system with a query
    user_query = "What rules apply to my landlord entering my rental?"
    
    # 3.1 Retrieval: fetch the K chunks closest to the query
    context = retrieve_context(user_query, faiss_index, data_chunks, embedding_model, K)
    
    # 3.2 Generation (Simulated): prints the query and the retrieved chunks
    generate_response_rag(user_query, context)

Generating embeddings and creating Faiss index...
Index created with 53 documents.

--- RAG Output (Simulated LLM) ---
User Query: What rules apply to my landlord entering my rental?
------------------------------
Retrieved Context Chunks (LLM Input):

[Chunk 1]
Question: Can my landlord enter my property, what is considered reasonable notice, and can I refuse entry?
Answer/Explanation: Landlord must give reasonable notice, typically 24 hours. Entry without notice is allowed only in emergencies. Tenants generally cannot refuse entry if notice is valid and purpose is legitimate.
Legal Context: Court balances landlord's access right with tenant’s quiet enjoyment.
Regulation/Source: HDB/URA – Entry must respect tenant's occupancy rights; emergency access allowed

[Chunk 2]
Question: Can my landlord enter my property without notice?
Answer/Explanation: Landlord must give reasonable notice (usually 24h), except emergencies. It is recommended to document the notice period in the agreement.
L

In [22]:
# 1. Define the directory where your PDFs are stored
DATA_DIRECTORY = "./TA_data" 

# 2. Use DirectoryLoader to find all PDF files and apply the PyPDFLoader
loader = DirectoryLoader(
    path=DATA_DIRECTORY,
    glob="**/*.pdf",  # This pattern searches for all files ending in .pdf recursively
    loader_cls=PyPDFLoader,
    show_progress=True # Optional: shows a progress bar
)

# 3. Load all documents at once
print("Loading documents...")
all_documents = loader.load()
print(f"Loaded {len(all_documents)} document pages.")

# 4. Chunk the combined list of documents
# Separators are listed from coarsest to finest. Because is_separator_regex=True
# is passed below, every entry (including the plain ones) is treated as a regex.
custom_separators = [
    "\n\n",  # Prioritize splitting by new paragraphs/clauses (standard)
    r"\n\s*[A-Z]+\s+\d*\s*\.",  # Line starting with an uppercase word, whitespace, optional digits, then "." (e.g. "SECTION 2.")
    r"\n\s*\d+\.\d*\s*",     # Line starting with a number and a dot, optional sub-number: "1.", "1.1", "2. "
    r"\n\s*\([a-zA-Z0-9]+\)\s*", # Matches "(a) ", "(b) ", "(1) ", "(ii) " (Letter/number in parentheses, space)
    "\n",    # Then split by single newlines (lines)
    " ",     # Then split by spaces (words)
    ""       # Finally, fall back to splitting by character
]

# Chunks target at most 1000 units with 200 units of overlap between neighbours
# (presumably characters, the splitter's default length function - TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200,
    separators=custom_separators,
    is_separator_regex=True
)
all_chunks = text_splitter.split_documents(all_documents)
print(f"Split into {len(all_chunks)} chunks.")

# 5. Create the Vector Store
# All chunks from every loaded PDF are embedded and stored in one FAISS index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")
vectorstore = FAISS.from_documents(all_chunks, embeddings)

print("FAISS index created successfully with all documents.")

# Convert the vector store into a retriever
# search_kwargs={"k": 3} means it will retrieve the top 3 most similar chunks
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

Loading documents...


  0%|          | 0/7 [00:00<?, ?it/s]Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
 43%|████▎     | 3/7 [00:00<00:00, 28.91it/s]Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 5 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object

Loaded 26 document pages.
Split into 108 chunks.
FAISS index created successfully with all documents.


In [25]:
# Testing the retriever with a sample query
query = "Give me an example of the obligation to pay rent clause?"

# The retriever takes the query, embeds it, searches the FAISS index, 
# and returns the top 'k' most relevant Document objects.
retrieved_docs = retriever.invoke(query)

# Display the content of the retrieved chunks
print(f"Retrieved {len(retrieved_docs)} chunks.")
for i, doc in enumerate(retrieved_docs):
    print(f"\n--- Document Chunk {i+1} ---")
    # Print at most the first 1000 characters; note "..." is appended even when nothing was cut off
    print(doc.page_content[:1000] + "...")

Retrieved 3 chunks.

--- Document Chunk 1 ---
2. The Tenant hereby agrees with the Landlord as follows:
(a) RENT
To pay the Rent at the times and in the manner aforesaid without any deduction whatsoever....

--- Document Chunk 2 ---
4. The Landlord hereby agrees with the Tenant as follows:
(a) QUIET ENJOYMENT
The Tenant paying the Rents hereby reserved, performing and observing the terms and conditions herein contained
shall peaceably hold and enjoy the Premises during the tenancy without any interruption by the Landlord or any
person rightfully claiming under or in trust for the Landlord.
(b) PAYMENT OF PROPERTY TAX
To pay all property tax, rates and assessments in respect of the Premises other than those agreed to be paid by the
Tenant herein....

--- Document Chunk 3 ---
(h) TO INDEMNIFY THE LANDLORD
To be responsible for and to indemnify the Landlord from and against all claims and demands and against damage
occasioned to the Premises or any adjacent or neighboring premises or inju

In [24]:
# Inspect one chunk by position: index 9 is the 10th chunk
chunk_index = 9
if chunk_index >= len(all_chunks):
    # The splitter produced too few chunks to reach this position
    print(f"There are less than {chunk_index + 1} chunks.")
else:
    specific_chunk = all_chunks[chunk_index]
    print(f"--- CHUNK {chunk_index} ---")
    print(specific_chunk.page_content)
    print(f"Metadata: {specific_chunk.metadata}")

--- CHUNK 9 ---
7.2 In respect of any change in the particulars, immigration status or employment status of the Tenant or the occupier(s), the Te nant 
shall inform the Landlord of the same in writing not less than fourteen (14) days prior to such change. If the change cannot be 
anticipated, the Tenant shall inform the Landlord as soon as practicable upon knowledge of such change. 
7.3 Where the Tenant notifies the Landlord of a change in occupiers of the Premises, the Landlord is required to conduct all the 
necessary due diligence checks on the original NRIC/passports/passes/permits, acquire  copies and conduct validity checks of the 
passes and identity documents of the new occupiers. 
8. No alteration or additional work is allowed in the premises without The Landlord’s permission.
9. The Tenant is not allowed to sublet the premises without The Landlord’s permission.
10. The premises / room shall not be occupied by more than 2 persons.
Metadata: {'producer': 'macOS Version 15.6 (Bu

In [None]:
# Step 1: Ingestion and Clause identification. 
## User TA clauses: Split the user-uploaded TA into meaningful clauses

# Step 2: Clause Comparison. RAG for Tenancy Agreement
## Vector store: RAG 1 - Ideal Clauses from reference TAs
## Compare and retrieve: Content of user clause compared to most relevant ideal clauses
## LLM evaluation: Use LLM to compare user clause and ideal clauses 

# Step 3: Post-Feedback Q and A. RAG for Q and A
## Vector store: RAG 2 - Q and A from Database Requirements.xlsx
## User interaction