In [1]:
import os
import re
import pandas as pd
from typing import List
from tqdm import tqdm
import uuid
from lancedb.pydantic import LanceModel, Vector
import lancedb

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings

# === Configuration ===
DB_PATH = "./lancedb_data"
TABLE_NAME = "company_docs"
GEMINI_API_KEY = "AIzaSyDxSbsQYWF1usd_31b8ujowFknAsu43SYQ"

In [2]:
embeddings_model: Embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="RETRIEVAL_QUERY",
    google_api_key=GEMINI_API_KEY,
)
def get_embeddings(text: str) -> List[float]:
    return embeddings_model.embed_documents([text])[0]
import lancedb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings
import os

# === Configuration ===
DB_PATH = "./Data"
embeddings_model: Embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="RETRIEVAL_QUERY",
    google_api_key=GEMINI_API_KEY,
)

# === Connect to LanceDB ===
db = lancedb.connect(DB_PATH)
company_table = db.open_table("company_docs")

In [4]:
def query_clause(query: str, k: int = 3):
    embedding = get_embeddings(query)
    print(f"\n🔍 Top {k} clause matching:\n➡️ {query}\n")

    # build the query, then execute with .to_list()
    builder = (
        company_table
        .search(embedding, vector_column_name="chunk_embedding")
        .limit(k)
    )
    results = builder.to_list()   # ← here we actually run the search :contentReference[oaicite:0]{index=0}

    for i, row in enumerate(results, start=1):
        print(row.keys())
        print(f"{i}. uuid: {row['uid']} title: {row['doc_title']} , chunk: {row['doc_chunk']}")
    return results
print(query_clause("Preliminary Confidentiality Agreement"))
print("-"*50)
print(query_clause("deliverables and time lines"))


🔍 Top 3 clause matching:
➡️ Preliminary Confidentiality Agreement

dict_keys(['uid', 'doc_title', 'doc_chunk', 'doc_title_embedding', 'chunk_embedding', 'sn', 'cn', 'snm', '_distance'])
1. uuid: 8dcb8ae3-b3d4-4a5e-ba1c-c732db0aaf49 title: Preliminary Confidentiality Agreement.txt , chunk: Fictitious Legal Document 1: Preliminary Confidentiality Agreement

PRELIMINARY CONFIDENTIALITY AGREEMENT

This Preliminary Confidentiality Agreement (the "Agreement") is made as of the 1st day of January, 2024 (the "Effective Date"), by and between AI LAW, a hypothetical technology company with its principal place of business in [Fictitious Location] ("AI LAW"), and Tech Innovators Inc., a hypothetical entity with its principal place of business in [Fictitious Location] ("Tech Innovators").
dict_keys(['uid', 'doc_title', 'doc_chunk', 'doc_title_embedding', 'chunk_embedding', 'sn', 'cn', 'snm', '_distance'])
2. uuid: bdc4405b-a384-4299-ba1c-950af76d5c5c title: Software Development Agreement.txt , chu

In [14]:
from neo4j import GraphDatabase

# Replace with your actual connection details
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"

# Cypher query to get the entire influence tree
QUERY = """
MATCH (target {uid: "31fec388-b312-4e0f-9e48-5127e9785c4d"})
OPTIONAL MATCH pathUp = (root)-[:INFLUENCES*]->(target)
WITH CASE 
        WHEN pathUp IS NULL THEN target
        ELSE head(nodes(pathUp))
     END AS root
MATCH tree = (root)-[:INFLUENCES*0..]->(descendants)
RETURN tree
"""

def fetch_influence_tree():
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    with driver.session() as session:
        result = session.run(QUERY)
        trees = [record["tree"] for record in result]
        return trees

# Example use
if __name__ == "__main__":
    tree_paths = fetch_influence_tree()
    for node in tree_paths[-1].nodes:
        print(f" -{node['title']}, {node['sn']}, {node['cn']}, {node['chunk']}")


 -Preliminary Confidentiality Agreement.txt, 1, 1.1, Definitions: 1.1 "Confidential Information" shall mean any and all information disclosed by the Disclosing Party to the Receiving Party, whether orally or in written, electronic, or other form, that is designated as confidential or that reasonably should be understood to be confidential given the nature of the information and the circumstances of its disclosure. Confidential Information may include, but is not limited to, business plans, technical data, product designs, financial information, customer lists, and marketing strategies.
 -Preliminary Confidentiality Agreement.txt, 2, 2.2, Confidentiality Obligations: 2.2 The Receiving Party shall not disclose any Confidential Information of the Disclosing Party to any third party without the prior written consent of the Disclosing Party, except to its employees, contractors, and advisors who have a need to know the information for the purpose of evaluating or carrying out the potential 

In [23]:
def query_clause(query: str, k: int = 3):
    embedding = get_embeddings(query)
    print(f"\n🔍 Top {k} clause matching:\n➡️ {query}\n")

    builder = (
        company_table
        .search(embedding, vector_column_name="chunk_embedding")
        .limit(k)
    )
    results = builder.to_list()

    output = []
    for i, row in enumerate(results, start=1):
        # print(row.keys())
        # print(f"{i}. uuid: {row['uid']} title: {row['doc_title']} , chunk: {row['doc_chunk']}")
        output.append({
            "id": row['uid'],
            "content": row['doc_chunk'],
            "title" : row["doc_title"] + " Section " + str(row["sn"]) + " Clause " + str(row["cn"]),
            "document_type" : "Agreement",
            "source" : row['doc_title'],
            "score" : row["_distance"]
        })
    return output

# Example usage

d = query_clause("deliverables and time lines")
for i in d:
    print(i)



🔍 Top 3 clause matching:
➡️ deliverables and time lines

{'id': '829e7afc-3703-4c68-89c1-ad6c3fb61478', 'content': ' Deliverables and Timelines: The project will be completed in phases, with specific milestones and deliverables for each phase as outlined in the project timeline attached hereto as Exhibit B and incorporated herein by reference. The section titled "Deliverables and Timelines" in the Original Agreement (as defined in Amendment No. 1) provides an overview of the timeline, but Exhibit B contains the detailed schedule.', 'title': 'Software Development Agreement.txt Section 3 Clause 1.0', 'document_type': 'Agreement', 'source': 'Software Development Agreement.txt', 'score': 0.5380796790122986}
{'id': '60204ea9-16d6-4a15-bf0f-5c945cadc527', 'content': ' Amendment to Section 3 (Deliverables and Timelines): Section 3 of the Original Agreement, titled "Deliverables and Timelines," is hereby amended to read as follows: "3. Deliverables and Timelines The project will now be comple

"content_preview": ref["chunk"][:300] if len(ref["chunk"])>300 else ref["chunk"],
                "title": " ".join(
                    part for part in [ref.get("part_title"), ref.get("chapter_title"), ref.get("section_title")]
                    if part and str(part).lower() != "nan"
                ).strip(),
                "document_type":"Law",
                "source": ref["source"],
                "score":ref["score"],
                "id":ref["id"],
                "content":ref["chunk"]

In [62]:
import pandas as pd
from typing import List
import json
from lancedb.pydantic import LanceModel, Vector
import lancedb
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings
import ast
import lancedb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings
import subprocess
from neo4j import GraphDatabase

# === Configuration ===
DB_PATH = "./Data"
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "12345678"
GEMINI_API_KEY = "AIzaSyDxSbsQYWF1usd_31b8ujowFknAsu43SYQ"
genai.configure(api_key=GEMINI_API_KEY)

embeddings_model: Embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="RETRIEVAL_QUERY",
    google_api_key=GEMINI_API_KEY,
)

# === Connect to LanceDB ===
db = lancedb.connect(DB_PATH)
company_table = db.open_table("company_docs")


def get_embeddings(text: str) -> List[float]:
    return embeddings_model.embed_documents([text])[0]


def query_clause(query: str, k: int = 3):
    embedding = get_embeddings(query)
    # print(f"\n🔍 Top {k} clause matching:\n➡️ {query}\n")

    # build the query, then execute with .to_list()
    builder = (
        company_table
        .search(embedding, vector_column_name="chunk_embedding")
        .limit(k)
    )
    results = builder.to_list()   # ← here we actually run the search :contentReference[oaicite:0]{index=0}
    uids = []
    for i, row in enumerate(results, start=1):
        # print(row.keys())
        # print(f"{i}. uuid: {row['uid']} title: {row['doc_title']} , chunk: {row['doc_chunk']}")
        uids.append(row['uid'])
    return uids, results

def uid_llm_filter(query, dicts):
    prompt = f"""
you are a professional legal assistant. given a user query and a list of internal company document clauses, return those documents uids whose content mathces the query or may have implacation
for or from it. In case no exact mathces return uid for closest match. Return a list of the uids directly.
user query  : {query}
docs: {json.dumps(dicts)}
"""
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    return ast.literal_eval(response.text)

def js_driver(uids):
    args = ["node", "modified_neo4j_driver.js"] + uids
    subprocess.Popen(args)

def get_linked_clauses(uids, results):
    QUERY = """
    MATCH (target {uid: $uid})
    OPTIONAL MATCH pathUp = (root)-[:INFLUENCES*]->(target)
    WITH CASE 
            WHEN pathUp IS NULL THEN target
            ELSE head(nodes(pathUp))
         END AS root
    MATCH tree = (root)-[:INFLUENCES*0..]->(descendants)
    RETURN root.uid AS root_uid, tree
    """
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    collected = []
    seen_uids = set()
    root_uids = set()

    for uid in uids:
        with driver.session() as session:
            result = session.run(QUERY, uid=uid)
            for record in result:
                root_uid = record["root_uid"]
                root_uids.add(root_uid)
                tree_path = record["tree"]
                for node in tree_path.nodes:
                    node_uid = node.get("uid")
                    if node_uid not in seen_uids:
                        seen_uids.add(node_uid)
                        collected.append({
                            "uid": node_uid,
                            "id": node_uid,
                            "title": str(node.get("title")) + " Section " + str(node.get("sn")) + " clause " + str(node.get("cn")),
                            "content_preview": node.get("chunk")[:100] if node.get("chunk") and len(node.get("chunk")) > 100 else node.get("chunk"),
                            "content": node.get("chunk"),
                            "document_type": "Internal Agreement",
                            "source": str(node.get("title")),
                            "score": next((d['_distance'] for d in results if d.get('uid') == node_uid), 0.74)
                        })

    return list(root_uids), collected

def uids_from_query(query):
    uids, results = query_clause(query)
    uids = uid_llm_filter(query, results)
    uids, results = get_linked_clauses(uids, results)
    js_driver(uids)
    return results

In [63]:
results = uids_from_query("oral information exchange")

In [65]:
results

[{'uid': '21a08869-5534-46fd-a36a-5a06bb237424',
  'id': '21a08869-5534-46fd-a36a-5a06bb237424',
  'title': 'Preliminary Confidentiality Agreement.txt Section 2 clause 2.1',
  'content_preview': 'Confidentiality Obligations: 2.1 The Receiving Party agrees to protect the Confidential Information ',
  'content': 'Confidentiality Obligations: 2.1 The Receiving Party agrees to protect the Confidential Information of the Disclosing Party with the same degree of care that it uses to protect its own confidential information of like nature, but in no event less than a reasonable degree of care.',
  'document_type': 'Internal Agreement',
  'source': 'Preliminary Confidentiality Agreement.txt',
  'score': 0.7590070962905884},
 {'uid': 'da8c914e-2571-4248-81cb-fac9dbff3d42',
  'id': 'da8c914e-2571-4248-81cb-fac9dbff3d42',
  'title': 'Preliminary Confidentiality Agreement.txt Section 3 clause 1.0',
  'content_preview': ' Exclusions: The obligations under Section 2 shall not apply to any informatio

In [47]:
uids = ['66355a67-7a32-4f22-bc74-8ef4aa8816c8', '21a08869-5534-46fd-a36a-5a06bb237424']
def js_driver(uids):
    args = ["node", "modified_neo4j_driver.js"] + uids
    subprocess.Popen(args)
js_driver(uids)
print("control returned to python")

control returned to python


Selector wait complete
Executed query for UID: 66355a67-7a32-4f22-bc74-8ef4aa8816c8
Executed query for UID: 21a08869-5534-46fd-a36a-5a06bb237424
