In [1]:
import os
import sys

# Absolute path of the sibling ``scripts`` directory, resolved relative to the
# notebook's current working directory.
root_path_scripts = os.path.abspath(os.path.join(os.getcwd(), '../scripts/'))

# Make the scripts directory importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if root_path_scripts not in sys.path:
    sys.path.append(root_path_scripts)

# Import the loader from the load_docs module made importable above
from load_docs import load_document

In [16]:
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
# from langchain.embeddings import SentenceTransformerEmbeddings
# from chromadb.utils import embedding_functions
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder  # Import CrossEncoder

import openai
from openai import OpenAI

# Populate the process environment from a .env file, if one is found
load_dotenv()

# Read the API key once and hand it to the client explicitly
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_client = OpenAI(api_key=openai_api_key)

In [4]:
# Load the document(s).
# The path is relative to the notebook's working directory; replace it with
# your actual file path. `file_path` is the single source of truth for what
# gets loaded (previously it was assigned but a separate literal was loaded).
file_path = "../data/Robinson Advisory.docx"
documents = load_document(file_path)

In [7]:

# Break the loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split_docs = text_splitter.split_documents(documents)

# Embed each chunk with the all-MiniLM-L6-v2 model and index the chunks in a
# Chroma vector store
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(embedding=embeddings, documents=split_docs)


In [8]:
# 5. Define the function to generate multiple related queries
def augment_multiple_query(query, model="gpt-4o-mini"):
    """Ask the chat model for additional questions related to ``query``.

    Args:
        query: The user's original question; sent verbatim as the user message.
        model: Name of the chat model passed to the completions endpoint.

    Returns:
        list[str]: One suggested question per element, with surrounding
        whitespace stripped and blank lines removed. Empty when the model
        returns no text.
    """
    # Sentences are joined with a single space. Writing them as adjacent string
    # literals concatenates them with no separator, which glued sentences
    # together in the prompt (e.g. "contract.Suggest", "topic.Make").
    system_prompt = " ".join([
        "You are a helpful expert contract advisor assistant. Your users are asking questions about legal contract.",
        "Suggest up to ten additional related questions to help the user find the information they need based on the provided question.",
        "Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic.",
        "Make sure they are complete questions, and that they are related to the original question.",
        "Output one question per line. Do not number the questions.",
    ])
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    # The message content can be None; treat that as "no suggestions".
    content = response.choices[0].message.content or ""
    # Strip padding and drop empty lines so callers never receive blank queries.
    return [line.strip() for line in content.splitlines() if line.strip()]

In [11]:
original_query = "Is there a non-compete obligation to the Advisor?"

# 6. Generate multiple related queries
# original_query = "What is the termination notice?"
augmented_queries = augment_multiple_query(original_query)

# Search with the original question first, then every suggested question.
# Suggestions are stripped and blank lines are dropped so that empty or
# whitespace-padded strings are never used as retrieval queries.
queries = [original_query] + [q.strip() for q in augmented_queries if q.strip()]


What is a non-compete obligation?  
How is a non-compete clause enforced?  
What are the typical terms of a non-compete agreement?  
Can non-compete clauses vary by jurisdiction?  
What are the consequences of violating a non-compete?  
Are there any exceptions to non-compete agreements?  
How long do non-compete obligations last?  
What is the purpose of a non-compete obligation?  
Can an advisor negotiate a non-compete clause?  
What should I include in a non-compete agreement?
***************************************************************************
concatenated query
Is there a non-compete obligation to the Advisor?
What is a non-compete obligation?  
How is a non-compete clause enforced?  
What are the typical terms of a non-compete agreement?  
Can non-compete clauses vary by jurisdiction?  
What are the consequences of violating a non-compete?  
Are there any exceptions to non-compete agreements?  
How long do non-compete obligations last?  
What is the purpose of a non-compet

In [13]:
# Show the model's suggestions, then the full list of queries used for retrieval
print("generated query")
for suggestion in augmented_queries:
    print(suggestion)

print("/***************************************************************************/")

print("concatenated query")
for candidate in queries:
    print(candidate)

generated query
What is a non-compete obligation?  
How is a non-compete clause enforced?  
What are the typical terms of a non-compete agreement?  
Can non-compete clauses vary by jurisdiction?  
What are the consequences of violating a non-compete?  
Are there any exceptions to non-compete agreements?  
How long do non-compete obligations last?  
What is the purpose of a non-compete obligation?  
Can an advisor negotiate a non-compete clause?  
What should I include in a non-compete agreement?
/***************************************************************************/
concatenated query
Is there a non-compete obligation to the Advisor?
What is a non-compete obligation?  
How is a non-compete clause enforced?  
What are the typical terms of a non-compete agreement?  
Can non-compete clauses vary by jurisdiction?  
What are the consequences of violating a non-compete?  
Are there any exceptions to non-compete agreements?  
How long do non-compete obligations last?  
What is the purpo

In [15]:
# Verify the queries are in expected format
for idx, query in enumerate(queries):
    if not isinstance(query, str):
        raise TypeError(f"Query {idx} is not a string: {query}")

# 7. Perform similarity search individually for each query
# Chunk texts are de-duplicated in first-seen order (dict keys) rather than in
# a bare set: the iteration order of a set of strings differs between kernel
# sessions, which made the scores and indices printed by later cells
# impossible to reproduce. The result is a list, which later cells only
# iterate over.
ordered_chunks = {}
for query in queries:
    try:
        # Perform similarity search with score for each individual query
        results = vectorstore.similarity_search_with_score(query=query, k=10)
        # Combine results from all queries; the similarity score is not used here
        for doc, _score in results:
            if isinstance(doc.page_content, str):
                ordered_chunks.setdefault(doc.page_content, None)
            else:
                print(f"Document content is not a string: {doc.page_content}")
    except Exception as e:
        # Best effort: report the failing query and carry on with the rest
        print(f"Error during similarity search for query '{query}': {e}")

retrieved_documents = list(ordered_chunks)


In [19]:

# Materialise the retrieved chunk texts as a list so that each score below
# corresponds to a fixed position
unique_documents = list(retrieved_documents)

# Re-ranking step: score every [question, chunk] pair with a CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
pairs = [[original_query, chunk_text] for chunk_text in unique_documents]
scores = cross_encoder.predict(pairs)

# One score per line, in the same order as `unique_documents`
for score in scores:
    print(score)


0.020573858
-0.7033981
4.598853
-5.757897
-9.366277
-0.25545642
6.6623535
-6.467209
2.3123977
0.8625152
-6.8046427


In [21]:
import numpy as np

In [22]:
# Positions within `scores`, ordered from the highest score to the lowest
descending_order = np.argsort(scores)[::-1]

print("New Ordering:")
for o in descending_order:
    print(o)

New Ordering:
6
2
8
9
0
5
1
3
7
10
4


In [23]:
# Pair each cross-encoder score with the chunk it was computed for and sort
# best-first. The scores were predicted from `unique_documents`, so that exact
# list is zipped here; pairing with any other container would rely on it
# happening to iterate in the same order.
assert len(scores) == len(unique_documents), "scores and documents are out of sync; re-run the scoring cell"
ranked_documents = sorted(zip(scores, unique_documents), reverse=True, key=lambda x: x[0])

# Select top 5 documents after re-ranking
top_5_documents = [doc for score, doc in ranked_documents[:5]]

In [24]:

# Display the selected chunks, numbered from 1
print("Top 5 Re-ranked Documents:")
for position, chunk_text in enumerate(top_5_documents, start=1):
    print(f"Document {position}:\n{chunk_text}\n")


Top 5 Re-ranked Documents:
Document 1:
Non-Compete: During the term of engagement with the Company and for a period of 12 months thereafter, Advisor shall not be involved, as an employee, owner, contractor or otherwise, in any business, which competes with the Company’s Business, and shall not solicit and/or hire any employee and/or service provider of the Company, without the prior written consent of the Company.


Personnel: The Advisor may provide the Services only directly or through employees, contractors or agents (“Personnel”), which were specifically approved by the Company, and have committed in writing to confidentiality and ownership rights towards the Advisor and/or the Company, in a form restrictive enough to allow Advisor to comply and enforce the commitments under this undertaking (“Authorized Personnel”). Without limiting the foregoing, the Advisor is liable hereunder for any breach of this undertaking by any Advisor Personnel.

Document 2:
Advisor shall be solely respo

In [25]:

# 8. Define the RAG function using the re-ranked documents
def rag(query, retrieved_documents, model="gpt-4o-mini"):
    """Answer ``query`` with the chat model using only the supplied passages.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of text passages; they are joined with
            blank lines and placed in the prompt as the available information.
        model: Name of the chat model passed to the completions endpoint.

    Returns:
        The message content of the first completion choice.
    """
    context = "\n\n".join(retrieved_documents)

    system_prompt = (
        "You are a helpful expert contract advisor assistant. Your users are asking questions about information "
        "contained in a legal contract. Answer the user's question using only the provided information."
    )
    user_prompt = f"Question: {query}\n\nInformation:\n{context}\n\nAnswer:"

    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content

# 9. Generate the final answer using the RAG function with re-ranked top 5 documents
Answer = rag(original_query, top_5_documents)

# Heading and answer on separate lines
print("Answer:", Answer, sep="\n")


Answer:
Yes, there is a non-compete obligation to the Advisor. According to the provided contract information, the Advisor is prohibited from being involved in any business that competes with the Company’s business during the term of engagement and for a period of 12 months thereafter, without the prior written consent of the Company. This includes not soliciting or hiring any employee or service provider of the Company.
