In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [2]:
#Imports for langchain
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings,ChatOpenAI
from langchain_chroma import Chroma
#from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain.document_loaders import PyMuPDFLoader

In [3]:
# Notebook-wide settings
model = "gpt-4o-mini"    # chat model name handed to ChatOpenAI
db_name = "vector_db"    # directory in which Chroma persists the vector store

In [4]:
# Load environment variables from the .env file (if one is present)
load_dotenv()

# The original placeholder fallback is kept, but a missing key is now reported right here
# instead of only surfacing later when the first OpenAI request is made.
if not os.getenv("OPENAI_API_KEY"):
    print(
        "WARNING: OPENAI_API_KEY is not set in the environment or in .env; "
        "using the placeholder 'your-key'. Requests to OpenAI will most likely be rejected "
        "until a real key is provided."
    )
    os.environ["OPENAI_API_KEY"] = "your-key"

In [5]:
# Collect every supported file under the dataset directory into `documents`.
# The name of each first-level folder is recorded on its documents as metadata["doc_type"].
folders = glob.glob("../data/contract_dataset/*")
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    for file_path in glob.glob(f"{folder}/**/*.*", recursive=True):
        print(file_path)
        ext = os.path.splitext(file_path)[-1].lower()
        try:
            if ext == ".json":
                # JSON files are stored verbatim as one Document and need no loader.
                # `continue` is essential here: without it execution would fall through to
                # loader.load() below, which either fails (no loader defined yet) or re-loads
                # the previous file and appends its documents a second time.
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                documents.append(Document(page_content=content, metadata={"source": file_path, "doc_type": doc_type}))
                continue

            if ext in [".txt", ".md", ".csv"]:
                loader = TextLoader(file_path)
            elif ext == ".pdf":
                loader = PyMuPDFLoader(file_path)  # Use PDF-compatible loader
            else:
                print(f"Skipping unsupported file: {file_path}")
                continue

            # A loader may return several Document objects for one file; tag each of them.
            folder_docs = loader.load()
            for doc in folder_docs:
                doc.metadata["doc_type"] = doc_type
                documents.append(doc)
        except Exception as e:
            # Best effort: report the failing file and carry on with the remaining ones.
            print(f"Error loading {file_path}: {e}")

../data/contract_dataset/contracts/Schedule A – Service Credits.pdf
../data/contract_dataset/contracts/zoom_msa.pdf
../data/contract_dataset/policies/procurement_policy.txt


In [6]:
# Number of Document objects loaded so far (a loader may emit more than one per file,
# so this is not necessarily the number of files).
len(documents)

3

In [7]:
# Sanity check: display the first loaded Document (its metadata and page_content).
documents[0]

Document(metadata={'source': '../data/contract_dataset/contracts/Schedule A – Service Credits.pdf', 'file_path': '../data/contract_dataset/contracts/Schedule A – Service Credits.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.4', 'title': 'Schedule A – Service Credits', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Skia/PDF m138 Google Docs Renderer', 'creationDate': '', 'modDate': '', 'trapped': '', 'doc_type': 'contracts'}, page_content='Zoom guarantees a monthly uptime of 99.9%. In the event of SLA breaches, the following \nservice credits will apply: \n \n- Uptime 99.5% to 99.9%: 5% monthly service credit   \n- Uptime 99.0% to 99.5%: 10% monthly service credit   \n- Uptime below 99.0%: 20% monthly service credit   \n \nCredits are calculated based on the monthly invoice value. \n \nClaim Process: \n- Customer must submit a claim within 30 days of the reported SLA breach. \n- Zoom will validate the claim and apply the service credit to the following bi

### I am not chunking the documents for now, as they look small. Let's see later whether chunking is required.

In [8]:
# Distinct "doc_type" values found across the loaded documents.
doc_types = {doc.metadata["doc_type"] for doc in documents}
# A set has no defined iteration order, so sort before joining to keep the printed
# line identical from one kernel run to the next.
print(f"Document types:{', '.join(sorted(doc_types))}")

Document types:policies, contracts


In [9]:
## Embedding function used by the vector store to associate a vector embedding with each chunk.
## No model argument is passed, so the library's default embedding model is used.
embeddings = OpenAIEmbeddings()

In [10]:
# Start from scratch: when a Chroma data store is already present on disk,
# open it and delete its collection before the new one is built.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

In [11]:
# Embed all loaded documents and persist the resulting vectors under `db_name`.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with: {stored_count} documents.")

Vectorstore created with: 3 documents.


In [12]:
 # Fetch a single stored vector and report how many dimensions it has
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
print(sample_embedding)
dimentions = len(sample_embedding)
print(f"The vector have {dimentions:,} dimentions.")

[ 0.00985571 -0.03031264  0.01611385 ... -0.00599824  0.01104578
 -0.0127625 ]
The vector have 1,536 dimentions.


In [13]:
# Chat model that generates the answers (the model name comes from `model`)
llm = ChatOpenAI(model=model, temperature=0.7)

# Conversation memory, exposed to the chain under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Retriever: the abstraction over the vector store that the RAG chain queries
retriever = vectorstore.as_retriever()

# Wire the LLM, the retriever and the memory together into one conversational chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

  memory = ConversationBufferMemory(memory_key='chat_history',return_messages=True)



### 🔍 Contract Understanding Questions
🔍 Contract Understanding Questions
📄 Zoom MSA
What is the effective date of the Zoom agreement?

Does the Zoom contract auto-renew? If so, what are the terms?

What notice period is required to terminate the Zoom agreement?

What is the SLA uptime commitment mentioned in the Zoom MSA?

What happens if Zoom breaches the SLA?

How long does a customer have to claim service credits?

Does the Zoom MSA mention GDPR compliance?

Is there any reference to a document not included in the contract?

📄 Schedule A
What level of service credit is available if Zoom’s uptime falls to 99.2%?

What must a customer do to claim a service credit?

📜 Procurement Policy Alignment
Does the Zoom contract meet the procurement policy requirement for termination notice?

Does the Zoom MSA comply with the GDPR clause requirement from the procurement policy?

Is the Zoom contract aligned with SLA breach penalty expectations in the procurement policy?

Does the policy specify a timeline for contract review before renewal?

Does the procurement policy prohibit auto-renewals longer than 12 months?





In [14]:
## Gradio Magic
def chat(message, history):
    """Answer a single chat message with the conversational retrieval chain.

    Args:
        message: The user's question; sent to the chain under the "question" key.
        history: Accepted as the second positional argument but not read here;
            only `message` is forwarded to the chain.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [15]:
# Build the chat UI around `chat`, then launch it; `view` keeps whatever launch() returns.
chat_ui = gr.ChatInterface(chat)
view = chat_ui.launch()



* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.
