In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [109]:
# imports for langchain, plotly and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [None]:
load_dotenv(override=True)
os.environ["OPEN_API_KEY"] = os.getenv("OPEN_API_KEY")
print(os.getenv("OPEN_API_KEY"))

In [None]:
#LOADING THE DOCUMENTS
folders = glob.glob("test_documents/*")
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.*", loader_cls=PyPDFLoader)
    folder_docs = loader.load()
    print(f"Loading {doc_type} documents...")
    print(f"Found {len(folder_docs)} documents in {doc_type} folder.")
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)
    

In [None]:
print(f"Total documents loaded: {len(documents)}")
print(documents[4].metadata)

In [None]:
# SPLITTING THE DOCUMENTS
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # initialized a splitter
chunks = text_splitter.split_documents(documents)  # split the documents into smaller chunks
print(f"Total chunks created: {len(chunks)}")
print(chunks[4])


In [None]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(doc_types)}")

In [None]:
# creating embeddings
embeddings = OpenAIEmbeddings(
    openai_api_key="" + os.getenv("OPEN_API_KEY"), #creating a embedding function using OpenAI API key
)
if os.path.exists(db_name):
    print(f"Loading existing vector store from {db_name}...")
    vector_store = Chroma(persist_directory=db_name, embedding_function=embeddings)

In [None]:

# creating a vector store
vector_store = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vector store created with {len(vector_store)} chunks.")

In [None]:
collection = vector_store._collection
sample = collection.get(limit=1, include=["embeddings", "metadatas"])["embeddings"][0]
print(sample)

In [None]:
# trying to see if we can visualize the embeddings

result = collection.get(include=["embeddings", "metadatas", "documents"])
vectors = np.array(result['embeddings'])

documents = result['documents']

# print(f"Total vectors retrieved: {len(vectors)}")
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
source_types = [metadata['source'] for metadata in result['metadatas']]


#assigniing colors based on document types
colors = [['red', 'blue'][['judgements', 'primary_sources'].index(t)] for t in doc_types]
print(colors)

['blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue',

In [114]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(source_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [115]:
#
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [112]:
query = "what should i do if a package that i got deliverred was an empty box?"
results = vector_store.similarity_search(query, k=3)
for i, doc in enumerate(results):
    print(f"Result {i+1}:\n{doc.page_content}\n")

Result 1:
which amounts to deficiency in service as the delivery was not as per the promises made by the
opposite parties. This is also the violation of policy of Amazon of Free Open Box delivery, which
includes the Open Box Inspection at the time of delivery to ensure the delivery of same product in
proper order. This service is available on pre-paid schedule delivery orders for TV, microwave,
washing machine and refrigerator products. In case the product found to be defective or otherwise
damage, the customer can refused to take the delivery can refund will be credited to his account. The
order of the complainant was also covered under the same policy having 10 days time to return the
item but despite that the delivery was not taken back and the complainant was informed by the
executive of opposite party No.1 that the complainant was not eligible to the OAKTER Smart Home
Kit and he has not clicked the same at the time of purchase of TV. It is pertinent to mention here

Result 2:
thro

In [None]:
"""
The following does not work right now
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, api_key=os.getenv("OPEN_API_KEY"))

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vector_store.as_retriever()

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

query = "what should i do if a package that i got deliverred was an empty box?"
result = conversation_chain.invoke({"question": query})
print(result["answer"])
"""

'\n# create a new Chat with OpenAI\nllm = ChatOpenAI(temperature=0.7, model_name=MODEL, api_key=os.getenv("OPEN_API_KEY"))\n\n# Alternative - if you\'d like to use Ollama locally, uncomment this line instead\n# llm = ChatOpenAI(temperature=0.7, model_name=\'llama3.2\', base_url=\'http://localhost:11434/v1\', api_key=\'ollama\')\n\n# set up the conversation memory for the chat\nmemory = ConversationBufferMemory(memory_key=\'chat_history\', return_messages=True)\n\n# the retriever is an abstraction over the VectorStore that will be used during RAG\nretriever = vector_store.as_retriever()\n\n# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory\nconversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)\n\nquery = "what should i do if a package that i got deliverred was an empty box?"\nresult = conversation_chain.invoke({"question": query})\nprint(result["answer"])\n'


The class `LLMChain` was deprecated in LangChain 0.1.17 and will be removed in 1.0. Use :meth:`~RunnableSequence, e.g., `prompt | llm`` instead.


This class is deprecated. Use the `create_stuff_documents_chain` constructor instead. See migration guide here: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain/


The class `ConversationalRetrievalChain` was deprecated in LangChain 0.1.17 and will be removed in 1.0. Use :meth:`~create_history_aware_retriever together with create_retrieval_chain (see example in docstring)` instead.



ValidationError: 2 validation errors for ConversationalRetrievalChain
question_generator
  Field required [type=missing, input_value={'retriever': VectorStore...ecretStr('**********'))}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
llm
  Extra inputs are not permitted [type=extra_forbidden, input_value=ChatOpenAI(client=<openai...SecretStr('**********')), input_type=ChatOpenAI]
    For further information visit https://errors.pydantic.dev/2.11/v/extra_forbidden