In [2]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [4]:
MODEL = 'gpt-4o-mini'
db_name = 'vector_db'

In [5]:
folders = glob.glob("week5/knowledge-base/*")

In [6]:
folders

['week5/knowledge-base/products',
 'week5/knowledge-base/contracts',
 'week5/knowledge-base/company',
 'week5/knowledge-base/employees']

In [7]:
text_loader_kwargs = {'encoding': 'utf-8'}

In [14]:
documents = []

In [15]:
for folder in folders:
    doc_type = os.path.basename(folder)
    #print(doc_type)
    loader = DirectoryLoader(folder, glob = "**/*.md", loader_cls = TextLoader, loader_kwargs = text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [16]:
len(documents)

31

In [19]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1088, which is longer than the specified 1000


In [23]:
chunks[4]

Document(metadata={'source': 'week5/knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content='Experience the future of reinsurance with Rellm, where innovation meets reliability. Let Insurellm help you navigate the complexities of the reinsurance market smarter and faster.')

In [24]:
chunks[4].metadata

{'source': 'week5/knowledge-base/products/Rellm.md', 'doc_type': 'products'}

In [25]:
chunks[4].metadata['doc_type']

'products'

In [27]:
doc_types = set()
for items in chunks:
    doc_types.add(items.metadata['doc_type'])

In [28]:
doc_types

{'company', 'contracts', 'employees', 'products'}

In [29]:
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [33]:
load_dotenv()

True

In [34]:
embeddings = OpenAIEmbeddings()

In [35]:
#small code to make sure we are not making duplicates

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function= embeddings).delete_collection()

In [36]:
vectorstore = Chroma.from_documents(documents = chunks, embedding = embeddings, persist_directory = db_name)
print(vectorstore)

<langchain_chroma.vectorstores.Chroma object at 0x32368c390>


In [37]:
vectorstore._collection.count()

123

In [41]:
collection = vectorstore._collection

In [42]:
collection

Collection(name=langchain)

In [43]:
type(collection)

chromadb.api.models.Collection.Collection

In [44]:
result = collection.get(include = ['embeddings','documents','metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [45]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [55]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [56]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [74]:
llm = ChatOpenAI(temperature = 0.7, model = MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm = llm, retriever = retriever, memory = memory)

In [75]:
query = "What are some good to know details about InsureLLM?"
result = conversation_chain.invoke({"question": query})
#print(result['answer'])

In [78]:
#creating a method to incorporate the RAG that I just made above
def chat(message, history):
    results =  conversation_chain.invoke({"question": message})
    return results['answer']


In [79]:
view = gr.ChatInterface(chat, type = "messages").launch()

* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


In [None]:
hHi.Hi.HowCa