In [None]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [None]:
# imports for langchain and Chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import AzureChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Pull settings from a .env file into the process environment.
load_dotenv()

# Notebook-wide configuration.
db_name = "vector_db"
MODEL = 'gpt-4'
API_KEY = os.getenv("OPENAI_API_KEY")
API_ENDPOINT = os.getenv("end_point_api")

# Best-effort sanity check: warn (without stopping) when either value is missing or empty.
if not (API_KEY and API_ENDPOINT):
    print("something wrong with envs")
    

In [None]:
# Every sub-folder of knowledge-base/ is one document category. Plain files that
# sit directly under knowledge-base/ are skipped, because DirectoryLoader is
# given each entry as the directory to scan.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]
encytp = {"encoding": "utf-8"}

documents = []

for folder in folders:
    # os.path.basename copes with the platform's separator; splitting on '/'
    # returned the whole path as the doctype where glob yields backslashes.
    doctype = os.path.basename(os.path.normpath(folder))
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=encytp)
    folder_docs = loader.load()
    for doc in folder_docs:
        # Tag each document with its category so later cells can group/colour by it.
        doc.metadata['doctype'] = doctype
        documents.append(doc)



In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents=documents)

# Show how many chunks were produced instead of dumping every chunk into the
# cell output; inspect individual items with e.g. chunks[:3] when needed.
len(chunks)

In [None]:
# Distinct doctype labels present across all chunks.
print({chunk.metadata['doctype'] for chunk in chunks})


In [None]:
# Embedding model for the vector store: a Hugging Face sentence-transformers
# model is used here; OpenAIEmbeddings() was the earlier, now disabled, choice.
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [None]:
# Reset any previously persisted store so re-running the notebook starts clean.
# The directory must be passed as `persist_directory`: Chroma's first positional
# parameter is `collection_name`, so `Chroma(db_name, ...)` opened a collection
# *named* "vector_db" and never touched the data stored in that directory.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [None]:
import os

# Create the vector-store directory if it is not there yet (no error when it already exists).
os.makedirs(db_name, exist_ok=True)

# Stop early when the process has no write access to that directory.
db_dir_is_writable = os.access(db_name, os.W_OK)
if not db_dir_is_writable:
    raise PermissionError(f"The directory {db_name} is not writable.")

In [None]:
# Installs/upgrades chromadb to the pinned version 0.4.14 using the %pip magic.
# NOTE(review): langchain_chroma is imported in an earlier cell, so a kernel
# restart is presumably needed before a changed chromadb version takes effect — confirm.
# NOTE(review): environment setup like this usually belongs in the first cell — consider moving it.
%pip install --upgrade chromadb==0.4.14

In [None]:
# Build the vector store from the chunks and the embedding model.
# FAISS is the active back end. The earlier Chroma variant was
#   Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# and is left here only as a reminder.
from langchain.vectorstores import FAISS

vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)


In [None]:
# Look at one stored vector to learn the embedding dimensionality.
# This cell used to read `vectorstore._collection` unconditionally, which only
# exists on a Chroma store; the store built in this notebook is FAISS, so the
# cell raised AttributeError. Both back ends are handled here.
db = getattr(vectorstore, "_collection", None)

if db is not None:
    # Chroma: fetch a single record, embeddings only, from the underlying collection.
    sample_embedding = db.get(limit=1, include=["embeddings"])["embeddings"][0]
else:
    # FAISS: rebuild the first stored vector from the index.
    sample_embedding = vectorstore.index.reconstruct(0)

len(sample_embedding)

In [None]:
import pandas as pd

# Collect every stored vector together with its text and metadata.
# A Chroma store exposes them through `_collection.get(...)`; a FAISS store does
# not have `_collection`, so the same three lists are rebuilt from its index and
# docstore. Either way `vctodatabase` ends up with the same keys.
collection = getattr(vectorstore, "_collection", None)
if collection is not None:
    vctodatabase = collection.get(include=["embeddings", "documents", "metadatas"])
else:
    faiss_index = vectorstore.index
    stored_docs = [
        vectorstore.docstore.search(vectorstore.index_to_docstore_id[i])
        for i in range(faiss_index.ntotal)
    ]
    vctodatabase = {
        "embeddings": [faiss_index.reconstruct(i) for i in range(faiss_index.ntotal)],
        "documents": [stored.page_content for stored in stored_docs],
        "metadatas": [stored.metadata for stored in stored_docs],
    }

array = np.array(vctodatabase["embeddings"])
s = pd.DataFrame(array)

document = vctodatabase['documents']
doc_types = [metadata['doctype'] for metadata in vctodatabase['metadatas']]

# One colour per doctype. A dict lookup replaces list.index(), so an unexpected
# doctype is drawn in gray instead of raising ValueError.
doctype_colors = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [doctype_colors.get(t, 'gray') for t in doc_types]

# Preview only the first few entries rather than the whole list.
colors[:10]


In [None]:
# Reduce the embeddings to two t-SNE components for a flat scatter plot.
tsne = TSNE(n_components=2, random_state=1337)
reduced_vectors = tsne.fit_transform(array)

# One marker per chunk, coloured by doctype; hovering shows the type and the
# first 10 characters of the chunk text.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:10]}..." for t, d in zip(doc_types, document)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` configures 3D plots only, so the axis titles set there were never
    # shown on this 2D figure; label the cartesian axes directly.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()


In [None]:
# Let's try 3D!

# Same embeddings, this time reduced to three t-SNE components.
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(array)

# Hover label per point: doctype plus the first 100 characters of the chunk text.
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, document)]

# Create the 3D scatter plot
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

# Title, axis labels, figure size and margins.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import AzureChatOpenAI
from langchain_core.callbacks import StdOutCallbackHandler

# Chat model, pointed at the Azure endpoint read from the environment earlier.
llm = AzureChatOpenAI(
    model=MODEL,
    temperature=0.7,
    max_tokens=None,
    azure_endpoint=API_ENDPOINT,
    api_version="2023-06-01-preview",
)

# Conversation memory, registered under the 'chat_history' key.
history_memory = ConversationBufferMemory(
    return_messages=True,
    memory_key='chat_history',
)

# Retriever view of the vector store; k=25 is passed as its search setting.
vector_db_retreive = vectorstore.as_retriever(search_kwargs={'k': 25})

# Tie model, retriever and memory together; the callback handler writes to stdout.
conversation = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vector_db_retreive,
    memory=history_memory,
    callbacks=[StdOutCallbackHandler()],
)


In [None]:
def chat(mesg, history):
    """Send one user message through the conversation chain and return its answer.

    Args:
        mesg: The user's message; forwarded to the chain as the "question".
        history: Accepted for the chat-callback signature but not read here.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    payload = {"question": mesg}
    response = conversation.invoke(payload)
    # The full chain result is echoed to stdout before the answer is returned.
    print(response)
    return response["answer"]

In [None]:
# Wrap the chat function in a Gradio chat UI and start it.
chat_ui = gr.ChatInterface(fn=chat, type="messages")
chat_ui.launch()