In [4]:
import os
import glob
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [5]:
# Load variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from it.
try:
    load_dotenv()
    # Keep the spelling this notebook has always read, and also accept the
    # all-caps spelling OPENAI_API_KEY.
    api_key = os.getenv("OpenAI_API_KEY") or os.getenv("OPENAI_API_KEY")
except Exception as e:
    raise ValueError("Failed to load OpenAI API key. Please set the OpenAI_API_KEY environment variable.") from e

# os.getenv returns None for an unset variable instead of raising, so a missing
# (or empty) key must be detected explicitly to fail here with a clear message.
if not api_key:
    raise ValueError("Failed to load OpenAI API key. Please set the OpenAI_API_KEY environment variable.")

In [6]:
# Collect every markdown document under ../knowledgebase/<folder>/ into one flat
# list, tagging each document with the name of the folder it was loaded from.
content = []

# The wildcard also matches plain files sitting next to the folders; keep only
# directories. Sorting makes the load order the same on every run.
knowledge_folders = sorted(
    path for path in glob.glob("../knowledgebase/*") if os.path.isdir(path)
)

for folder in knowledge_folders:
    folder_name = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        # Read files as UTF-8 explicitly rather than with the platform default.
        loader_kwargs={"encoding": "utf-8"},
        show_progress=True,
    )
    documents = loader.load()
    for document in documents:
        document.metadata["doc_type"] = folder_name
    content.extend(documents)

# Fail here with a clear message rather than carrying on with nothing to index
# (the path is relative, so a wrong working directory ends up here).
if not content:
    raise ValueError(
        "No markdown documents found under ../knowledgebase/*/ - "
        "check the notebook's working directory."
    )

100%|██████████| 3/3 [00:00<00:00, 2048.00it/s]
100%|██████████| 3/3 [00:00<00:00, 1781.53it/s]
100%|██████████| 3/3 [00:00<00:00, 2988.11it/s]
100%|██████████| 4/4 [00:00<00:00, 2875.27it/s]
100%|██████████| 3/3 [00:00<00:00, 3788.89it/s]


In [7]:
# Splitter settings, named so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every loaded document into chunks; the cell displays how many resulted.
chunks = text_splitter.split_documents(content)
len(chunks)

Created a chunk of size 1072, which is longer than the specified 1000
Created a chunk of size 1312, which is longer than the specified 1000
Created a chunk of size 1417, which is longer than the specified 1000
Created a chunk of size 1280, which is longer than the specified 1000
Created a chunk of size 1693, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 1280, which is longer than the specified 1000
Created a chunk of size 1211, which is longer than the specified 1000
Created a chunk of size 1095, which is longer than the specified 1000
Created a chunk of size 1135, which is longer than the specified 1000


105

In [8]:
# Embedding client, authenticated with the key loaded earlier in the notebook.
embeddings = OpenAIEmbeddings(
    api_key=api_key,
)

# Name reserved for the vector database.
db_name = "vector_db"

In [9]:
# Embed every chunk and build the FAISS vector store from the results.
vectorstore = FAISS.from_documents(chunks, embeddings)

In [10]:
# Report the size of the index: how many vectors it holds and their dimension.
faiss_index = vectorstore.index
total_vectors = faiss_index.ntotal
dimension = faiss_index.d

print(f"Total vectors in the vector store: {total_vectors}")
print(f"Dimension of the vectors in the vector store: {dimension}")

Total vectors in the vector store: 105
Dimension of the vectors in the vector store: 1536


In [11]:
# Chat model used to generate answers.
MODEL = "gpt-3.5-turbo"

# Source of chunks for the chain: the vector store exposed as a retriever.
retriever = vectorstore.as_retriever()

# Conversation memory kept under the 'chat_history' key.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

llm = ChatOpenAI(
    api_key=api_key,
    temperature=0.7,
    model_name=MODEL,
)

# Tie the model, the retriever and the memory together into a single chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [12]:
def chat(message, history):
    """Answer one user message by running it through the conversation chain.

    Args:
        message: The user's latest message; sent to the chain as "question".
        history: Accepted as the second argument but not read by this function.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [13]:
# Build the chat UI around chat(), then start it; `view` holds whatever
# launch() returns.
chat_interface = gr.ChatInterface(fn=chat, type="messages")
view = chat_interface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
