In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from openai import OpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

if api_key:
    print('API key loaded')
else:
    print('API key loading issue')

MODEL = "gpt-4o-mini"
db_name = "vector_db"
openai = OpenAI()

API key loaded


In [3]:
folders = glob.glob("knowledge-base/*")

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"})
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [4]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1088, which is longer than the specified 1000


In [5]:
embeddings = OpenAIEmbeddings()

In [6]:
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [7]:
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

In [29]:
streaming_handler = StreamingStdOutCallbackHandler()

llm = ChatOpenAI(temperature=0.7, model_name=MODEL, streaming=True, callbacks=[streaming_handler])

memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")

retriever = vectorstore.as_retriever()

conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [30]:
query = "Can you describe Insurellm in a few sentences"
result = conversation_chain.invoke({"question":query})
print(result['answer'])

Insurellm is an innovative insurance tech firm founded in 2015 by Avery Lancaster, designed to disrupt the insurance industry with its technology-driven products. The company has developed four key software products: Carllm for auto insurance, Homellm for home insurance, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. With 200 employees and over 300 clients worldwide, Insurellm is committed to transforming the insurance landscape through innovation and reliability.Insurellm is an innovative insurance tech firm founded in 2015 by Avery Lancaster, designed to disrupt the insurance industry with its technology-driven products. The company has developed four key software products: Carllm for auto insurance, Homellm for home insurance, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. With 200 employees and over 300 clients worldwide, Insurellm is committed to transformi