# Expert Knowledge Worker
### This project is a question-answering agent based on WhatsApp chat messages exported from a group chat

In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr


In [None]:
# imports from langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Configuration constants:
#   MODEL   - name of the (low-cost) OpenAI chat model passed to ChatOpenAI below
#   db_name - directory in which the Chroma vector store is persisted

MODEL = "gpt-5-nano"
db_name = "vector_db"

In [None]:
# Load variables from a local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail early with a clear message when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` round-trip raised an opaque
# "TypeError: str expected, not NoneType" in that case, and was a no-op
# self-assignment whenever the key was present.
if os.getenv('OPENAI_API_KEY') is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

In [None]:
# Preview which files will be picked up: only .txt files directly inside
# the knowledge-base folder (subfolders are not searched).

txt_pattern = "knowledge-base/*.txt"
files = glob.glob(txt_pattern)

print(files)

def add_metadata(doc, doc_type):
    """Tag a document with its type.

    Stores ``doc_type`` under the ``"doc_type"`` key of ``doc.metadata``
    (mutating ``doc`` in place) and returns the same ``doc`` object so the
    call can be used inside a comprehension.
    """
    doc.metadata.update(doc_type=doc_type)
    return doc

# Passed through to TextLoader so every file is read as UTF-8.
text_loader_kwargs = {'encoding': 'utf-8'}

# Load all .txt files from knowledge-base folder
doc_type = "knowledge-base"
loader = DirectoryLoader(
    "knowledge-base",
    glob="*.txt",  # Only .txt files in root folder, not subfolders
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
documents = loader.load()

# Add metadata to all documents
documents = [add_metadata(doc, doc_type) for doc in documents]

# CharacterTextSplitter splits on "\n\n" unless told otherwise. WhatsApp chat
# exports typically put one message per line with no blank lines in between,
# so with the default separator a whole chat file would rarely be split and
# chunks would end up far larger than chunk_size. Splitting on single newlines
# keeps messages intact while letting chunks respect the size limit.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Total number of documents: {len(documents)}")

In [None]:
# Embedding model used both to index the chunks and to embed queries.
embeddings = OpenAIEmbeddings()

# If a persisted store already exists on disk, drop its collection first so
# the store built below does not add to chunks from an earlier run.
if os.path.exists(db_name):
    Chroma(
        embedding_function=embeddings,
        persist_directory=db_name,
    ).delete_collection()

# Embed every chunk and persist the resulting vector store under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Create the chat model, the conversation memory, the retriever over the
# vector store, and the conversational retrieval (RAG) chain tying them together.

# GPT-5 reasoning models (i.e. not the "-chat" variants) only accept the
# default sampling temperature; sending 0.7 is either rejected by the API or
# silently dropped depending on the langchain-openai version. Only pass the
# custom temperature to models that support it.
# NOTE(review): confirm the model list against the current OpenAI docs.
llm_kwargs = {"model_name": MODEL}
model_id = MODEL.lower()
is_fixed_temperature_model = model_id.startswith("gpt-5") and "chat" not in model_id
if not is_fixed_temperature_model:
    llm_kwargs["temperature"] = 0.7
llm = ChatOpenAI(**llm_kwargs)

# Buffer memory exposed to the chain under the 'chat_history' key, returned
# as message objects rather than a single string.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [None]:
# Quick smoke test: ask the chain one question and print its answer.
query = "Who is mentioned a lot?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

In [None]:
# Rebuild the memory and the chain so the chat UI starts from an empty
# history instead of reusing the memory object created earlier.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
def chat(question, history):
    """Answer ``question`` with the module-level conversation chain.

    ``history`` is accepted to match the callback signature used by the chat
    UI but is not read here; the chain is constructed with its own memory
    object.

    Returns the value stored under the ``"answer"`` key of the chain result.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# Build the Gradio chat UI around chat() and launch it, opening a browser tab.
# `view` holds whatever launch() returns, not the ChatInterface itself.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)