# Expert Knowledge Worker
- A question-answering agent that acts as an expert knowledge worker.
- The agent needs to be accurate, and the solution should be low cost.
- This project uses RAG (Retrieval-Augmented Generation) to ensure our question-answering assistant has high accuracy.

# Import Libraries and Environment Set Up

In [None]:
# Import necessary libraries
import os  # OS module for file and directory operations
import glob  # Glob module for pattern matching file paths
from dotenv import load_dotenv  # Load environment variables from a .env file
import gradio as gr  # Gradio for building interactive UI components

# Imports for LangChain and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader  # Load documents from directories or text files
from langchain.text_splitter import CharacterTextSplitter  # Split text into smaller chunks for processing
from langchain.schema import Document  # Define the document structure

# Import OpenAI and Chroma components for embeddings and chat models
from langchain_openai import OpenAIEmbeddings, ChatOpenAI  # OpenAI-based embeddings and chat models
from langchain_chroma import Chroma  # Chroma for vector storage and retrieval

# Import memory and retrieval chain for conversation management
from langchain.memory import ConversationBufferMemory  # Store conversation history
from langchain.chains import ConversationalRetrievalChain  # Chain for conversational retrieval
from langchain.embeddings import HuggingFaceEmbeddings  # Use HuggingFace embeddings for text representation

# Project settings: the low-cost OpenAI chat model to use and the name of the
# vector database
MODEL = "gpt-4o-mini"
db_name = "vector_db"

# Pull variables from a .env file into the process environment
load_dotenv()

# Make sure OPENAI_API_KEY is present in the environment: a value that is
# already set is kept; otherwise the placeholder below is stored (replace it
# with a real key if you are not using a .env file)
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

# Split Documents into Smaller Chunks

In [None]:
# Read in documents using LangChain's loaders
# Take every sub-folder of our knowledge base; each one becomes a document type.
# Plain files sitting directly in knowledge-base/ (a README, .DS_Store, ...) are
# skipped: the loading loop hands each entry to DirectoryLoader, which expects a
# directory path.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]

def add_metadata(doc, doc_type):
    """Tag a document with its type and return the same document.

    The type is stored under the ``"doc_type"`` key of ``doc.metadata``,
    overwriting any value already stored there. ``doc`` is modified in place.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# Loader options passed to every TextLoader: enable encoding auto-detection
text_loader_kwargs = {'autodetect_encoding': True}

# Collect every markdown file under each knowledge-base folder, tagging each
# loaded document with the name of the folder it came from
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)  # folder name doubles as the document type
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    for folder_doc in folder_docs:
        documents.append(add_metadata(folder_doc, doc_type))

# Cut the documents into overlapping chunks (size 1000, overlap 200)
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and which document types were seen
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {{doc.metadata['doc_type'] for doc in documents}}")


# Embedding

In [None]:
# Embed every chunk and keep the vectors in a Chroma vector database
# (Chroma is a popular open-source vector database based on SQLite)

embeddings = OpenAIEmbeddings()  # embeddings come from OpenAI

# If you prefer free vector embeddings from HuggingFace sentence-transformers,
# replace OpenAIEmbeddings() with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start from a clean slate: if a database directory is already on disk,
# open it and drop its collection before rebuilding
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

# Build the vector store from the chunks, persisting it under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# Report how many entries the store's collection now holds
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

- Alternatively, switch out Chroma for FAISS (if desired).
- FAISS is another efficient vector database.
- `from langchain.vectorstores import FAISS`
- `vectorstore = FAISS.from_documents(chunks, embedding=embeddings)`

In [None]:
# Inspect the stored vectors: how many there are and how long each one is

collection = vectorstore._collection
count = collection.count()

# Fetch a single record together with its embedding; the length of that
# embedding is reported as the number of dimensions
sample_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

# RAG Using LangChain 

In [None]:

# Create the chat model (MODEL is the low-cost OpenAI model selected during setup)
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# Conversation memory for the chat; the chain reads and writes it under 'chat_history'
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# The retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# Putting it together: the conversation chain combines the LLM, the retriever
# and the memory. The memory and chain are built exactly once here.
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

Now we will bring this up in Gradio using the Chat interface,
a quick and easy way to prototype a chat with an LLM.

# Wrapping that in a function

def chat(question, history):
    """Send ``question`` through the conversation chain and return its answer.

    ``history`` is part of the signature but is not used in this function;
    only ``question`` is forwarded to ``conversation_chain``.
    """
    return conversation_chain.invoke({"question": question})["answer"]
# Put it in Gradio: build the chat UI around chat() and launch it
# (inbrowser=True is passed to launch); `view` holds whatever launch() returns

chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

# Debugging and investigating incorrect answers

In [None]:
from langchain_core.callbacks import StdOutCallbackHandler

# Rebuild the model, memory, retriever and chain, this time attaching a
# StdOutCallbackHandler to the chain for use while debugging
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
retriever = vectorstore.as_retriever()

debug_callbacks = [StdOutCallbackHandler()]
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    callbacks=debug_callbacks,
)

# Run one test question through the chain and print the answer
query = "Who received the NFL MVP award in 2019?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)

# Factors that can improve RAG performance:
# - Increase or decrease the chunk size
# - Increase or decrease the chunk overlap
# - Increase the number of relevant contexts retrieved

# Adjusting the number of retrieved chunks (k) for better performance
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

# Start the chat with an empty memory. The `memory` object used for the
# debugging query above already contains that question and its answer, and
# reusing it would leak that exchange into every conversation in the UI.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

# Chat callback handed to the Gradio UI
def chat(question, history):
    """Return the conversation chain's answer to ``question``.

    ``history`` is accepted as the second argument but is not used in this
    function; only ``question`` is passed to ``conversation_chain``.
    """
    response = conversation_chain.invoke({"question": question})
    reply = response["answer"]
    return reply

# Deploy with Gradio: wrap chat() in a chat UI and launch it
# (inbrowser=True is passed to launch); `view` holds whatever launch() returns
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)