Summary:
This is the project from week 5. The intention was to create a vector db of my own files (from an external drive) which can be used in a RAG solution.
This includes a number of file types (docx, pdf, txt, epub...) and includes the ability to exclude folders.
With the OpenAI embeddings API limit of 300k tokens, it was also necessary to create a batch embeddings process so that there were multiple requests.
This was based on estimating tokens at a rate of roughly 4 characters per token; however, the estimate wasn't perfect and one of the batches still exceeded the 300k limit when running.
I found that the responses from the LLM were terrible in the end!  I tried playing about with chunk sizes and the minimum number of chunks used by LangChain, and it did improve but was not fantastic.  I also ensured the metadata was sent with each chunk to help.
This really highlighted the real world challenges of implementing RAG!

In [None]:
!pip install docx2txt
!pip install ebooklib
!pip install python-pptx
!pip install pypdf

In [None]:
# imports

import os
import requests
from dotenv import load_dotenv
import glob
import gradio as gr
import time
from typing import List

In [None]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import (
    DirectoryLoader,
    Docx2txtLoader,
    TextLoader,
    PyPDFLoader,
    UnstructuredExcelLoader,
    BSHTMLLoader
)
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [None]:
# Load environment variables in a file called .env

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [None]:
# handling epubs

from ebooklib import epub
from bs4 import BeautifulSoup
from langchain.document_loaders.base import BaseLoader

class EpubLoader(BaseLoader):
    """Load an EPUB file as a single LangChain ``Document``.

    The text of every HTML content item in the book is extracted with
    BeautifulSoup and concatenated, each item followed by a newline.
    """

    def __init__(self, file_path: str):
        """Remember the path of the EPUB file to read.

        Args:
            file_path: Path of the ``.epub`` file.
        """
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Read the book and return its text as a one-element document list.

        Returns:
            A list holding one ``Document`` whose ``page_content`` is the
            concatenated text and whose metadata stores the file path under
            ``"source"``.
        """
        book = epub.read_epub(self.file_path)
        # ``item.get_type()`` yields one of ebooklib's integer ITEM_* codes, so
        # comparing it with the ``EpubHtml`` class never matched and the text
        # always came back empty. Select the HTML items by class instead.
        sections = [
            BeautifulSoup(item.get_content(), 'html.parser').get_text() + '\n'
            for item in book.get_items()
            if isinstance(item, epub.EpubHtml)
        ]

        return [Document(page_content=''.join(sections), metadata={"source": self.file_path})]

In [None]:
# handling pptx

from pptx import Presentation

class PptxLoader(BaseLoader):
    """Load a PowerPoint file as a single LangChain ``Document``."""

    def __init__(self, file_path: str):
        """Store the location of the ``.pptx`` file to read."""
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Gather the text of every shape on every slide into one document.

        Returns:
            A one-element list; the document's ``page_content`` holds each
            non-empty shape text followed by a newline, and its metadata
            stores the file path under ``"source"``.
        """
        deck = Presentation(self.file_path)
        fragments = []
        for slide in deck.slides:
            for shape in slide.shapes:
                # Shapes without a ``text`` attribute, or with empty text, are skipped.
                content = getattr(shape, "text", None)
                if content:
                    fragments.append(f"{content}\n")

        combined = ''.join(fragments)
        return [Document(page_content=combined, metadata={"source": self.file_path})]

In [None]:
# Class based version of document loader which can be expanded more easily for other document types.  (Currently includes file types: docx, txt (windows encoding), xlsx, pdfs, epubs, pptx)

class DocumentLoader:
    """Collect documents of several file types from the folders under a path.

    Every supported extension has an entry in ``loader_config`` naming the
    loader class, the glob pattern, extra loader keyword arguments and an
    optional per-document hook, so further formats can be registered without
    changing the loading logic.
    """

    def __init__(self, base_path="D:/*", exclude_folders=None):
        """Set up the scan location and the built-in file-type registry.

        Args:
            base_path: Glob pattern; its directory matches are the folders scanned.
            exclude_folders: Folder basenames to leave out (no exclusions when omitted).
        """
        self.base_path = base_path
        self.exclude_folders = exclude_folders or []
        self.documents = []

        # (extension, loader class, loader kwargs, post-process hook)
        builtin_types = (
            ('docx', Docx2txtLoader, {}, None),
            ('txt', TextLoader, {"encoding": "cp1252"}, None),
            ('pdf', PyPDFLoader, {}, None),
            ('xlsx', UnstructuredExcelLoader, {}, None),
            ('html', BSHTMLLoader, {}, None),
            ('epub', EpubLoader, {}, self._process_epub_metadata),
            ('pptx', PptxLoader, {}, None),
        )
        self.loader_config = {
            ext: {
                'loader_cls': loader_cls,
                'glob_pattern': f"**/*.{ext}",
                'loader_kwargs': kwargs,
                'post_process': hook,
            }
            for ext, loader_cls, kwargs, hook in builtin_types
        }

    def _get_epub_metadata(self, file_path):
        """Return ``(title, author)`` read from an EPUB's Dublin Core metadata.

        Either value is ``None`` when the field is absent; any error while
        reading is printed and reported as ``(None, None)``.
        """
        try:
            book = epub.read_epub(file_path)
            titles = book.get_metadata('DC', 'title')
            title = titles[0][0] if titles else None
            creators = book.get_metadata('DC', 'creator')
            author = creators[0][0] if creators else None
        except Exception as e:
            print(f"Error extracting EPUB metadata: {e}")
            return None, None
        return title, author

    def _process_epub_metadata(self, doc) -> None:
        """Attach the EPUB's author and title to ``doc.metadata`` in place."""
        title, author = self._get_epub_metadata(doc.metadata['source'])
        doc.metadata.update(author=author, title=title)

    def _load_file_type(self, folder, file_type, config):
        """Load every document of one file type found under ``folder``.

        Args:
            folder: Directory handed to ``DirectoryLoader``.
            file_type: Extension label, used only in progress messages.
            config: The ``loader_config`` entry for this file type.

        Returns:
            The loaded documents, or an empty list if anything raised.
        """
        try:
            docs = DirectoryLoader(
                folder,
                glob=config['glob_pattern'],
                loader_cls=config['loader_cls'],
                loader_kwargs=config['loader_kwargs'],
            ).load()
            print(f"  Found {len(docs)} .{file_type} files")

            # Run the optional per-document hook (e.g. EPUB metadata enrichment)
            hook = config['post_process']
            if hook:
                for doc in docs:
                    hook(doc)

            return docs

        except Exception as e:
            print(f"  Error loading .{file_type} files: {e}")
            return []

    def load_all(self):
        """Scan the non-excluded folders and load every configured file type.

        Each document is tagged with ``doc_type`` set to the basename of the
        folder it was found under. The result is also kept on ``self.documents``.

        Returns:
            The list of all loaded documents.
        """
        candidate_dirs = [path for path in glob.glob(self.base_path) if os.path.isdir(path)]

        # Drop the folders whose basename is on the exclusion list
        folders = []
        for path in candidate_dirs:
            name = os.path.basename(path)
            if name in self.exclude_folders:
                print(f"Excluded folder: {name}")
                continue
            folders.append(path)

        print("Scanning folders (directories only):", folders)

        self.documents = []

        for folder in folders:
            doc_type = os.path.basename(folder)
            print(f"\nProcessing folder: {doc_type}")

            for file_type, config in self.loader_config.items():
                loaded = self._load_file_type(folder, file_type, config)

                # Record which top-level folder each document came from
                for doc in loaded:
                    doc.metadata["doc_type"] = doc_type
                self.documents.extend(loaded)

        print(f"\nTotal documents loaded: {len(self.documents)}")
        return self.documents

    def add_file_type(self, extension, loader_cls, glob_pattern=None,
                     loader_kwargs=None, post_process=None):
        """Register (or replace) the loader configuration for an extension.

        Args:
            extension: Key in ``loader_config``; also used for the default glob.
            loader_cls: Loader class to instantiate per file.
            glob_pattern: Pattern to match; defaults to ``**/*.<extension>``.
            loader_kwargs: Extra keyword arguments for the loader; defaults to none.
            post_process: Optional callable applied to each loaded document.
        """
        if not glob_pattern:
            glob_pattern = f"**/*.{extension}"
        if not loader_kwargs:
            loader_kwargs = {}

        self.loader_config[extension] = {
            'loader_cls': loader_cls,
            'glob_pattern': glob_pattern,
            'loader_kwargs': loader_kwargs,
            'post_process': post_process,
        }

# Scan the drive's top-level folders, skipping the ones that should not be indexed
loader = DocumentLoader(
    base_path="D:/*",
    exclude_folders=["Music", "Online Courses", "Fitness"],
)
documents = loader.load_all()

In [None]:
# create batches (this was required because the number of tokens exceeded the OpenAI request limit)

def estimate_tokens(text, chars_per_token=4):
    """Approximate the token count of ``text`` from its length.

    Args:
        text: The string to measure.
        chars_per_token: Assumed number of characters per token.

    Returns:
        The character count floor-divided by ``chars_per_token``.
    """
    char_count = len(text)
    return char_count // chars_per_token

def create_batches(chunks, max_tokens_per_batch=250000):
    """Group chunks, in order, into batches that stay within a token budget.

    A chunk is added to the open batch unless doing so would push the
    batch's estimated token total above ``max_tokens_per_batch``; in that
    case the open batch is closed and the chunk starts the next one. A chunk
    that is larger than the budget on its own still gets a batch to itself.

    Args:
        chunks: Objects exposing a ``page_content`` string.
        max_tokens_per_batch: Estimated-token budget per batch.

    Returns:
        A list of batches, each a list of chunks.
    """
    grouped = []
    pending = []
    pending_tokens = 0

    for item in chunks:
        cost = estimate_tokens(item.page_content)
        fits = pending_tokens + cost <= max_tokens_per_batch

        # An empty batch always accepts the chunk, even an oversized one
        if fits or not pending:
            pending.append(item)
            pending_tokens += cost
            continue

        grouped.append(pending)
        pending, pending_tokens = [item], cost

    # Flush whatever is left in the open batch
    if pending:
        grouped.append(pending)

    return grouped

def create_vectorstore_with_progress(chunks, embeddings, db_name, batch_size_tokens=250000):
    """Embed ``chunks`` into a Chroma store batch by batch, printing progress.

    Any existing collection at ``db_name`` is deleted first. The chunks are
    grouped with ``create_batches`` and each batch is embedded in its own
    call; a batch that raises is reported and skipped so the remaining
    batches are still processed.

    Args:
        chunks: Documents to embed.
        embeddings: Embedding function handed to Chroma.
        db_name: Directory in which the Chroma database is persisted.
        batch_size_tokens: Estimated-token budget per batch.

    Returns:
        The populated Chroma vectorstore, or ``None`` if no batch succeeded.
    """
    # Delete existing database if it exists
    if os.path.exists(db_name):
        print(f"Deleting existing database: {db_name}")
        Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

    # Create batches
    batches = create_batches(chunks, batch_size_tokens)
    print(f"Created {len(batches)} batches from {len(chunks)} chunks")

    # Show batch sizes, summing the same per-chunk estimates the batching used
    # rather than concatenating a whole batch into one string to measure it.
    for i, batch in enumerate(batches):
        estimated_tokens = sum(estimate_tokens(chunk.page_content) for chunk in batch)
        print(f"  Batch {i+1}: {len(batch)} chunks, ~{estimated_tokens:,} tokens")

    vectorstore = None
    successful_batches = 0
    failed_batches = 0

    for i, batch in enumerate(batches):
        print(f"\n{'='*50}")
        print(f"Processing batch {i+1}/{len(batches)}")
        print(f"{'='*50}")

        try:
            start_time = time.time()

            if vectorstore is None:
                # No store yet (first batch, or every earlier batch failed): create it
                vectorstore = Chroma.from_documents(
                    documents=batch,
                    embedding=embeddings,
                    persist_directory=db_name
                )
                print(f"Created initial vectorstore with {len(batch)} documents")
            else:
                # Add to existing vectorstore
                vectorstore.add_documents(batch)
                print(f"Added {len(batch)} documents to vectorstore")

            successful_batches += 1
            elapsed = time.time() - start_time
            print(f"Processed in {elapsed:.1f} seconds")
            print(f"Total documents in vectorstore: {vectorstore._collection.count()}")

            # Rate limiting delay
            time.sleep(2)

        except Exception as e:
            # Best effort: report the failure and carry on with the next batch
            failed_batches += 1
            print(f"Error processing batch {i+1}: {e}")
            print("Continuing with next batch...")
            continue

    print(f"\n{'='*50}")
    print("SUMMARY")
    print(f"{'='*50}")
    print(f"Successful batches: {successful_batches}/{len(batches)}")
    print(f"Failed batches: {failed_batches}/{len(batches)}")

    # Compare against None explicitly instead of relying on object truthiness
    if vectorstore is not None:
        final_count = vectorstore._collection.count()
        print(f"Final vectorstore contains: {final_count} documents")
        return vectorstore

    print("Failed to create vectorstore")
    return None

# include metadata
def add_metadata_to_content(doc: Document) -> Document:
    """Return a copy of ``doc`` whose text starts with a readable metadata header.

    The header lists the document type, title and author, one per line, for
    whichever of those metadata fields hold a value, followed by a blank
    line. Fields that are missing, ``None`` or empty are left out, so no
    "Title: None" lines are written. When no field has a value the text is
    returned without any header.

    Args:
        doc: Document whose ``metadata`` may contain ``doc_type``, ``title``
            and ``author``.

    Returns:
        A new ``Document`` with the (possibly prefixed) text and the same
        metadata mapping as ``doc``.
    """
    labelled_fields = (
        ("Document Type", "doc_type"),
        ("Title", "title"),
        ("Author", "author"),
    )
    metadata_lines = [
        f"{label}: {doc.metadata[key]}"
        for label, key in labelled_fields
        if doc.metadata.get(key) not in (None, "")
    ]
    metadata_text = "\n".join(metadata_lines)

    # Only add the header (and its separating blank line) when there is one
    new_content = f"{metadata_text}\n\n{doc.page_content}" if metadata_text else doc.page_content
    return Document(page_content=new_content, metadata=doc.metadata)

# Chunking
# The recursive splitter falls back to progressively finer separators, so
# chunks respect chunk_size even when a document has no blank lines.
# CharacterTextSplitter only splits on "\n\n" and can emit chunks far larger
# than chunk_size, which is enough to push a batch past the embeddings limit.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
raw_chunks = text_splitter.split_documents(documents)

# Prefix the metadata header onto every chunk. Adding it to whole documents
# before splitting leaves it only in the first chunk of each document.
chunks = [add_metadata_to_content(chunk) for chunk in raw_chunks]

# Embedding
embeddings = OpenAIEmbeddings()

# Store in vector DB
print("Creating vectorstore in batches...")
vectorstore = create_vectorstore_with_progress(
    chunks=chunks,
    embeddings=embeddings,
    db_name=db_name,
    batch_size_tokens=250000
)

if vectorstore is not None:
    print(f"Successfully created vectorstore with {vectorstore._collection.count()} documents")
else:
    print("Failed to create vectorstore")

In [None]:
# Chat model used to answer questions (the model name comes from MODEL)
llm = ChatOpenAI(
    temperature=0.7,
    model_name=MODEL,
)

# Conversation memory: keeps the running chat history as message objects
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

# Retriever over the vector store; fetches 200 chunks per query
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 200},
)

# Tie the chat model, retriever and memory together into one conversational RAG chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Wrapping that in a function

def chat(question, history):
    """Answer ``question`` through the conversation chain.

    ``history`` is accepted as part of the signature but is not read here.

    Returns:
        The ``"answer"`` entry of the chain's result.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [None]:
# And in Gradio:

# Build the chat UI around chat() and open it in the browser
view = gr.ChatInterface(
    fn=chat,
    type="messages",
).launch(inbrowser=True)

In [None]:
# Let's investigate what gets sent behind the scenes

# from langchain_core.callbacks import StdOutCallbackHandler

# llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# retriever = vectorstore.as_retriever(search_kwargs={"k": 200})

# conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

# query = "Can you name some authors?"
# result = conversation_chain.invoke({"question": query})
# answer = result["answer"]
# print("\nAnswer:", answer)