In [1]:
import os
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.readers.web import SimpleWebPageReader

In [4]:
import dotenv
dotenv.load_dotenv(override=True)

True

In [5]:
# Configure global settings
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-5-nano", temperature=0)
Settings.node_parser = SentenceSplitter(chunk_size=1000, chunk_overlap=100)


In [6]:
# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection("my_documents")

In [7]:
# Create vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [8]:
# Create or load index
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
)

In [None]:
## Functions to add documents, directories, etc

def add_pdf(file_path: str):
    """Add a PDF file to the index."""
    from llama_index.readers.file import PDFReader
    
    reader = PDFReader()
    docs = reader.load_data(file_path)
    
    for doc in docs:
        doc.metadata["source"] = file_path
    
    index.insert_nodes(Settings.node_parser.get_nodes_from_documents(docs))
    print(f"Added {len(docs)} pages from {file_path}")


def add_url(url: str):
    """Add a web page to the index."""
    reader = SimpleWebPageReader(html_to_text=True)
    docs = reader.load_data([url])
    
    for doc in docs:
        doc.metadata["source"] = url
    
    index.insert_nodes(Settings.node_parser.get_nodes_from_documents(docs))
    print(f"Added content from {url}")

def add_urls(urls: list[str]):
    """Add multiple web pages to the index."""
    reader = SimpleWebPageReader(html_to_text=True)
    docs = reader.load_data(urls)
    
    for doc, url in zip(docs, urls):
        doc.metadata["source"] = url
    
    index.insert_nodes(Settings.node_parser.get_nodes_from_documents(docs))
    print(f"Added content from {len(urls)} URLs")


def add_text(text: str, source: str = "manual"):
    """Add raw text to the index."""
    doc = Document(text=text, metadata={"source": source})
    index.insert_nodes(Settings.node_parser.get_nodes_from_documents([doc]))
    print(f"Added text from {source}")


def add_directory(directory_path: str):
    """Add all supported files from a directory."""
    reader = SimpleDirectoryReader(directory_path)
    docs = reader.load_data()
    index.insert_nodes(Settings.node_parser.get_nodes_from_documents(docs))
    print(f"Added {len(docs)} documents from {directory_path}")

In [14]:
def get_stats():
    """Get collection statistics."""
    return {
        "total_chunks": chroma_collection.count()
    }


In [21]:
def reset():
    """Clear all documents from the index."""
    # Delete and recreate collection
    chroma_client.delete_collection("my_documents")
    global chroma_collection, vector_store, storage_context, index, query_engine, chat_engine
    
    chroma_collection = chroma_client.create_collection("my_documents")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        storage_context=storage_context,
    )
    query_engine = index.as_query_engine(similarity_top_k=5)
    chat_engine = index.as_chat_engine(chat_mode="condense_question")
    print("Index cleared and reset.")

In [16]:
%pwd

'/Users/ericmurray/repos/ai-agents-crash-course/notebooks_eric'

In [27]:
add_directory('./data/')

2025-12-06 11:02:58,799 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Added 5 documents from ./data/


In [28]:
# reset()

In [29]:
get_stats()

{'total_chunks': 5}

In [34]:
def search(query: str, top_k: int = 5):
    """Search for similar documents."""
    retriever = index.as_retriever(similarity_top_k=top_k)
    nodes = retriever.retrieve(query)
    
    print(f"Query: {query}\n")
    for i, node in enumerate(nodes):
        print(f"--- Result {i+1} (score: {node.score:.4f}) ---")
        print(f"Source: {node.metadata.get('source', 'Unknown')}")
        print(f"Preview: {node.text[:]}...\n")
    
    return nodes

In [35]:
search("where is Eric go to college?", 1)

2025-12-06 11:05:40,658 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Query: where is Eric go to college?

--- Result 1 (score: 0.2748) ---
Source: Unknown
Preview: 2004 - October 2012 (8 years)
Chantilly, Virginia
Responsible for expanding business with existing customers and contracts.
Managed all other government agency contracts, Technical Lead  for satellite
software  development projects, and lead for company IRAD/internal projects.  
AGI
Systems Engineer
December 2003 - December 2004 (1 year 1 month)
Pre and post sales responsibility for various government agencies. 
BAE Systems
Modeling and Simualtions Engineer
December 2002 - December 2003 (1 year 1 month)
Modeling and Simulation development and projects using discrete event
simulators, STK and MS Office tools. 
Adroit
Engineer
2000 - 2002 (2 years)
Technical lead on R&D project supporting Air Force collection and exploitation
systems
Education
University of Virginia
BS, Mechanical Engineering · (1996 - 2000)
North Carolina State University
Masters of Engineering  · (2002 - 2005)
  Page 3 of 3..

[NodeWithScore(node=TextNode(id_='b99ce0a3-baa2-46a1-835c-18a16620ade7', embedding=None, metadata={'page_label': '3', 'file_name': 'linkedin profile.pdf', 'file_path': '/Users/ericmurray/repos/ai-agents-crash-course/notebooks_eric/data/linkedin profile.pdf', 'file_type': 'application/pdf', 'file_size': 58940, 'creation_date': '2025-08-19', 'last_modified_date': '2025-08-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d20398df-6132-41c8-9428-178af3efa65e', node_type='4', metadata={'page_label': '3', 'file_name': 'linkedin profile.pdf', 'file_path': '/Users/ericmurray/repos/ai-agents-crash-course/notebooks_eric/data/linkedin profile.pdf', 'file_type': 'application/pdf', 'file_size': 58940, 'creation_dat