In [1]:
# # ChromaDB Querying Guide

# This notebook demonstrates how to query ChromaDB for Disney customer reviews using the VectorStoreManager.

# ## Overview

# ChromaDB is a vector database that stores embeddings of Disney reviews, allowing for semantic search and similarity queries. The VectorStoreManager provides several methods for querying and managing the collection:

# - `get_relevant_context()` - Find relevant reviews for a query
# - `search_similar()` - Search for similar documents
# - `get_collection_stats()` - Get database statistics
# - `add_documents()` - Add new documents to the database

In [5]:
!uv pip install chromadb

[2mUsing Python 3.11.13 environment at: /app/.venv[0m
[2K[2mResolved [1m85 packages[0m [2min 4.89s[0m[0m                                        [0m
[2K[2mPrepared [1m45 packages[0m [2min 23.22s[0m[0m                                           
[2K[2mInstalled [1m52 packages[0m [2min 44ms[0m[0m                               [0m
 [32m+[39m [1mbackoff[0m[2m==2.2.1[0m
 [32m+[39m [1mbcrypt[0m[2m==4.3.0[0m
 [32m+[39m [1mbuild[0m[2m==1.3.0[0m
 [32m+[39m [1mcachetools[0m[2m==5.5.2[0m
 [32m+[39m [1mchromadb[0m[2m==1.1.0[0m
 [32m+[39m [1mclick[0m[2m==8.3.0[0m
 [32m+[39m [1mcoloredlogs[0m[2m==15.0.1[0m
 [32m+[39m [1mdurationpy[0m[2m==0.10[0m
 [32m+[39m [1mflatbuffers[0m[2m==25.2.10[0m
 [32m+[39m [1mgoogle-auth[0m[2m==2.40.3[0m
 [32m+[39m [1mgoogleapis-common-protos[0m[2m==1.70.0[0m
 [32m+[39m [1mgrpcio[0m[2m==1.75.0[0m
 [32m+[39m [1mhttptools[0m[2m==0.6.4[0m
 [32m+[39m [1mhumanfriendly[0m[2m

In [6]:
# Import required libraries
import pandas as pd
import asyncio
from typing import List, Dict, Any
import sys
import os

# Add src to path for imports
sys.path.append('/app/src')

# Import our VectorStoreManager
from disney.rag.vector_store_manager import VectorStoreManager
from disney.shared.config import settings

print("✅ Imports successful!")


  from .autonotebook import tqdm as notebook_tqdm


✅ Imports successful!


## 1. Initialize VectorStoreManager

Connect to ChromaDB and initialize the vector store manager.


In [14]:
from disney.shared.config import settings

# Connection details for the ChromaDB service this notebook talks to.
CHROMA_HOST = "chromadb"
CHROMA_PORT = 8000

# Constructing the manager logs the collection name and the host:port it was
# initialized with, so no extra confirmation print is needed here.
vector_manager = VectorStoreManager(
    chroma_host=CHROMA_HOST,
    chroma_port=CHROMA_PORT,
)

# Optional manual checks, left disabled:
# print(vector_manager.chroma_host)
# print(f"✅ Connected to ChromaDB at {vector_manager.chroma_host}:{vector_manager.chroma_port}")
# print(f"📚 Collection: {vector_manager.collection_name}")


2025-09-23 20:17:27 - vector-store-manager - INFO - Vector store initialized with collection: disney_reviews
2025-09-23 20:17:27 - vector-store-manager - INFO - VectorStoreManager initialized with ChromaDB at chromadb:8000


  self.vector_store = Chroma(


## 2. Get Collection Statistics

Check what data is available in the database.


In [15]:
# Get collection statistics
async def get_stats():
    """Fetch collection statistics from the notebook-level ``vector_manager``.

    Returns:
        Whatever ``vector_manager.get_collection_stats()`` resolves to. The
        cell that calls this iterates the result with ``.items()``.
    """
    return await vector_manager.get_collection_stats()

# Run the async function
stats = await get_stats()
print("📊 Collection Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")


📊 Collection Statistics:
  collection_name: disney_reviews
  document_count: 87512
  last_updated: 2025-09-23T20:17:35.957087
  embedding_model: all-MiniLM-L6-v2


## 3. Basic Query - Find Relevant Context

Search for reviews related to a specific question or topic.


In [17]:
# Basic query function
async def search_reviews(
    query: str,
    n_results: int = 5,
    *,
    similarity_threshold: float = 0.07,
    max_context_length: int = 2000,
):
    """Search the review collection for context relevant to ``query``.

    Thin wrapper around ``vector_manager.get_relevant_context`` that forwards
    every argument by keyword. The threshold and context length used to be
    hard-coded; they are now keyword-only parameters with the same defaults,
    so existing calls behave exactly as before while experiments can override
    them.

    Args:
        query: Free-text question or topic to search for.
        n_results: Number of results requested from the vector store.
        similarity_threshold: Forwarded unchanged as ``similarity_threshold``
            (default 0.07, the value previously hard-coded here).
        max_context_length: Forwarded unchanged as ``max_context_length``
            (default 2000, the value previously hard-coded here).

    Returns:
        The value returned by ``get_relevant_context``. The cells in this
        notebook treat it as a sequence of dict-like records and read the
        ``content``, ``metadata`` and ``relevance_score`` keys.
    """
    return await vector_manager.get_relevant_context(
        query=query,
        n_results=n_results,
        similarity_threshold=similarity_threshold,
        max_context_length=max_context_length,
    )

# Example queries
queries = [
    "What do customers say about Space Mountain?",
    "How are the wait times at Disneyland?",
    "What do people think about the food?",
    "Customer complaints about cleanliness"
]

print("🔍 Testing basic queries:")
for i, query in enumerate(queries, 1):
    print(f"\n{i}. Query: {query}")
    results = await search_reviews(query, n_results=3)
    print(f"   Found {len(results)} relevant reviews")
    
    for j, result in enumerate(results[:2], 1):  # Show first 2 results
        print(f"   {j}. Relevance: {result.get('relevance_score', 0):.3f}")
        print(f"      Content: {result.get('content', '')[:100]}...")
        print(f"      Metadata: {result.get('metadata', {})}")


🔍 Testing basic queries:

1. Query: What do customers say about Space Mountain?
2025-09-23 20:18:17 - vector-store-manager - INFO - Retrieving context for query: What do customers say about Space Mountain?...
2025-09-23 20:18:17 - vector-store-manager - INFO - Retrieved 3 relevant documents
   Found 3 relevant reviews
   1. Relevance: 0.379
      Content: It's cheesy and all, but it really is magical. The staff are warm and friendly in the midst of chaos...
      Metadata: {'year': '2011', 'rating': 5, 'branch': 'Disneyland_California', 'original_index': 27778}
   2. Relevance: 0.379
      Content: It's cheesy and all, but it really is magical. The staff are warm and friendly in the midst of chaos...
      Metadata: {'branch': 'Disneyland_California', 'year': '2011', 'original_index': 27778, 'rating': 5}

2. Query: How are the wait times at Disneyland?
2025-09-23 20:18:17 - vector-store-manager - INFO - Retrieving context for query: How are the wait times at Disneyland?...
2025-09-23 2

## 4. Similarity Search

Find documents similar to a specific text or review.


In [None]:
# Similarity search function
async def find_similar(text: str, n_results: int = 5, *, similarity_threshold: float = 0.6):
    """Find documents similar to the given text.

    Thin wrapper around ``vector_manager.search_similar``. The similarity
    threshold used to be hard-coded; it is now a keyword-only parameter with
    the same default, so existing calls are unaffected and callers can try
    other cut-offs.

    Args:
        text: Reference text; forwarded as ``query_text``.
        n_results: Number of results requested from the vector store.
        similarity_threshold: Forwarded unchanged as ``similarity_threshold``
            (default 0.6, the value previously hard-coded here).

    Returns:
        The value returned by ``search_similar``. The demo cell in this
        notebook treats it as a sequence of dict-like records and reads the
        ``similarity_score`` and ``content`` keys.
    """
    return await vector_manager.search_similar(
        query_text=text,
        n_results=n_results,
        similarity_threshold=similarity_threshold,
    )

# Example similarity searches
similarity_queries = [
    "The ride was amazing and worth the wait",
    "Long lines and expensive food",
    "Great experience for families with kids"
]

print("🔍 Testing similarity search:")
for i, query in enumerate(similarity_queries, 1):
    print(f"\n{i}. Similar to: '{query}'")
    results = await find_similar(query, n_results=3)
    print(f"   Found {len(results)} similar reviews")
    
    for j, result in enumerate(results[:2], 1):
        print(f"   {j}. Similarity: {result.get('similarity_score', 0):.3f}")
        print(f"      Content: {result.get('content', '')[:100]}...")


## 5. Export Results to DataFrame

Convert query results to pandas DataFrame for further analysis.


In [None]:
# Export function
async def export_query_results(query: str, n_results: int = 50) -> pd.DataFrame:
    """Export query results to a pandas DataFrame.

    Runs ``search_reviews`` and flattens each hit into one row. The fixed
    columns are ``content``, ``relevance_score`` and ``id``; every metadata
    entry becomes an extra ``metadata_<key>`` column.

    Args:
        query: Search text passed to ``search_reviews``.
        n_results: Number of results to request.

    Returns:
        One row per hit, or an empty DataFrame when the search returns
        nothing. Hits lacking a given metadata key get a missing value in
        that column.
    """
    results = await search_reviews(query, n_results=n_results)

    if not results:
        return pd.DataFrame()

    rows = []
    for result in results:
        # `.get('metadata', {})` only covers a missing key; a hit whose
        # metadata is explicitly None would fail on `.items()`, so fall back
        # to an empty mapping in both cases.
        metadata = result.get('metadata') or {}
        rows.append({
            'content': result.get('content', ''),
            'relevance_score': result.get('relevance_score', 0),
            'id': result.get('id', ''),
            **{f'metadata_{key}': value for key, value in metadata.items()},
        })

    return pd.DataFrame(rows)

# Export results for analysis
print("📊 Exporting query results to DataFrame:")

# Export Space Mountain reviews
space_mountain_df = await export_query_results("Space Mountain", n_results=20)
print(f"\nSpace Mountain reviews: {len(space_mountain_df)} rows")
if not space_mountain_df.empty:
    print("\nColumns:", space_mountain_df.columns.tolist())
    print("\nFirst few rows:")
    print(space_mountain_df[['content', 'relevance_score', 'metadata_rating']].head())
    
    # Basic statistics
    if 'metadata_rating' in space_mountain_df.columns:
        print(f"\nRating statistics:")
        print(space_mountain_df['metadata_rating'].describe())
        print(f"\nRating distribution:")
        print(space_mountain_df['metadata_rating'].value_counts().sort_index())


## Summary

This notebook demonstrates various ways to query ChromaDB:

1. **Basic queries** - Find relevant reviews for any topic
2. **Similarity search** - Find documents similar to specific text
3. **Data export** - Convert results to pandas DataFrame

### Key Methods:
- `get_relevant_context()` - Main search method
- `search_similar()` - Similarity search
- `get_collection_stats()` - Database statistics

### Parameters:
- `query` - Search text (passed as `query_text` to `search_similar()`)
- `n_results` - Number of results to return
- `similarity_threshold` - Minimum similarity score (0-1)
- `max_context_length` - Maximum total context length

### Next Steps:
- Experiment with different queries
- Try different similarity thresholds
- Create your own analysis functions
- Export data for visualization
