# ChromaDB Query Testing
Testing semantic keyword generation and vector DB query results

In [None]:
import chromadb
from chromadb.utils import embedding_functions
import os
import json
from typing import List, Dict

In [None]:
# Read the OpenAI API key from the environment, then open the Chroma collection.
# OPENAI_API_KEY must be set in the environment or in the .env.local file.
# Example: export OPENAI_API_KEY="your-key-here"

import os
from dotenv import load_dotenv

# Load variables from .env.local
load_dotenv('.env.local')

# Fetch the key once; the same value is validated and then handed to the
# embedding function below.
_openai_api_key = os.getenv('OPENAI_API_KEY')
if not _openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in environment. Please set it in .env.local")

# ChromaDB client rooted at ./data/chroma_db
chroma_client = chromadb.PersistentClient(path="./data/chroma_db")

# Embedding function that the collection uses for query texts
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=_openai_api_key,
)

collection = chroma_client.get_collection(
    embedding_function=embedding_function,
    name="telegram_activities",
)

# Quick sanity output: collection name and number of stored items
print(f"✅ Connected to collection: {collection.name}")
print(f"📊 Total activities: {collection.count()}")

In [None]:
# Semantic keyword builder (matching keywords.ts logic)

# Lookup tables live at module level so they are built once, not on every call.

# Budget level (0-4) -> descriptive price keywords.
_BUDGET_KEYWORDS = {
    0: ['free', 'cheap', 'budget-friendly', 'under $30'],
    1: ['affordable', 'budget', 'economical', '$30-50'],
    2: ['moderate', 'mid-range', '$50-75'],
    3: ['comfortable', 'premium', 'upscale', '$75-100'],
    4: ['luxury', 'high-end', 'expensive', '$100+'],
}

# Group type (lower-case) -> descriptive company keywords.
_PAX_KEYWORDS = {
    'solo': ['solo', 'alone', 'individual', 'self'],
    'date': ['romantic', 'couple', 'date', 'intimate'],
    'friends': ['group', 'friends', 'social', 'gathering'],
    'family': ['family', 'kids', 'children', 'all-ages'],
}

# MBTI type (upper-case) -> personality trait keywords.
_MBTI_KEYWORDS = {
    'INTJ': ['strategic', 'independent', 'intellectual'],
    'INTP': ['analytical', 'curious', 'theoretical'],
    'ENTJ': ['leadership', 'ambitious', 'organized'],
    'ENTP': ['innovative', 'debate', 'entrepreneurial'],
    'INFJ': ['meaningful', 'creative', 'idealistic'],
    'INFP': ['authentic', 'artistic', 'introspective'],
    'ENFJ': ['inspiring', 'empathetic', 'community'],
    'ENFP': ['creative', 'spontaneous', 'social', 'adventurous', 'enthusiastic'],
    'ISTJ': ['traditional', 'reliable', 'structured'],
    'ISFJ': ['caring', 'detail-oriented', 'supportive'],
    'ESTJ': ['organized', 'practical', 'efficient'],
    'ESFJ': ['social', 'warm', 'cooperative'],
    'ISTP': ['hands-on', 'practical', 'flexible'],
    'ISFP': ['artistic', 'gentle', 'present-moment'],
    'ESTP': ['energetic', 'action-oriented', 'bold'],
    'ESFP': ['entertaining', 'spontaneous', 'fun-loving'],
}

# Extra keywords appended when the 'spicy' flag is truthy.
_SPICY_KEYWORDS = ['nightlife', 'drinks', 'bars', 'clubs', 'evening']


def _as_text(value) -> str:
    """Return ``value`` as a string, mapping None and other falsy values to ''."""
    return str(value) if value else ''


def build_semantic_keywords(params: Dict) -> str:
    """Build a space-separated semantic keyword string from user preferences.

    Recognised keys in ``params`` (all optional):
        query: free-text query, added verbatim.
        activities: iterable of activity names, each added lower-cased.
        budget: level 0-4 (defaults to 2 when the key is absent); the first
            two keywords for that level are added.
        numPax: 'solo', 'date', 'friends' or 'family' (case-insensitive); the
            first two keywords for that group are added.
        mbti: four-letter MBTI type (case-insensitive); all of its trait
            keywords are added.
        spicy: when truthy, nightlife keywords are added.

    Unknown, missing, or ``None`` values for ``budget``, ``numPax`` and
    ``mbti`` contribute no keywords instead of raising.

    Returns:
        The collected keywords joined with single spaces ('' if there are none).
    """
    keywords = []

    # Add user query
    if params.get('query'):
        keywords.append(params['query'])

    # Add activity types
    if params.get('activities'):
        keywords.extend(a.lower() for a in params['activities'])

    # Budget mapping (only the first two keywords of the level are used)
    budget = params.get('budget', 2)
    if budget in _BUDGET_KEYWORDS:
        keywords.extend(_BUDGET_KEYWORDS[budget][:2])

    # Group size mapping; _as_text guards against an explicit None, which
    # previously raised AttributeError on .lower()
    num_pax = _as_text(params.get('numPax')).lower()
    if num_pax in _PAX_KEYWORDS:
        keywords.extend(_PAX_KEYWORDS[num_pax][:2])

    # MBTI personality traits; same None guard as above
    mbti = _as_text(params.get('mbti')).upper()
    if mbti in _MBTI_KEYWORDS:
        keywords.extend(_MBTI_KEYWORDS[mbti])

    # Spicy mode (nightlife)
    if params.get('spicy'):
        keywords.extend(_SPICY_KEYWORDS)

    return ' '.join(keywords)


print("✅ Semantic keyword builder ready")

In [None]:
def _first_query_column(results, key, size):
    """Return the first query's values for ``key``, or ``size`` Nones if the column is missing or None."""
    column = results.get(key)
    if not column:
        return [None] * size
    return column[0]


def _print_activity(rank, full_data, distance):
    """Print one result entry (title, optional distance, and detail fields)."""
    # A key that is present but null in the stored JSON would otherwise break
    # the slice / join below, so map None to the same fallbacks as a missing key.
    description = full_data.get('description', 'N/A')
    if description is None:
        description = 'N/A'
    tags = full_data.get('tags') or []

    print(f"\n{rank}. {full_data.get('title', 'Untitled')}")
    if distance is not None:
        print(f"   Distance: {distance:.4f}")
    print(f"   Description: {str(description)[:150]}...")
    print(f"   Location: {full_data.get('location', 'N/A')}")
    print(f"   Venue: {full_data.get('venue_name', 'N/A')}")
    print(f"   Price: ${full_data.get('price', 'N/A')}")
    # str() keeps the join from failing on non-string tag values
    print(f"   Tags: {', '.join(str(tag) for tag in tags[:5])}")
    print(f"   Source: {full_data.get('source_channel', 'N/A')}")


def query_and_display(user_params: Dict, n_results: int = 10):
    """Query ChromaDB with enriched keywords and print the results.

    The user preferences are turned into a keyword string with
    ``build_semantic_keywords``, that string is sent to ``collection.query``,
    and each returned entry whose metadata has a ``full_data`` JSON payload is
    printed. Entries without ``full_data`` are skipped but keep their rank
    number.

    Args:
        user_params: preference dict; the keys read here are 'query',
            'activities', 'budget', 'numPax', 'mbti' and 'spicy'.
        n_results: number of results requested from the collection.

    Returns:
        The unmodified result object returned by ``collection.query``.

    Raises:
        json.JSONDecodeError: if a ``full_data`` payload is not valid JSON.
    """
    # Build semantic query
    semantic_query = build_semantic_keywords(user_params)

    print("\n" + "="*80)
    print("🔍 USER QUERY")
    print("="*80)
    print(f"Original query: {user_params.get('query', 'N/A')}")
    print(f"Activities: {user_params.get('activities', [])}")
    print(f"Budget: {user_params.get('budget', 2)} (0=Broke, 2=Moderate, 4=Baller)")
    print(f"Group: {user_params.get('numPax', 'N/A')}")
    print(f"MBTI: {user_params.get('mbti', 'N/A')}")
    print(f"Spicy: {user_params.get('spicy', False)}")

    print("\n" + "-"*80)
    print("🔑 SEMANTIC KEYWORDS GENERATED:")
    print("-"*80)
    print(f"{semantic_query}")

    # Query ChromaDB
    results = collection.query(
        query_texts=[semantic_query],
        n_results=n_results
    )

    # One query text was sent, so only the first batch of each column is used.
    ids = results['ids'][0]
    # A column can be present but None; fall back to per-entry None so the
    # loop still runs instead of failing on None[0].
    metadatas = _first_query_column(results, 'metadatas', len(ids))
    distances = _first_query_column(results, 'distances', len(ids))

    print("\n" + "-"*80)
    print(f"📊 RESULTS: Found {len(ids)} activities")
    print("-"*80)

    # Display results
    for rank, (metadata, distance) in enumerate(zip(metadatas, distances), start=1):
        if not metadata or 'full_data' not in metadata:
            continue
        _print_activity(rank, json.loads(metadata['full_data']), distance)

    print("\n" + "="*80 + "\n")
    return results

## Test Case 1: Romantic Date
Budget-conscious couple looking for a romantic experience

In [None]:
# Scenario 1 inputs: 'date' group, ENFP, cafes and museums, budget level 2
test_1 = dict(
    query='romantic dinner date with creative twist',
    activities=['Cafes', 'Museums'],
    budget=2,  # Moderate
    numPax='date',
    mbti='ENFP',
    spicy=False,
)

# Run the query and keep the raw results for later inspection
results_1 = query_and_display(test_1, n_results=10)

## Test Case 2: Budget-Friendly Solo Activity
Introvert looking for affordable solo activities

In [None]:
# Scenario 2 inputs: 'solo' group, INFP, parks and cafes, budget level 0
test_2 = dict(
    query='Chill activities for introverts under $30',
    activities=['Parks', 'Cafes'],
    budget=0,  # Broke
    numPax='solo',
    mbti='INFP',
    spicy=False,
)

# Run the query and keep the raw results for later inspection
results_2 = query_and_display(test_2, n_results=10)

## Test Case 3: Luxury Group Experience
High-budget group looking for premium experiences

In [None]:
# Scenario 3 inputs: 'friends' group, ENTJ, fine dining and bars,
# budget level 4, with the spicy flag switched on
test_3 = dict(
    query='Luxury dining and entertainment for special celebration',
    activities=['Fine Dining', 'Bars'],
    budget=4,  # Baller
    numPax='friends',
    mbti='ENTJ',
    spicy=True,
)

# Run the query and keep the raw results for later inspection
results_3 = query_and_display(test_3, n_results=10)

## Test Case 4: Family-Friendly Day Out
Family looking for kid-friendly activities

In [None]:
# Scenario 4 inputs: 'family' group, ESFJ, parks/museums/attractions, budget level 2
test_4 = dict(
    query='Fun family day out with activities for kids',
    activities=['Parks', 'Museums', 'Attractions'],
    budget=2,  # Moderate
    numPax='family',
    mbti='ESFJ',
    spicy=False,
)

# Run the query and keep the raw results for later inspection
results_4 = query_and_display(test_4, n_results=10)

## Test Case 5: Adventure Seeker
Looking for unique and adventurous experiences

In [None]:
# Scenario 5 inputs: 'friends' group, ESTP, sports and outdoors, budget level 3
test_5 = dict(
    query='Unique outdoor adventures and thrilling experiences',
    activities=['Sports', 'Outdoors'],
    budget=3,  # Comfortable
    numPax='friends',
    mbti='ESTP',
    spicy=False,
)

# Run the query and keep the raw results for later inspection
results_5 = query_and_display(test_5, n_results=10)

## Test Case 6: Cultural Experience
Looking for art, culture, and intellectual experiences

In [None]:
# Scenario 6 inputs: 'solo' group, INTJ, museums/art/cafes, budget level 2
test_6 = dict(
    query='Art galleries, cultural sites, and intellectual cafes',
    activities=['Museums', 'Art', 'Cafes'],
    budget=2,  # Moderate
    numPax='solo',
    mbti='INTJ',
    spicy=False,
)

# Run the query and keep the raw results for later inspection
results_6 = query_and_display(test_6, n_results=10)

## Analysis: Keyword Effectiveness
Compare results with and without semantic enrichment

In [None]:
# Baseline: send only the user's text, with no semantic enrichment
raw_query = "romantic dinner date with creative twist"
print("\n" + "="*80)
print("🔍 RAW QUERY (No Semantic Enrichment)")
print("="*80)
print(f"Query: {raw_query}\n")

raw_results = collection.query(query_texts=[raw_query], n_results=5)

print("Top 5 Results (Raw Query):")
for idx, _ in enumerate(raw_results['ids'][0]):
    meta = raw_results['metadatas'][0][idx]
    # Entries without a full_data payload are skipped but keep their number
    if not meta or 'full_data' not in meta:
        continue
    title = json.loads(meta['full_data']).get('title', 'Untitled')
    print(f"{idx + 1}. {title}")

# Comparison: the same scenario (test_1) run through the keyword builder
enriched_query = build_semantic_keywords(test_1)
print("\n" + "="*80)
print("🔑 ENRICHED QUERY (With Semantic Keywords)")
print("="*80)
print(f"Query: {enriched_query}\n")

enriched_results = collection.query(query_texts=[enriched_query], n_results=5)

print("Top 5 Results (Enriched Query):")
for idx, _ in enumerate(enriched_results['ids'][0]):
    meta = enriched_results['metadatas'][0][idx]
    # Entries without a full_data payload are skipped but keep their number
    if not meta or 'full_data' not in meta:
        continue
    title = json.loads(meta['full_data']).get('title', 'Untitled')
    print(f"{idx + 1}. {title}")

print("\n" + "="*80)