In [1]:
import json
import pickle
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

def json_to_text(bird_data):
    """Convert entire bird JSON to searchable text"""
    def extract_text_from_dict(d, prefix=""):
        """Recursively extract all text from nested dictionaries"""
        text_parts = []
        
        if isinstance(d, dict):
            for key, value in d.items():
                if isinstance(value, str) and value.strip() and value.lower() not in ['unknown', 'not specified', '']:
                    text_parts.append(f"{key}: {value}")
                elif isinstance(value, list):
                    # Handle lists
                    list_items = [str(item) for item in value if str(item).strip() and str(item).lower() not in ['unknown', 'not specified', '']]
                    if list_items:
                        text_parts.append(f"{key}: {', '.join(list_items)}")
                elif isinstance(value, dict):
                    # Recursively process nested dicts
                    nested_text = extract_text_from_dict(value, f"{prefix}{key}_")
                    if nested_text:
                        text_parts.extend(nested_text)
                elif isinstance(value, (int, float)) and value != 0:
                    text_parts.append(f"{key}: {value}")
        
        return text_parts
    
    # Extract all text from the bird data
    all_text = extract_text_from_dict(bird_data)
    return ". ".join(all_text)

def create_json_embeddings(json_file: str, output_file: str = 'bird_json_embeddings.pkl'):
    """Convert JSON bird data to embeddings using full JSON content"""
    
    # Initialize OpenAI client
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables")
    
    client = OpenAI(api_key=api_key)
    
    # Load JSON data
    with open(json_file, 'r', encoding='utf-8') as f:
        bird_database = json.load(f)
    
    print(f"Processing {len(bird_database)} birds from JSON...")
    
    # Prepare data structures
    bird_data = {
        'bird_ids': [],
        'species_names': [],
        'features': [],
        'metadata': []
    }
    
    for idx, (species_name, bird_info) in enumerate(bird_database.items()):
        # Skip failed extractions
        if not bird_info.get('processing_success', True):
            continue
        
        print(f"Processing {idx+1}/{len(bird_database)}: {species_name}")
        
        # Convert entire JSON to searchable text
        full_text = json_to_text(bird_info)
        
        # Add species name at the beginning for emphasis
        searchable_text = f"Species: {species_name}. {full_text}"
        
        try:
            # Generate embedding from full text
            response = client.embeddings.create(
                model="text-embedding-3-small",
                input=searchable_text[:8000]  # Truncate if too long
            )
            embedding = np.array(response.data[0].embedding)
            
            # Extract basic metadata for Qdrant payload
            bird_profile = bird_info.get('bird_profile', {})
            
            metadata = {
                'species_name': species_name,
                'scientific_name': bird_profile.get('scientific_name', 'Unknown'),
                'family': bird_profile.get('family', 'Unknown'),
                'size': bird_profile.get('size', 'medium'),
                'full_json': bird_info,  # Store entire JSON for complete data access
                'searchable_text': searchable_text[:1000],  # Truncated version for display
                'data_completeness': len(full_text),  # Measure of how much data we have
                'processing_success': bird_info.get('processing_success', True)
            }
            
            # Try to extract bird_id
            bird_id = bird_profile.get('bird_id', idx)
            
            # Store data
            bird_data['bird_ids'].append(bird_id)
            bird_data['species_names'].append(species_name)
            bird_data['features'].append(embedding)
            bird_data['metadata'].append(metadata)
            
            print(f"  ✓ Embedded {len(searchable_text)} characters of text")
            
        except Exception as e:
            print(f"  ✗ Failed to process {species_name}: {e}")
            continue
    
    # Convert features to numpy array
    bird_data['features'] = np.array(bird_data['features'])
    
    # Save to pickle
    with open(output_file, 'wb') as f:
        pickle.dump(bird_data, f)
    
    print(f"\nSaved {len(bird_data['bird_ids'])} birds to {output_file}")
    print(f"Feature dimensions: {bird_data['features'].shape}")
    
    # Show some examples
    print(f"\nSample processed data:")
    for i in range(min(3, len(bird_data['species_names']))):
        metadata = bird_data['metadata'][i]
        print(f"{i+1}. {bird_data['species_names'][i]}")
        print(f"   Scientific: {metadata['scientific_name']}")
        print(f"   Text length: {metadata['data_completeness']} chars")
        print(f"   Sample text: {metadata['searchable_text'][:200]}...")
    
    return bird_data

def estimate_cost(json_file: str):
    """Estimate OpenAI cost for processing the JSON"""
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    valid_birds = sum(1 for bird_info in data.values() 
                     if bird_info.get('processing_success', True))
    
    # Estimate tokens (rough: 1000 chars = ~250 tokens)
    avg_chars_per_bird = 2000  # Conservative estimate
    total_tokens = valid_birds * (avg_chars_per_bird / 4)  # 4 chars per token
    cost = (total_tokens / 1_000_000) * 0.02
    
    print(f"\nEstimated cost: ~${cost:.2f} for {valid_birds} birds")
    print(f"Estimated tokens: {total_tokens:,.0f}")

if __name__ == "__main__":
    json_file = 'bird_database.json'
    
    # Check if file exists
    if not os.path.exists(json_file):
        print(f"JSON file not found: {json_file}")
        print("Please ensure the file exists in the current directory")
        exit(1)
    
    # Estimate cost first
    estimate_cost(json_file)
    
    # Process the data
    print("\nProcessing JSON to embeddings...")
    bird_data = create_json_embeddings(json_file, 'bird_json_embeddings.pkl')
    
    print("\nJSON to embeddings conversion complete!")
    print("Ready to upload to Qdrant with full JSON data preserved.")


Estimated cost: ~$0.00 for 88 birds
Estimated tokens: 44,000

Processing JSON to embeddings...
Processing 88 birds from JSON...
Processing 1/88: African Pied Wagtail
  ✓ Embedded 1017 characters of text
Processing 2/88: Barn Swallow
  ✓ Embedded 1429 characters of text
Processing 3/88: Black Woodpecker
  ✓ Embedded 1327 characters of text
Processing 4/88: Black-headed Gull
  ✓ Embedded 607 characters of text
Processing 5/88: Canada Goose
  ✓ Embedded 1099 characters of text
Processing 6/88: Carrion Crow
  ✓ Embedded 935 characters of text
Processing 7/88: Coal Tit
  ✓ Embedded 1085 characters of text
Processing 8/88: Common Blackbird
  ✓ Embedded 1324 characters of text
Processing 9/88: Common Chaffinch
  ✓ Embedded 1673 characters of text
Processing 10/88: Common Chiffchaff
  ✓ Embedded 1345 characters of text
Processing 11/88: Common Cuckoo
  ✓ Embedded 1645 characters of text
Processing 12/88: Common House Martin
  ✓ Embedded 882 characters of text
Processing 13/88: Common Linnet
 

In [3]:
import pickle
import numpy as np
import json

def inspect_pickle_file(pickle_file: str):
    """Inspect the contents of a pickle file"""
    
    print(f"Inspecting: {pickle_file}")
    print("=" * 50)
    
    try:
        with open(pickle_file, 'rb') as f:
            data = pickle.load(f)
        
        print(f"File loaded successfully!")
        print(f"Data type: {type(data)}")
        
        if isinstance(data, dict):
            print(f"\nDictionary keys: {list(data.keys())}")
            
            for key, value in data.items():
                print(f"\n{key}:")
                print(f"  Type: {type(value)}")
                
                if isinstance(value, (list, np.ndarray)):
                    print(f"  Length: {len(value)}")
                    if len(value) > 0:
                        print(f"  First item type: {type(value[0])}")
                        if isinstance(value, np.ndarray):
                            print(f"  Shape: {value.shape}")
                            print(f"  Data type: {value.dtype}")
                
                # Show first few items for each key
                if key == 'bird_ids' and isinstance(value, (list, np.ndarray)):
                    print(f"  Sample values: {list(value[:5])}")
                elif key == 'species_names' and isinstance(value, list):
                    print(f"  Sample values: {value[:3]}")
                elif key == 'features' and isinstance(value, np.ndarray):
                    print(f"  First vector shape: {value[0].shape if len(value) > 0 else 'N/A'}")
                    print(f"  Sample values: {value[0][:5] if len(value) > 0 else 'N/A'}")
                elif key == 'metadata' and isinstance(value, list):
                    print(f"  Sample metadata keys: {list(value[0].keys()) if len(value) > 0 else 'N/A'}")
        
        elif isinstance(data, list):
            print(f"\nList with {len(data)} items")
            if len(data) > 0:
                print(f"First item type: {type(data[0])}")
                if isinstance(data[0], dict):
                    print(f"First item keys: {list(data[0].keys())}")
        
        # Show detailed view of first few records
        print(f"\n" + "=" * 50)
        print("DETAILED VIEW OF FIRST 2 RECORDS:")
        print("=" * 50)
        
        if isinstance(data, dict) and 'species_names' in data:
            # Standard format with separate arrays
            for i in range(min(2, len(data['species_names']))):
                print(f"\nRecord {i+1}:")
                print(f"  Bird ID: {data['bird_ids'][i]}")
                print(f"  Species: {data['species_names'][i]}")
                print(f"  Feature vector shape: {data['features'][i].shape}")
                print(f"  Metadata:")
                for key, value in data['metadata'][i].items():
                    if key == 'full_json':
                        print(f"    {key}: <Full JSON object - {len(str(value))} chars>")
                    elif isinstance(value, str) and len(value) > 100:
                        print(f"    {key}: {value[:100]}...")
                    else:
                        print(f"    {key}: {value}")
        
        elif isinstance(data, list):
            # List format
            for i in range(min(2, len(data))):
                print(f"\nRecord {i+1}:")
                for key, value in data[i].items():
                    if key == 'features':
                        print(f"  {key}: {type(value)} shape {value.shape if hasattr(value, 'shape') else len(value)}")
                    elif key == 'full_json':
                        print(f"  {key}: <Full JSON object - {len(str(value))} chars>")
                    elif isinstance(value, str) and len(value) > 100:
                        print(f"  {key}: {value[:100]}...")
                    else:
                        print(f"  {key}: {value}")
        
        return data
        
    except Exception as e:
        print(f"Error loading pickle file: {e}")
        return None

def compare_pickle_files(*pickle_files):
    """Compare multiple pickle files"""
    
    print("COMPARING PICKLE FILES:")
    print("=" * 50)
    
    for i, file in enumerate(pickle_files):
        print(f"\nFile {i+1}: {file}")
        try:
            with open(file, 'rb') as f:
                data = pickle.load(f)
            
            if isinstance(data, dict):
                print(f"  Keys: {list(data.keys())}")
                if 'features' in data:
                    print(f"  Records: {len(data.get('species_names', []))}")
                    print(f"  Feature shape: {data['features'].shape if hasattr(data['features'], 'shape') else 'N/A'}")
            elif isinstance(data, list):
                print(f"  List length: {len(data)}")
                if len(data) > 0 and isinstance(data[0], dict):
                    print(f"  Item keys: {list(data[0].keys())}")
                    
        except Exception as e:
            print(f"  Error: {e}")

if __name__ == "__main__":
    # List available pickle files
    import os
    print("Available pickle files:")
    for file in os.listdir('.'):
        if file.endswith('.pkl'):
            print(f"  - {file}")
    
    # Check features folder too
    if os.path.exists('features'):
        print("\nIn features/ folder:")
        for file in os.listdir('features'):
            if file.endswith('.pkl'):
                print(f"  - features/{file}")
    
    print("\n" + "=" * 50)
    
    # Inspect specific files
    files_to_check = [
        'bird_json_embeddings.pkl'
    ]
    
    for file in files_to_check:
        if os.path.exists(file):
            print(f"\n{'='*60}")
            inspect_pickle_file(file)
            print(f"{'='*60}")
        else:
            print(f"\nFile not found: {file}")

Available pickle files:
  - bird_json_embeddings.pkl

In features/ folder:
  - features/bird_audio_features.pkl
  - features/bird_image_features.pkl
  - features/bird_text_features.pkl


Inspecting: bird_json_embeddings.pkl
File loaded successfully!
Data type: <class 'dict'>

Dictionary keys: ['bird_ids', 'species_names', 'features', 'metadata']

bird_ids:
  Type: <class 'list'>
  Length: 88
  First item type: <class 'int'>
  Sample values: [0, 1, 2, 3, 4]

species_names:
  Type: <class 'list'>
  Length: 88
  First item type: <class 'str'>
  Sample values: ['African Pied Wagtail', 'Barn Swallow', 'Black Woodpecker']

features:
  Type: <class 'numpy.ndarray'>
  Length: 88
  First item type: <class 'numpy.ndarray'>
  Shape: (88, 1536)
  Data type: float64
  First vector shape: (1536,)
  Sample values: [ 0.02088789  0.01562061  0.06003569  0.01210908 -0.05038467]

metadata:
  Type: <class 'list'>
  Length: 88
  First item type: <class 'dict'>
  Sample metadata keys: ['species_name', 'scie

In [4]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import pickle
import os
from dotenv import load_dotenv
import numpy as np

# Load environment variables
load_dotenv()

def upload_json_embeddings():
    """Upload bird JSON embeddings to Qdrant Cloud"""
    
    # Load JSON embeddings file
    features_file = 'bird_json_embeddings.pkl'
    
    print(f"Loading JSON embeddings from {features_file}...")
    
    if not os.path.exists(features_file):
        print(f"File not found: {features_file}")
        return False
    
    try:
        with open(features_file, 'rb') as f:
            data = pickle.load(f)
    except Exception as e:
        print(f"Error loading file: {e}")
        return False
    
    # Extract data components
    bird_ids = data['bird_ids']
    species_names = data['species_names']
    features = data['features']
    metadata = data['metadata']
    
    print(f"Loaded {len(bird_ids)} JSON embedding records")
    
    # Connect to Qdrant Cloud
    client = QdrantClient(
        url=os.getenv('QDRANT_ENDPOINT'),
        api_key=os.getenv('QDRANT_API_KEY'),
    )
    
    print(f"Connected to Qdrant Cloud: {os.getenv('QDRANT_ENDPOINT')}")
    
    # Get feature dimensions
    feature_dim = features.shape[1]
    print(f"Feature dimensions: {feature_dim}")
    
    # Create collection
    collection_name = "bird_text_search"
    
    try:
        # Delete collection if it exists
        try:
            client.delete_collection(collection_name)
            print(f"Deleted existing collection: {collection_name}")
        except:
            pass
        
        # Create new collection
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=feature_dim, distance=Distance.COSINE)
        )
        print(f"Created collection: {collection_name}")
        
    except Exception as e:
        print(f"Error creating collection: {e}")
        return False
    
    # Prepare points for upload
    points = []
    
    for i in range(len(bird_ids)):
        # Clean metadata for JSON serialization
        clean_metadata = {}
        for key, value in metadata[i].items():
            if key == 'full_json':
                # Skip the full JSON for now to avoid payload size issues
                # You can access it from your local pickle file if needed
                continue
            elif isinstance(value, (np.integer, np.int64)):
                clean_metadata[key] = int(value)
            elif isinstance(value, (np.floating, np.float64)):
                clean_metadata[key] = float(value)
            elif isinstance(value, list):
                clean_metadata[key] = [str(item) for item in value]
            elif value is None:
                clean_metadata[key] = None
            else:
                clean_metadata[key] = str(value)
        
        # Create point with JSON-based metadata
        point = PointStruct(
            id=i,  # Use index as point ID
            vector=features[i].tolist(),
            payload={
                "bird_id": int(bird_ids[i]) if isinstance(bird_ids[i], (int, np.integer)) else bird_ids[i],
                "species_name": species_names[i],
                "feature_type": "openai_json_embedding",
                "model_used": "text-embedding-3-small",
                "embedding_source": "complete_json_data",
                **clean_metadata  # Include extracted metadata (without full_json)
            }
        )
        points.append(point)
    
    # Upload to Qdrant in batches
    batch_size = 100
    total_batches = (len(points) + batch_size - 1) // batch_size
    
    print(f"Uploading {len(points)} points in {total_batches} batches...")
    
    try:
        for i in range(0, len(points), batch_size):
            batch = points[i:i + batch_size]
            client.upsert(collection_name=collection_name, points=batch)
            print(f"Uploaded batch {i//batch_size + 1}/{total_batches} ({len(batch)} points)")
        
        print(f"Successfully uploaded {len(points)} JSON embeddings to Qdrant Cloud!")
        
    except Exception as e:
        print(f"Error uploading: {e}")
        return False
    
    # Test search functionality
    print("\nTesting search functionality...")
    
    try:
        # Get a sample vector for testing
        test_vector = features[0].tolist()
        test_bird_name = species_names[0]
        
        results = client.search(
            collection_name=collection_name,
            query_vector=test_vector,
            limit=5
        )
        
        print(f"\nBirds similar to {test_bird_name} (JSON-based semantic similarity):")
        for result in results:
            payload = result.payload
            print(f"  {payload['species_name']} (bird_id: {payload['bird_id']}, score: {result.score:.3f})")
            print(f"    Scientific: {payload.get('scientific_name', 'Unknown')}")
            print(f"    Family: {payload.get('family', 'Unknown')}")
            print(f"    Data completeness: {payload.get('data_completeness', 0)} chars")
        
    except Exception as e:
        print(f"Error testing search: {e}")
    
    # Collection summary
    try:
        collection_info = client.get_collection(collection_name)
        print(f"\nCollection Summary:")
        print(f"  Name: {collection_name}")
        print(f"  Points: {collection_info.points_count}")
        print(f"  Dimensions: {collection_info.config.params.vectors.size}")
        print(f"  Status: {collection_info.status}")
        
    except Exception as e:
        print(f"Error getting collection info: {e}")
    
    return True

if __name__ == "__main__":
    print("Bird JSON Embeddings Uploader for Qdrant Cloud")
    print("=" * 50)
    
    # Upload embeddings
    success = upload_json_embeddings()
    
    if success:
        print("\nJSON embeddings successfully uploaded!")
        print("\nYou now have all three modalities in Qdrant Cloud:")
        print("  - bird_audio_search (audio embeddings)")
        print("  - bird_image_search (image embeddings)")  
        print("  - bird_text_search (JSON-based semantic embeddings)")
        print("\nReady for multi-modal Streamlit demo!")
    else:
        print("Upload failed. Please check your files and credentials.")

Bird JSON Embeddings Uploader for Qdrant Cloud
Loading JSON embeddings from bird_json_embeddings.pkl...
Loaded 88 JSON embedding records
Connected to Qdrant Cloud: https://559d3608-530f-4a96-97f3-4bb6202a5fb9.us-west-1-0.aws.cloud.qdrant.io
Feature dimensions: 1536
Deleted existing collection: bird_text_search
Created collection: bird_text_search
Uploading 88 points in 1 batches...
Uploaded batch 1/1 (88 points)
Successfully uploaded 88 JSON embeddings to Qdrant Cloud!

Testing search functionality...

Birds similar to African Pied Wagtail (JSON-based semantic similarity):
  African Pied Wagtail (bird_id: 0, score: 1.000)
    Scientific: Motacilla aguimp
    Family: Motacillidae
    Data completeness: 986 chars
  Western Yellow Wagtail (bird_id: 81, score: 0.707)
    Scientific: Motacilla flava
    Family: Motacillidae
    Data completeness: 1199 chars
  Meadow Pipit (bird_id: 65, score: 0.628)
    Scientific: Anthus pratensis
    Family: Motacillidae
    Data completeness: 1060 chars


  results = client.search(
