In [1]:
# Enhanced extraction script with full bird metadata
import pandas as pd
import numpy as np
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import librosa
import pickle

# Extract one mean-pooled wav2vec2 embedding per row of the clips CSV and
# pickle the embeddings together with each row's metadata.

# Single source of truth for the sample rate: the clip is resampled to this
# rate on load and the same value is declared to the processor.
SAMPLE_RATE = 16000
MODEL_NAME = "facebook/wav2vec2-base"

# Load model (once). The default Hugging Face cache is used, so re-running
# this cell does not download the weights again and also works offline once
# the files are cached.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME)
model.eval()  # inference only; make the evaluation mode explicit

# Load your clips info
clips_df = pd.read_csv("extracted_clips_info.csv")

features_list = []

for _, row in clips_df.iterrows():
    print(f"Processing bird_id {row['bird_id']}: {row['species_name']}")

    # Load the audio clip, resampled to SAMPLE_RATE
    audio, sr = librosa.load(row['clip_path'], sr=SAMPLE_RATE)

    # Get wav2vec features
    inputs = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        # Average the hidden states over dim=1, then drop only the leading
        # batch axis (one clip per forward pass) to get a 1-D vector.
        features = torch.mean(outputs.last_hidden_state, dim=1).squeeze(0).numpy()

    # Save ALL metadata including bird_id
    features_list.append({
        'bird_id': int(row['bird_id']),              # Important: Bird ID
        'species_name': row['species_name'],         # Species name
        'original_file_id': row['original_file_id'], # Original xc file
        'clip_path': row['clip_path'],               # Path to processed clip
        'features': features,                        # Pooled wav2vec vector
        'feature_shape': features.shape,             # Vector shape info
        # Falls back to 10.0 when the CSV has no 'clip_duration' column
        'clip_duration': row.get('clip_duration', 10.0)
    })

# Save features with all metadata
with open('bird_audio_features.pkl', 'wb') as f:
    pickle.dump(features_list, f)

print(f"✅ Extracted {len(features_list)} feature vectors with bird_ids!")

# Quick verification
print(f"\n📊 Bird IDs saved: {[item['bird_id'] for item in features_list[:5]]}")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00,  4.89it/s]


Processing bird_id 0: African Pied Wagtail
Processing bird_id 1: Barn Swallow
Processing bird_id 2: Black Woodpecker
Processing bird_id 3: Black-headed Gull
Processing bird_id 4: Canada Goose
Processing bird_id 5: Carrion Crow
Processing bird_id 6: Coal Tit
Processing bird_id 7: Common Blackbird
Processing bird_id 8: Common Chaffinch
Processing bird_id 9: Common Chiffchaff
Processing bird_id 10: Common Cuckoo
Processing bird_id 11: Common House Martin
Processing bird_id 12: Common Linnet
Processing bird_id 13: Common Moorhen
Processing bird_id 14: Common Nightingale
Processing bird_id 15: Common Pheasant
Processing bird_id 16: Common Redpoll
Processing bird_id 17: Common Redshank
Processing bird_id 18: Common Redstart
Processing bird_id 19: Common Reed Bunting
Processing bird_id 20: Common Snipe
Processing bird_id 21: Common Starling
Processing bird_id 22: Common Swift
Processing bird_id 23: Common Whitethroat
Processing bird_id 24: Common Wood Pigeon
Processing bird_id 25: Corn Buntin

In [2]:
import pickle

import numpy as np
import pandas as pd  # required by the summary DataFrame; previously relied on an earlier cell

# Quick peek at your features
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('bird_audio_features.pkl', 'rb') as f:
    features_list = pickle.load(f)

# One row of descriptive statistics per stored feature vector.
summary = [
    {
        'bird_id': item['bird_id'],
        'species': item['species_name'],
        # Labelled 'mean' (was 'features'): the value is the scalar mean of
        # the vector, not the vector itself.
        'mean': item['features'].mean(),
        'std': item['features'].std(),
        'min': item['features'].min(),
        'max': item['features'].max()
    }
    for item in features_list
]

df = pd.DataFrame(summary)
print(df.head(10))


   bird_id               species  features       std       min       max
0        0  African Pied Wagtail -0.002588  0.263439 -1.784976  1.311557
1        1          Barn Swallow -0.004445  0.246912 -1.558102  1.120933
2        2      Black Woodpecker -0.000574  0.284235 -1.687708  2.230798
3        3     Black-headed Gull -0.003947  0.295093 -1.759302  2.278882
4        4          Canada Goose  0.004299  0.288786 -1.927482  1.702550
5        5          Carrion Crow  0.004574  0.309438 -2.009103  1.471438
6        6              Coal Tit -0.003273  0.287107 -1.725532  2.098139
7        7      Common Blackbird -0.006315  0.250198 -2.196811  1.325279
8        8      Common Chaffinch -0.003620  0.299238 -2.071028  2.078463
9        9     Common Chiffchaff -0.006822  0.279295 -2.608278  2.715713


In [3]:
# Check for issues: sanity-check every vector in features_list and print a
# single verdict that reflects ALL of the checks below.
print("🔍 QUALITY CHECK:")

EXPECTED_DIM = 768

# Anything to check at all? all()/any() pass vacuously on an empty list,
# so an empty features_list must count as a failure.
has_vectors = len(features_list) > 0
print(f"   Vectors checked: {len(features_list)}")

# All same dimension?
dims = [len(item['features']) for item in features_list]
all_768 = all(d == EXPECTED_DIM for d in dims)
print(f"   All 768D? {all_768}")

# Any NaN values?
has_nan = any(np.isnan(item['features']).any() for item in features_list)
print(f"   Has NaN? {has_nan}")

# All zeros? (True when at least one vector is entirely zero)
all_zero = any(np.all(item['features'] == 0) for item in features_list)
print(f"   Any all-zero? {all_zero}")

# The verdict previously ignored the dimension check; it now requires every
# check to pass.
checks_pass = has_vectors and all_768 and not has_nan and not all_zero
print("   ✅ Features look good!" if checks_pass else "❌ Issues found!")

🔍 QUALITY CHECK:
   All 768D? True
   Has NaN? False
   Any all-zero? False
   ✅ Features look good!


In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import pickle
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Load features with bird IDs
# NOTE(review): the extraction cell in this notebook writes
# 'bird_audio_features.pkl' to the working directory, while this cell reads
# from 'features/' — confirm the file is expected to live there.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('features/bird_audio_features.pkl', 'rb') as f:
    features_list = pickle.load(f)

# Connect to Qdrant Cloud instead of localhost.
# Fail early with a clear message when the endpoint is missing, rather than
# handing url=None to the client.
qdrant_url = os.getenv('QDRANT_ENDPOINT')
if not qdrant_url:
    raise RuntimeError(
        "QDRANT_ENDPOINT is not set; add it to the environment or the .env file"
    )

client = QdrantClient(
    url=qdrant_url,
    api_key=os.getenv('QDRANT_API_KEY'),
)

# Create collection only if it is missing, so the cell can be re-run without
# create_collection raising on an already existing collection.
collection_name = "bird_audio_search"
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE)
    )

# Upload with bird_id as point ID for easy retrieval.
# Because the point ID is the bird_id, two entries sharing a bird_id would
# overwrite each other on upsert — presumably there is one clip per bird;
# verify against the clips CSV.
points = [
    PointStruct(
        id=item['bird_id'],  # Use bird_id as Qdrant point ID
        vector=item['features'].tolist(),
        payload={
            "bird_id": item['bird_id'],
            "species_name": item['species_name'],
            "original_file_id": item['original_file_id'],
            "clip_path": item['clip_path'],
            "feature_type": "wav2vec2",
            "clip_duration": item['clip_duration']
        }
    )
    for item in features_list
]

# Upload to Qdrant (upsert replaces points that already exist with the same ID)
client.upsert(collection_name=collection_name, points=points)

print(f"✅ Uploaded {len(points)} birds to Qdrant Cloud")

# Test search by bird_id
def search_similar_birds(bird_id, limit=5):
    """Print the stored birds whose vectors are closest to the given bird's.

    Looks up the entry for ``bird_id`` in the module-level ``features_list``,
    queries the ``collection_name`` collection through the module-level
    ``client`` with that entry's feature vector, and prints one line per hit.

    Args:
        bird_id: Value matched against each entry's ``'bird_id'`` key.
        limit: Maximum number of results requested from Qdrant.

    Raises:
        ValueError: If no entry in ``features_list`` has this ``bird_id``
            (previously this surfaced as a bare ``StopIteration``).
    """
    target_bird = next(
        (item for item in features_list if item['bird_id'] == bird_id),
        None,
    )
    if target_bird is None:
        raise ValueError(f"No features found for bird_id {bird_id}")

    # query_points replaces the deprecated client.search; the scored hits
    # are returned on the response's `points` attribute.
    response = client.query_points(
        collection_name=collection_name,
        query=target_bird['features'].tolist(),
        limit=limit,
        with_payload=True,  # the payload is needed to print the species name
    )
    results = response.points

    print(f"\n🔍 Birds similar to ID {bird_id} ({target_bird['species_name']}):")
    for result in results:
        print(f"   ID {result.id}: {result.payload['species_name']} (score: {result.score:.3f})")

# Test
search_similar_birds(bird_id=0)

✅ Uploaded 88 birds to Qdrant Cloud

🔍 Birds similar to ID 0 (African Pied Wagtail):
   ID 0: African Pied Wagtail (score: 1.000)
   ID 85: Wood Sandpiper (score: 0.802)
   ID 20: Common Snipe (score: 0.798)
   ID 35: Eurasian Magpie (score: 0.779)
   ID 56: Great Spotted Woodpecker (score: 0.728)


  results = client.search(
