In [1]:
from aind_data_access_api.document_db import MetadataDbClient
from aind_metadata_upgrader.upgrade import Upgrade
import pandas as pd

# One client per metadata schema version. Both talk to the same host and
# collection; only the database name differs.
_shared_client_kwargs = {
    "host": "api.allenneuraldynamics.org",
    "collection": "data_assets",
}

client_v1 = MetadataDbClient(database="metadata_index", **_shared_client_kwargs)
client_v2 = MetadataDbClient(database="metadata_index_v2", **_shared_client_kwargs)


In [2]:

# The v1 and v2 databases are queried with the same filter: assets whose
# location starts with the private bucket prefix and whose project name
# matches. Only the asset name is projected out.
def _private_project_name_pipeline():
    """Build a fresh aggregation pipeline returning the names of this project's private assets."""
    return [
        {
            "$match": {
                "location": {"$regex": "^s3://aind-private-data-prod-o5171v"},
                "data_description.project_name": "Cognitive flexibility in patch foraging",
            }
        },
        {"$project": {"name": 1, "_id": 0}},
    ]


# Asset names present in the v1 database
print("Getting all assets from v1 database in private bucket...")
v1_pipeline = _private_project_name_pipeline()
v1_results = client_v1.aggregate_docdb_records(v1_pipeline)
v1_names = [record['name'] for record in v1_results]
print(f"Found {len(v1_names)} assets in v1 database")

# Asset names present in the v2 database
print("\nGetting all assets from v2 database in private bucket...")
v2_pipeline = _private_project_name_pipeline()
v2_results = client_v2.aggregate_docdb_records(v2_pipeline)
v2_names = [record['name'] for record in v2_results]
print(f"Found {len(v2_names)} assets in v2 database")

Getting all assets from v1 database in private bucket...
Found 3612 assets in v1 database

Getting all assets from v2 database in private bucket...
Found 2791 assets in v2 database


In [3]:
# Deduplicate both name lists; the union is every asset seen in either database
v1_set, v2_set = set(v1_names), set(v2_names)
all_names = v1_set | v2_set

# Create DataFrame with all assets
import re

def extract_subject_id(name: str) -> str:
    """Pull the numeric subject ID out of an asset name.

    The ID is the run of digits between ``behavior_`` and the next
    underscore, e.g. ``behavior_754571_2024-09-19`` -> ``754571``.

    Returns:
        str: The digits of the first match, or '' when the name has no
        ``behavior_<digits>_`` segment.
    """
    found = re.search(r'behavior_(\d+)_', name)
    if found is None:
        return ''
    return found.group(1)

# One record per asset name (sorted), flagged by which database(s) contain it
data = [
    {
        'name': asset_name,
        'subject_id': extract_subject_id(asset_name),
        'in_v1': asset_name in v1_set,
        'in_v2': asset_name in v2_set,
    }
    for asset_name in sorted(all_names)
]

df = pd.DataFrame(data)

# Persist the comparison table next to the notebook
csv_filename = 'vr_foraging_v1_v2_comparison.csv'
df.to_csv(csv_filename, index=False)
print(f"\nSaved comparison to '{csv_filename}'")

# Summary: how many assets fall into each membership category
in_v1_mask = df['in_v1']
in_v2_mask = df['in_v2']
v1_only_count = (in_v1_mask & ~in_v2_mask).sum()
v2_only_count = (~in_v1_mask & in_v2_mask).sum()
both_count = (in_v1_mask & in_v2_mask).sum()

print("\n=== SUMMARY ===")
print(f"Total unique assets: {len(all_names)}")
print(f"Assets in v1 only: {v1_only_count}")
print(f"Assets in v2 only: {v2_only_count}")
print(f"Assets in both v1 and v2: {both_count}")



Saved comparison to 'vr_foraging_v1_v2_comparison.csv'

=== SUMMARY ===
Total unique assets: 3612
Assets in v1 only: 821
Assets in v2 only: 0
Assets in both v1 and v2: 2791


In [4]:
# Display the comparison table (columns: name, subject_id, in_v1, in_v2);
# the rendered output below is truncated in the middle.
df

Unnamed: 0,name,subject_id,in_v1,in_v2
0,behavior_123456_2025-04-15_01-17-37,123456,True,False
1,behavior_716455_2025-01-28_18-52-39,716455,True,True
2,behavior_716455_2025-01-29_19-28-24,716455,True,True
3,behavior_716455_2025-02-12_01-01-23,716455,True,True
4,behavior_716455_2025-02-12_02-33-04,716455,True,True
...,...,...,...,...
3607,behavior_828422_2025-11-05_17-17-12,828422,True,True
3608,behavior_828422_2025-11-06_16-59-53,828422,True,True
3609,behavior_828422_2025-11-07_17-12-02,828422,True,True
3610,behavior_828422_2025-11-10_17-10-06,828422,True,True


In [5]:
import sys
import os
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO

def _missing_core_files(record: dict) -> list:
    """List the core metadata files that are absent from a v1 record.

    ``procedures`` and ``data_description`` count as missing when the key is
    absent or its value is empty/None. ``acquisition`` only counts as missing
    when the key itself is absent (an empty value is accepted). ``instrument``
    is reported when neither ``rig`` nor ``instrument`` has a non-empty value.
    """
    missing = []
    if not record.get("procedures"):
        missing.append("procedures")
    if not record.get("data_description"):
        missing.append("data_description")
    if "acquisition" not in record:
        missing.append("acquisition")
    if not record.get("rig") and not record.get("instrument"):
        missing.append("instrument")
    return missing


def _classify_upgrade_error(error_type: str, error_msg: str) -> str:
    """Map an upgrade exception (class name + message) to a failure label.

    Checks run in a fixed order and the first match wins, so a message that
    mentions several metadata files is labelled by the earliest rule it hits:
    subject, then acquisition, then instrument, then missing core files.
    """
    # Lower-case once; most keyword checks are case-insensitive.
    msg = error_msg.lower()
    is_validation_error = error_type == "ValidationError"

    # Subject validation errors (e.g., missing genotype). "subject" also
    # matches messages that mention "MouseSubject".
    if "subject" in msg and ("genotype" in msg or is_validation_error):
        return "subject validation error"

    # Acquisition validation errors, from most to least specific
    if "acquisition" in msg:
        if "instrument id" in msg and "does not match" in msg:
            return "acquisition validation error: instrument ID mismatch"
        if "session_end_time" in msg and is_validation_error:
            return "acquisition validation error: missing session.session_end_time"
        mentions_datetime = (
            "datetime" in msg
            or "acquisition_end_time" in msg
            or "acquisition_start_time" in msg
        )
        if mentions_datetime and is_validation_error:
            return "acquisition validation error: missing datetime"
        if "validation" in msg or is_validation_error:
            return "acquisition validation error"
        # Otherwise fall through to the remaining checks below.

    # Instrument errors are all given one label. The two field-name checks
    # are deliberately case-sensitive (they look at the raw message).
    if "instrument" in msg and (
        "validation" in msg
        or "harp_device_type" in error_msg
        or "is_clock_generator" in error_msg
    ):
        return "instrument upgrade error"

    # Missing required files reported by the upgrader itself
    if "required core files" in error_msg:
        return f"missing_required_files: {error_msg[:200]}"

    # Anything unrecognised: keep the type and a truncated message
    return f"validation_error: {error_type}: {error_msg[:200]}"


def get_failure_reason(asset_name: str) -> str:
    """
    Determine why an asset failed to upgrade.

    Looks the asset up by name in the v1 database, checks that the required
    core metadata files are present, and otherwise re-runs the upgrade with
    its output suppressed to classify the exception it raises.

    Args:
        asset_name: Value of the record's ``name`` field in the v1 database.

    Returns:
        str: One of
            - "not_found_in_v1" when the query returns no record,
            - "missing_required_files: <comma-separated files>",
            - "upgrade_succeeds" when the upgrade raises nothing
              (shouldn't happen if the asset is not in v2),
            - a specific subject/acquisition/instrument label, or
            - "validation_error: <type>: <message truncated to 200 chars>"
              for anything unrecognised.
    """
    # NOTE(review): the lookup matches on name only and takes the first hit;
    # this assumes asset names are unique in the v1 collection - confirm.
    pipeline = [{"$match": {"name": asset_name}}, {"$limit": 1}]
    results = client_v1.aggregate_docdb_records(pipeline)

    if not results:
        return "not_found_in_v1"

    record = results[0]

    missing_files = _missing_core_files(record)
    if missing_files:
        return f"missing_required_files: {', '.join(missing_files)}"

    # Try the upgrade to see what the actual error is, discarding anything it
    # writes to stdout/stderr so the notebook output stays readable.
    with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
        try:
            Upgrade(record, skip_metadata_validation=False)
        except Exception as e:
            return _classify_upgrade_error(type(e).__name__, str(e))

    return "upgrade_succeeds"  # Shouldn't happen if not in v2

# Classify every asset that is in v1 but not in v2
failed_assets = df[df['in_v1'] & ~df['in_v2']]
failure_reasons = [
    (asset_name, get_failure_reason(asset_name))
    for asset_name in failed_assets['name']
]

# Look reasons up by asset name; rows that are not v1-only get an empty string
reason_dict = dict(failure_reasons)


def _failure_reason_for(row):
    """Return the recorded reason for a v1-only row and '' for any other row."""
    if row['in_v1'] and not row['in_v2']:
        return reason_dict.get(row['name'], '')
    return ''


df['failure_reason'] = df.apply(_failure_reason_for, axis=1)


In [14]:
# Tally the failure reasons over the assets that are in v1 but not in v2
v1_only_mask = df['in_v1'] & ~df['in_v2']
reason_counts = df.loc[v1_only_mask, 'failure_reason'].value_counts()
pd.DataFrame(reason_counts)


Unnamed: 0_level_0,count
failure_reason,Unnamed: 1_level_1
missing_required_files: procedures,642
instrument upgrade error,174
acquisition validation error: missing datetime,3
subject validation error,1
acquisition validation error: instrument ID mismatch,1


In [16]:
# Names of v1-only assets whose upgrade failed on a missing acquisition datetime
v1_only_mask = df['in_v1'] & ~df['in_v2']
has_reason = df['failure_reason'] == "acquisition validation error: missing datetime"
df.loc[v1_only_mask & has_reason, 'name'].tolist()

['behavior_794591_2025-09-16_20-00-01',
 'behavior_794591_2025-09-17_19-29-49',
 'behavior_794591_2025-09-18_20-24-31']

In [18]:
# v1-only assets rejected by subject validation
v1_only_mask = df['in_v1'] & ~df['in_v2']
has_reason = df['failure_reason'] == "subject validation error"
df[v1_only_mask & has_reason]

Unnamed: 0,name,subject_id,in_v1,in_v2,failure_reason
0,behavior_123456_2025-04-15_01-17-37,123456,True,False,subject validation error


In [8]:
# Sessions that could not be upgraded because the procedures file is missing
v1_only_mask = df['in_v1'] & ~df['in_v2']
lacks_procedures = df['failure_reason'] == 'missing_required_files: procedures'
missing_procedures = df[v1_only_mask & lacks_procedures]

# Distinct mice affected, plus the number of sessions per mouse (computed once)
unique_mice = missing_procedures['subject_id'].unique()
sessions_per_mouse = missing_procedures['subject_id'].value_counts()

print(f"Unique mice with missing_required_files: procedures: {len(unique_mice)}")
print("\nSubject IDs:")
for subject_id in sorted(unique_mice):
    count = sessions_per_mouse[subject_id]
    print(f"  {subject_id}: {count} sessions")

Unique mice with missing_required_files: procedures: 11

Subject IDs:
  745302: 81 sessions
  745305: 93 sessions
  754559: 81 sessions
  754560: 67 sessions
  754567: 75 sessions
  754571: 61 sessions
  754572: 17 sessions
  754573: 20 sessions
  754574: 59 sessions
  754575: 70 sessions
  754579: 18 sessions


In [11]:
# v1-only assets whose acquisition references a different instrument ID
v1_only_mask = df['in_v1'] & ~df['in_v2']
has_reason = df['failure_reason'] == "acquisition validation error: instrument ID mismatch"
df[v1_only_mask & has_reason]

Unnamed: 0,name,subject_id,in_v1,in_v2,failure_reason
332,behavior_745306_2024-10-17_13-33-18,745306,True,False,acquisition validation error: instrument ID mi...


In [15]:
# v1-only assets that failed while upgrading the instrument metadata
v1_only_mask = df['in_v1'] & ~df['in_v2']
has_reason = df['failure_reason'] == "instrument upgrade error"
df[v1_only_mask & has_reason]

Unnamed: 0,name,subject_id,in_v1,in_v2,failure_reason
1865,behavior_789908_2025-09-23_20-11-30,789908,True,False,instrument upgrade error
1866,behavior_789908_2025-09-24_22-04-27,789908,True,False,instrument upgrade error
1867,behavior_789908_2025-09-25_22-12-33,789908,True,False,instrument upgrade error
1868,behavior_789908_2025-09-26_20-44-33,789908,True,False,instrument upgrade error
1869,behavior_789908_2025-09-29_21-00-05,789908,True,False,instrument upgrade error
...,...,...,...,...,...
3526,behavior_808729_2025-10-10_16-27-45,808729,True,False,instrument upgrade error
3551,behavior_815102_2025-10-10_20-18-17,815102,True,False,instrument upgrade error
3552,behavior_815102_2025-10-10_21-00-40,815102,True,False,instrument upgrade error
3555,behavior_815103_2025-10-20_22-04-36,815103,True,False,instrument upgrade error
