In [17]:
#!/usr/bin/env python3
"""
Specimen Procedures Data Quality Validation

This notebook validates the quality of specimen procedures data by comparing:
1. Database records vs Excel spreadsheet data
2. Specimen ID correctness
3. Lot number accuracy
4. Collection date validation

For each of the 31 PATCHSEQ mice, we'll check:
- specimen_procedures_in_db: Are specimen procedures populated?
- specimen_id_correct: Do specimen IDs match subject IDs?
- lot_numbers_correct: Do lot numbers match the Excel spreadsheet?
"""

import pandas as pd
import json
import os
from datetime import datetime
from typing import Dict, List, Any, Tuple
import warnings
warnings.filterwarnings('ignore')
import requests

# Import functions from existing modules
from excel_loader import load_specimen_procedures_excel, get_specimen_procedures_for_subject

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Prevent text truncation in cells
pd.set_option('display.max_colwidth', None)

# Also useful for preventing row/column truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# If you want to see the full content but with some reasonable limits
pd.set_option('display.max_colwidth', 1000)  # Show up to 1000 characters per cell


In [16]:

# PATCHSEQ Subject IDs
PATCHSEQ_SUBJECT_IDS = [
    "716946", "716947", "716948", "716949", "716950", "716951",
    "725231", "725328", "725329", "728854", "731907", "737038",
    "737042", "744117", "744119", "746041", "746043", "746045",
    "746046", "751017", "751019", "751023", "751024", "751035",
    "755069", "755071", "755072", "755073", "755790", "762196", "762199"
]

# Load the Excel spreadsheet data
EXCEL_FILE = "DT_HM_TissueClearingTracking_.xlsx"
print("Loading Excel spreadsheet data...")
sheet_data = load_specimen_procedures_excel(EXCEL_FILE)


Loading Excel spreadsheet data...


In [26]:
def query_smartspim_assets() -> List[Dict[str, Any]]:
    """
    Query the AIND metadata database for all SmartSPIM assets for PATCHSEQ subjects.
    
    Returns:
        List of asset dictionaries with metadata
    """
    
    # Use the find endpoint with filters and projection
    url = "https://api.allenneuraldynamics.org/v1/metadata_index/data_assets/find"
    
    # Query parameters
    params = {
        "filter": json.dumps({
            "subject.subject_id": {"$in": PATCHSEQ_SUBJECT_IDS},
            "data_description.modality": {
                "$elemMatch": {
                    "abbreviation": "SPIM"
                }
            }
        }),
        "projection": json.dumps({
            "_id": 0,
            "procedures": 1,
            "subject.subject_id": 1,
            "data_description.name": 1,
            "data_description.data_level": 1
        }),
        "limit": 100  # Increase if needed
    }
    
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        assets = response.json()
        
        # Transform the response to match expected format
        transformed_assets = []
        for asset in assets:
            transformed_asset = {
                "subject_id": asset.get("subject", {}).get("subject_id"),
                "asset_name": asset.get("data_description", {}).get("name"),
                "data_level": asset.get("data_description", {}).get("data_level"),
                "procedures": asset.get("procedures", {})
            }
            transformed_assets.append(transformed_asset)
        
        return transformed_assets
        
    except requests.exceptions.RequestException as e:
        return []

def check_specimen_procedures_in_asset(procedures: Dict[str, Any]) -> bool:
    """
    Check if specimen procedures exist in the asset procedures.
    
    Args:
        procedures: The procedures dictionary from the asset
        
    Returns:
        bool: True if specimen procedures exist and are populated
    """
    if not procedures:
        return False
    
    if 'specimen_procedures' not in procedures:
        return False
    
    specimen_procedures = procedures['specimen_procedures']
    return isinstance(specimen_procedures, list) and len(specimen_procedures) > 0

def check_specimen_id_correctness_in_asset(subject_id: str, procedures: Dict[str, Any]) -> bool:
    """
    Check if all specimen IDs in procedures match the subject ID.
    
    Args:
        subject_id: The subject ID
        procedures: The procedures dictionary from the asset
        
    Returns:
        bool: True if all specimen IDs match, False otherwise
    """
    if not check_specimen_procedures_in_asset(procedures):
        return False
    
    try:
        specimen_procedures = procedures.get('specimen_procedures', [])
        
        for proc in specimen_procedures:
            proc_specimen_id = proc.get('specimen_id')
            if proc_specimen_id != subject_id:
                return False
        
        return True
        
    except Exception as e:
        print(f"Error checking specimen ID correctness for {subject_id}: {e}")
        return False

def extract_lot_numbers_from_asset_procedures(procedures: Dict[str, Any]) -> List[Tuple[str, str]]:
    """
    Extract lot numbers from asset procedures.
    
    Args:
        procedures: The procedures dictionary from the asset
        
    Returns:
        List of tuples: (reagent_name, lot_number)
    """
    lot_numbers = []
    
    try:
        specimen_procedures = procedures.get('specimen_procedures', [])
        
        for proc in specimen_procedures:
            # Check for 'reagents' field instead of 'procedure_details'
            reagents = proc.get('reagents', [])
            
            for reagent in reagents:
                if isinstance(reagent, dict):
                    # Handle dict format
                    if 'name' in reagent and 'lot_number' in reagent:
                        lot_numbers.append((reagent['name'], reagent['lot_number']))
                    # Also check for alternative field names
                    elif 'material_type' in reagent and 'lot_number' in reagent:
                        lot_numbers.append((reagent['material_type'], reagent['lot_number']))
                else:
                    # Handle object format (if it has attributes)
                    try:
                        if hasattr(reagent, 'name') and hasattr(reagent, 'lot_number'):
                            lot_numbers.append((reagent.name, reagent.lot_number))
                        elif hasattr(reagent, 'material_type') and hasattr(reagent, 'lot_number'):
                            lot_numbers.append((reagent.material_type, reagent.lot_number))
                    except:
                        continue
        
        return lot_numbers
        
    except Exception as e:
        print(f"Error extracting lot numbers: {e}")
        return []

def check_lot_numbers_correct_for_asset(subject_id: str, procedures: Dict[str, Any], sheet_data: Dict) -> bool:
    """
    Check if lot numbers in asset procedures match those from Excel spreadsheet.
    
    Args:
        subject_id: The subject ID
        procedures: The procedures dictionary from the asset
        sheet_data: Dictionary of DataFrames from load_specimen_procedures_excel()
        
    Returns:
        bool: True if lot numbers match, False otherwise
    """
    if not check_specimen_procedures_in_asset(procedures):
        return False
    
    try:
        db_lot_numbers = extract_lot_numbers_from_asset_procedures(procedures)
        excel_lot_numbers = extract_lot_numbers_from_excel(subject_id, sheet_data)
        
        if not db_lot_numbers and not excel_lot_numbers:
            return True  # Both empty, consider correct
        
        if not db_lot_numbers or not excel_lot_numbers:
            return False
        
        # Convert to sets for comparison (order doesn't matter)
        db_set = set(db_lot_numbers)
        excel_set = set(excel_lot_numbers)
        
        if db_set != excel_set:
            return False
        
        return True
        
    except Exception as e:
        print(f"Error checking lot numbers for {subject_id}: {e}")
        return False

def validate_all_assets() -> pd.DataFrame:
    """
    Validate all SmartSPIM assets and return a summary DataFrame.
    
    Returns:
        pd.DataFrame: Validation results for all assets
    """
    
    # First get all SmartSPIM assets
    assets = query_smartspim_assets()
    
    if not assets:
        print("No assets found or query failed.")
        return pd.DataFrame()
    
    
    results = []
    
    for asset in assets:
        subject_id = asset['subject_id']
        asset_name = asset['asset_name']
        data_level = asset['data_level']
        procedures = asset['procedures']
        
        # Get collection date from Excel
        collection_date = get_collection_date_from_excel(subject_id, sheet_data)
        
        # Check if specimen procedures exist in this asset
        specimen_procedures_in_db = check_specimen_procedures_in_asset(procedures)
        
        # Check specimen ID correctness for this asset
        specimen_id_correct = check_specimen_id_correctness_in_asset(subject_id, procedures)
        
        # Check lot numbers correctness for this asset
        lot_numbers_correct = check_lot_numbers_correct_for_asset(subject_id, procedures, sheet_data)
        
        result = {
            'subject_id': subject_id,
            'collection_date': collection_date,
            'asset_name': asset_name,
            'data_level': data_level,
            'specimen_procedures_in_db': specimen_procedures_in_db,
            'specimen_id_correct': specimen_id_correct,
            'lot_numbers_correct': lot_numbers_correct
        }
        
        results.append(result)
    
    df = pd.DataFrame(results)
    
    return df

In [27]:
validation_df = validate_all_assets().sort_values(by='data_level', ascending=False).sort_values(by='subject_id', ascending=True)
validation_df

Unnamed: 0,subject_id,collection_date,asset_name,data_level,specimen_procedures_in_db,specimen_id_correct,lot_numbers_correct
58,716946,2024-04-19,SmartSPIM_716946_2024-06-17_10-51-38,raw,True,False,False
50,716946,2024-04-19,SmartSPIM_716946_2024-06-17_10-51-38_stitched_2024-06-19_05-06-56,derived,True,False,False
60,716947,2024-04-19,SmartSPIM_716947_2024-06-17_16-49-13,raw,True,False,False
59,716947,2024-04-19,SmartSPIM_716947_2024-06-17_16-49-13_stitched_2024-06-19_11-21-35,derived,True,False,False
61,716948,2024-04-01,SmartSPIM_716948_2024-05-24_15-47-29,raw,True,False,False
62,716948,2024-04-01,SmartSPIM_716948_2024-05-24_15-47-29_stitched_2024-05-26_09-48-20,derived,True,False,False
55,716949,2024-03-22,SmartSPIM_716949_2024-04-23_12-53-35_stitched_2024-04-24_15-46-00,derived,True,False,False
4,716949,2024-03-22,SmartSPIM_716949_2024-04-23_12-53-35,raw,True,False,False
56,716949,2024-03-22,SmartSPIM_716949_2024-08-13_14-08-24,raw,True,False,False
5,716949,2024-03-22,SmartSPIM_716949_2024-08-13_14-08-24_stitched_2024-12-10_19-08-04,derived,True,False,False


In [30]:
def comprehensive_debug_asset(subject_id: str, asset_name: str):
    """
    Comprehensive debugging for a specific asset to see exactly what's happening.
    """
    print(f"\n{'='*80}")
    print(f"COMPREHENSIVE DEBUG FOR: {subject_id} - {asset_name}")
    print(f"{'='*80}")
    
    # Get the asset data
    assets = query_smartspim_assets()
    asset = None
    for a in assets:
        if a['subject_id'] == subject_id and a['asset_name'] == asset_name:
            asset = a
            break
    
    if not asset:
        print("Asset not found!")
        return
    
    procedures = asset['procedures']
    
    # 1. Check if specimen procedures exist
    has_specimen = check_specimen_procedures_in_asset(procedures)
    print(f"\n1. SPECIMEN PROCEDURES EXIST: {has_specimen}")
    
    if not has_specimen:
        print("   No specimen procedures - cannot proceed with lot number check")
        return
    
    # 2. Show specimen procedures structure
    specimen_procedures = procedures.get('specimen_procedures', [])
    print(f"\n2. SPECIMEN PROCEDURES COUNT: {len(specimen_procedures)}")
    
    for i, proc in enumerate(specimen_procedures):
        print(f"\n   Procedure {i+1}:")
        print(f"     Type: {proc.get('procedure_type', 'N/A')}")
        print(f"     Name: {proc.get('procedure_name', 'N/A')}")
        print(f"     Keys: {list(proc.keys())}")
        
        # Check for reagents
        reagents = proc.get('reagents', [])
        print(f"     Reagents count: {len(reagents)}")
        
        for j, reagent in enumerate(reagents):
            print(f"       Reagent {j+1}:")
            if isinstance(reagent, dict):
                print(f"         Type: {type(reagent)}")
                print(f"         Keys: {list(reagent.keys())}")
                print(f"         Content: {reagent}")
            else:
                print(f"         Type: {type(reagent)}")
                print(f"         Attributes: {dir(reagent)}")
    
    # 3. Extract lot numbers from DB
    print(f"\n3. EXTRACTING LOT NUMBERS FROM DB:")
    db_lot_numbers = extract_lot_numbers_from_asset_procedures(procedures)
    print(f"   Found {len(db_lot_numbers)} lot numbers:")
    for name, lot in db_lot_numbers:
        print(f"     {name}: {lot}")
    
    # 4. Extract lot numbers from Excel
    print(f"\n4. EXTRACTING LOT NUMBERS FROM EXCEL:")
    excel_lot_numbers = extract_lot_numbers_from_excel(subject_id, sheet_data)
    print(f"   Found {len(excel_lot_numbers)} lot numbers:")
    for name, lot in excel_lot_numbers:
        print(f"     {name}: {lot}")
    
    # 5. Detailed comparison
    print(f"\n5. DETAILED COMPARISON:")
    db_set = set(db_lot_numbers)
    excel_set = set(excel_lot_numbers)
    
    print(f"   DB set: {db_set}")
    print(f"   Excel set: {excel_set}")
    print(f"   Sets equal: {db_set == excel_set}")
    
    if db_set != excel_set:
        only_in_db = db_set - excel_set
        only_in_excel = excel_set - db_set
        
        if only_in_db:
            print(f"   Only in DB: {only_in_db}")
        if only_in_excel:
            print(f"   Only in Excel: {only_in_excel}")
        
        # Check for name mismatches
        print(f"\n   Checking for name mismatches:")
        db_names = {name for name, _ in db_lot_numbers}
        excel_names = {name for name, _ in excel_lot_numbers}
        
        print(f"     DB reagent names: {db_names}")
        print(f"     Excel reagent names: {excel_names}")
        
        name_mismatches = db_names.symmetric_difference(excel_names)
        if name_mismatches:
            print(f"     Name mismatches: {name_mismatches}")
    
    # 6. Test the actual check function
    print(f"\n6. FINAL CHECK FUNCTION RESULT:")
    result = check_lot_numbers_correct_for_asset(subject_id, procedures, sheet_data)
    print(f"   lot_numbers_correct: {result}")
    
    # 7. Show raw data for manual inspection
    print(f"\n7. RAW PROCEDURES DATA (first 1000 chars):")
    raw_data = str(procedures)
    print(raw_data[:1000])
    if len(raw_data) > 1000:
        print(f"   ... (truncated, total length: {len(raw_data)})")

In [31]:
comprehensive_debug_asset("716946", "SmartSPIM_716946_2024-06-17_10-51-38")


COMPREHENSIVE DEBUG FOR: 716946 - SmartSPIM_716946_2024-06-17_10-51-38

1. SPECIMEN PROCEDURES EXIST: True

2. SPECIMEN PROCEDURES COUNT: 6

   Procedure 1:
     Type: Fixation
     Name: SHIELD OFF
     Keys: ['procedure_type', 'procedure_name', 'specimen_id', 'start_date', 'end_date', 'experimenter_full_name', 'protocol_id', 'reagents', 'hcr_series', 'antibodies', 'sectioning', 'notes', 'subject_id']
     Reagents count: 2
       Reagent 1:
         Type: <class 'dict'>
         Keys: ['name', 'source', 'rrid', 'lot_number', 'expiration_date']
         Content: {'name': 'SHIELD Epoxy', 'source': {'name': 'LifeCanvas', 'abbreviation': None, 'registry': None, 'registry_identifier': None}, 'rrid': None, 'lot_number': 'ex35', 'expiration_date': None}
       Reagent 2:
         Type: <class 'dict'>
         Keys: ['name', 'source', 'rrid', 'lot_number', 'expiration_date']
         Content: {'name': 'SHIELD Buffer', 'source': {'name': 'LifeCanvas', 'abbreviation': None, 'registry': None, 

In [None]:
df = analyze_smartspim_assets()

Querying AIND metadata database for SmartSPIM assets...
Found 66 SmartSPIM assets. Analyzing procedures...
Analysis complete. Found 66 total assets.
Subjects with SmartSPIM data: 31
Total subjects in PATCHSEQ list: 31


In [None]:
df = analyze_smartspim_assets()

Querying AIND metadata database for SmartSPIM assets...
Found 66 SmartSPIM assets. Analyzing procedures...
Analysis complete. Found 66 total assets.
Subjects with SmartSPIM data: 31
Total subjects in PATCHSEQ list: 31


In [None]:
df = analyze_smartspim_assets()

Querying AIND metadata database for SmartSPIM assets...
Found 66 SmartSPIM assets. Analyzing procedures...
Analysis complete. Found 66 total assets.
Subjects with SmartSPIM data: 31
Total subjects in PATCHSEQ list: 31


In [None]:
df = analyze_smartspim_assets()

Querying AIND metadata database for SmartSPIM assets...
Found 66 SmartSPIM assets. Analyzing procedures...
Analysis complete. Found 66 total assets.
Subjects with SmartSPIM data: 31
Total subjects in PATCHSEQ list: 31
