# Name Resolution Prototype

Test CURIE-to-common-name resolution (a CURIE is a compact URI such as `NCBIGene:7124`) using the **Node Normalizer API** on cached query data.

**Goal:** Validate that we can efficiently resolve human-readable names for all node types (Gene, Chemical, Protein, etc.) and cache them.

**Note:** TCT's `name_resolver.batch_lookup()` is for text/symbol searches, NOT CURIE resolution. We use the Node Normalizer API instead.

In [None]:
import json
import time
import requests
from pathlib import Path
from collections import Counter, defaultdict

# Node Normalizer API endpoint. The batch-lookup cell POSTs the CURIE list to
# this URL as a JSON body of the form {"curies": [...]}.
NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"

## 1. Load Cached Query Data

In [None]:
# Load the most recent cached query (files sorted newest-first by modification time).
cache_dir = Path("../data/cache")
cache_files = sorted(
    cache_dir.glob("tct_results_*.json"),
    key=lambda x: x.stat().st_mtime,
    reverse=True,
)

# Fail with an actionable message instead of a bare IndexError on cache_files[0]
# when the directory is missing or contains no matching files.
if not cache_files:
    raise FileNotFoundError(
        f"No cached queries matching 'tct_results_*.json' found in {cache_dir.resolve()}"
    )

print(f"Found {len(cache_files)} cached queries")
print(f"\nMost recent: {cache_files[0].name}")

# Read with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open(cache_files[0], encoding="utf-8") as f:
    cached_data = json.load(f)

print(f"\nLoaded cache with {len(cached_data['edges'])} edges")
print(f"Query genes: {cached_data['input_genes'][:5]}...")
print(f"Target disease: {cached_data['target_disease']}")

## 2. Extract Unique CURIEs from Edges

In [None]:
# Extract all unique CURIEs (subjects + objects).
# Only non-empty strings are kept: an edge whose subject/object is missing or
# an explicit null would otherwise put None into the set, which breaks the
# ':' membership test below and would also be sent to the API.
unique_curies = set()
for edge in cached_data['edges']:
    for node_id in (edge.get('subject'), edge.get('object')):
        if isinstance(node_id, str) and node_id:
            unique_curies.add(node_id)

# Sort so the list order (and everything printed from it) is reproducible
# between runs; plain set iteration order is not.
unique_curies = sorted(unique_curies)

print(f"Unique CURIEs: {len(unique_curies)}")

# Analyze CURIE prefixes (the text before the first ':') to understand node types
prefixes = Counter(c.partition(':')[0] for c in unique_curies if ':' in c)
print("\nCURIE prefix distribution:")
for prefix, count in prefixes.most_common(15):
    print(f"  {prefix}: {count}")

## 3. Test Name Resolution with Node Normalizer API

In [None]:
# Batch lookup names via Node Normalizer: a single POST carrying every CURIE.
print(f"Looking up names for {len(unique_curies)} CURIEs via Node Normalizer...")

# perf_counter() is a monotonic clock intended for measuring intervals, so the
# elapsed time cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
response = requests.post(NODE_NORMALIZER_URL, json={"curies": unique_curies}, timeout=30)
elapsed = time.perf_counter() - start_time

print(f"Lookup completed in {elapsed:.2f} seconds")
print(f"Response status: {response.status_code}")

# Stop here on a 4xx/5xx response. Otherwise an error body would be parsed as
# if it were results and every CURIE would silently fall back to itself.
response.raise_for_status()
data = response.json()

In [None]:
# Build curie_to_name mapping: CURIE -> label from the response, or the CURIE
# itself when no label is available.
curie_to_name = {}
resolved_count = 0
fallback_count = 0

for curie in unique_curies:
    result = data.get(curie)
    # The `or {}` guards cover a missing/null entry for the CURIE as well as
    # an entry whose 'id' key is present but null (where .get('id', {}) would
    # return None and the chained .get would raise AttributeError).
    label = ((result or {}).get('id') or {}).get('label')
    if label:
        curie_to_name[curie] = label
        resolved_count += 1
    else:
        curie_to_name[curie] = curie  # Fallback to CURIE itself
        fallback_count += 1

# Use a denominator of at least 1 so the percentages print as 0.0% instead of
# raising ZeroDivisionError when there are no CURIEs at all.
denominator = max(len(unique_curies), 1)

print("\nResolution results:")
print(f"  Resolved to name: {resolved_count} ({100*resolved_count/denominator:.1f}%)")
print(f"  Fallback to CURIE: {fallback_count} ({100*fallback_count/denominator:.1f}%)")

## 4. Inspect Results by CURIE Type

In [None]:
# Group every (CURIE, name) pair under its CURIE prefix, then print up to five
# examples for a fixed list of prefixes.
by_prefix = defaultdict(list)
for curie_id, label in curie_to_name.items():
    head, colon, _rest = curie_id.partition(':')
    group_key = head if colon else 'unknown'  # no ':' means no prefix
    by_prefix[group_key].append((curie_id, label))

print("Sample resolutions by type:")
print("=" * 60)
for group_key in ('NCBIGene', 'CHEBI', 'UniProtKB', 'MONDO', 'GO', 'HP', 'HGNC'):
    if group_key not in by_prefix:
        continue
    print(f"\n{group_key}:")
    for curie_id, label in by_prefix[group_key][:5]:
        # A name equal to its CURIE means the lookup fell back to the CURIE.
        if curie_id == label:
            marker = " (fallback)"
        else:
            marker = ""
        print(f"  {curie_id} -> {label}{marker}")

In [None]:
# Resolution rate for the ten prefixes with the most CURIEs (largest first).
print("\nResolution rate by prefix:")
print("=" * 40)
largest_groups = sorted(by_prefix.items(), key=lambda kv: len(kv[1]), reverse=True)[:10]
for group_key, pairs in largest_groups:
    group_size = len(pairs)
    # A pair counts as resolved when its name differs from its CURIE.
    named = sum(curie_id != label for curie_id, label in pairs)
    if group_size > 0:
        pct = 100 * named / group_size
    else:
        pct = 0
    print(f"  {group_key}: {named}/{group_size} ({pct:.0f}%)")

## 5. Measure Storage Impact

In [None]:
# Estimate how much the curie_to_name mapping would add to the cache.

# Length of the mapping's JSON text, expressed in KB.
name_mapping_json = json.dumps(curie_to_name)
name_mapping_size_kb = len(name_mapping_json) / 1024

# On-disk size of the most recent cache file, expressed in KB.
newest_cache_stat = cache_files[0].stat()
original_size_kb = newest_cache_stat.st_size / 1024

print("Storage analysis:")
print(f"  Original cache size: {original_size_kb:.1f} KB")
print(f"  curie_to_name size: {name_mapping_size_kb:.1f} KB")
# Mapping size as a percentage of the existing cache file.
increase_pct = 100 * name_mapping_size_kb / original_size_kb
print(f"  Increase: {increase_pct:.1f}%")

## 6. Create Helper Function for Implementation

In [None]:
def resolve_curie_names(curies: list, timeout: int = 30) -> dict:
    """
    Resolve CURIEs to human-readable names using Node Normalizer API.

    All CURIEs are sent in a single POST request and each entry's
    ``id.label`` is used as the name. Resolution is best-effort: a failed
    request or an unusable response produces an identity mapping instead of
    raising.

    Args:
        curies: List of CURIE strings (e.g., ['NCBIGene:7124', 'CHEBI:15377'])
        timeout: Request timeout in seconds

    Returns:
        Dict mapping CURIE to name. Falls back to CURIE itself if not found
        or if the lookup fails. Empty input returns an empty dict without
        contacting the API.
    """
    NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"

    # Materialize once: the input is walked more than once below, which would
    # exhaust a generator after the first pass.
    curies = list(curies or ())
    if not curies:
        # Nothing to resolve, so skip the network round trip entirely.
        return {}

    try:
        response = requests.post(
            NODE_NORMALIZER_URL,
            json={"curies": curies},
            timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that is not valid JSON.
        print(f"Name resolution failed: {e}")
        # Fallback: use CURIEs as names
        return {curie: curie for curie in curies}

    if not isinstance(data, dict):
        # Valid JSON that is not an object cannot be indexed by CURIE.
        print(f"Name resolution failed: unexpected response type {type(data).__name__}")
        return {curie: curie for curie in curies}

    curie_to_name = {}
    for curie in curies:
        result = data.get(curie)
        # Check each level separately so one malformed or null entry (a
        # missing CURIE, or an 'id' that is null) only affects that CURIE and
        # does not discard the names already resolved.
        id_info = result.get('id') if isinstance(result, dict) else None
        label = id_info.get('label') if isinstance(id_info, dict) else None
        curie_to_name[curie] = label or curie  # Fallback to the CURIE itself

    return curie_to_name

# Smoke-test the helper on three sample CURIEs and print each mapping.
sample_curies = ['NCBIGene:7124', 'CHEBI:15377', 'MONDO:0004975']
test_result = resolve_curie_names(sample_curies)
print("Test resolve_curie_names():")
for curie_id, resolved_name in test_result.items():
    print(f"  {curie_id} -> {resolved_name}")

## 7. Summary

In [None]:
# Final recap of the run, built from values computed in the earlier cells.
banner = "=" * 60
print(banner)
print("NAME RESOLUTION PROTOTYPE SUMMARY")
print(banner)

# What was analysed.
print(f"\nCache file: {cache_files[0].name}")
print(f"Edges: {len(cached_data['edges'])}")
print(f"Unique CURIEs: {len(unique_curies)}")

# How the lookup performed.
print("\nResolution:")
print("  - API: Node Normalizer (https://nodenormalization-sri.renci.org)")
print(f"  - Time: {elapsed:.2f} seconds")
success_pct = 100 * resolved_count / len(unique_curies)
print(f"  - Success rate: {success_pct:.1f}%")
overhead_pct = 100 * name_mapping_size_kb / original_size_kb
print(f"  - Storage overhead: {name_mapping_size_kb:.1f} KB ({overhead_pct:.1f}% of cache)")

# Conclusion and follow-up work.
print("\nConclusion: Ready to implement in production code!")
print("\nNext steps:")
for step in (
    "  1. Add resolve_curie_names() to trapi_client.py",
    "  2. Call after post-filtering, before caching",
    "  3. Store curie_to_name in response metadata",
):
    print(step)