# HuBMAP Manifest Creation

This notebook creates a manifest file for downloading HuBMAP CODEX datasets.

**Process:**
1. Load CSV with HuBMAP IDs
2. Fetch UUIDs for descendant datasets
3. Check which files are available on HuBMAP assets server
4. Generate manifest file for download

**Usage:**
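- Update the CSV file path (`csv_path`) in the "Load CSV file with HuBMAP IDs" cell
- Update the output directory (`out_dir`) in the "Configure output directory for manifest file" cell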
- Run all cells sequentially

In [1]:
# Import required libraries
import numpy as np 
import pandas as pd 
import os
import requests
import re
from tqdm import tqdm
import warnings

# Suppress every warning category for the rest of the session.
# NOTE: this also hides deprecation/runtime warnings raised by any imported library.
warnings.filterwarnings("ignore")

In [3]:
# Load CSV file with HuBMAP IDs
# CSV should have columns: 'Input_HuBMAP_ID' and 'Found_CODEX_IDs'
csv_path = '/u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data/descendant_hubmapID_spleen.csv'
df = pd.read_csv(csv_path)

# Fail fast with a clear message: the UUID lookup and manifest cells below
# read 'Found_CODEX_IDs', so a wrong/renamed column should be reported here
# rather than as a bare KeyError later on.
if 'Found_CODEX_IDs' not in df.columns:
    raise ValueError(
        f"{csv_path} has no 'Found_CODEX_IDs' column; found columns: {list(df.columns)}"
    )

print(f"Loaded {len(df)} records from {csv_path}")

Loaded 24 records from /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data/descendant_hubmapID_spleen.csv


In [4]:
# Inspect data structure: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Input_HuBMAP_ID  24 non-null     object
 1   Found_CODEX_IDs  24 non-null     object
dtypes: object(2)
memory usage: 516.0+ bytes


In [5]:
# Preview data: first 10 rows of the loaded ID table
df.head(10)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM244.TJLK.223,HBM339.LBCC.963
1,HBM267.BZKT.867,HBM946.WMTC.283
2,HBM337.FSXL.564,HBM279.RTXC.523
3,HBM342.FSLD.938,HBM496.ZJFC.554
4,HBM355.JDLK.244,HBM626.KXRZ.238
5,HBM374.LLKS.325,HBM443.TZCQ.232
6,HBM389.PKHL.936,HBM863.FDNH.844
7,HBM427.SMGB.866,HBM898.LWCS.878
8,HBM432.LLCF.677,HBM573.GQRD.788
9,HBM498.TCSV.345,HBM455.PWQW.883


In [6]:
# Fetch UUIDs for descendant CODEX datasets
# The Found_CODEX_IDs contain the actual datasets with the files we want to download

def get_last_uuid(hubmap_id):
    """Fetch UUID for a HuBMAP ID from the API.

    The entity endpoint is queried for ``hubmap_id`` and the response text is
    scanned for every ``"uuid": "<value>"`` pair; the last match is returned.

    Args:
        hubmap_id: HuBMAP ID string (for example ``"HBM339.LBCC.963"``).
            Surrounding whitespace is ignored.

    Returns:
        The last UUID string found in the response, or ``None`` when the ID is
        not a non-blank string, the request raises, the server answers with a
        status other than 200, or the response contains no UUID. Every
        ``None`` caused by a bad ID, an error or a non-200 status is reported
        on stdout so missing UUIDs can be diagnosed.
    """
    # A NaN/None/blank cell would otherwise be requested as ".../entities/nan".
    if not isinstance(hubmap_id, str) or not hubmap_id.strip():
        print(f"  Skipping invalid HuBMAP ID: {hubmap_id!r}")
        return None
    hubmap_id = hubmap_id.strip()

    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = f"{base_url}{hubmap_id}"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            # Previously silent: surface the status so failures are traceable.
            print(f"  HTTP {r.status_code} for {hubmap_id}; no UUID retrieved")
            return None
        uuids = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', r.text)
        return uuids[-1] if uuids else None
    except Exception as e:
        # Best-effort lookup: one failing ID must not abort the whole batch.
        print(f"  Error fetching UUID for {hubmap_id}: {e}")
        return None

print("Fetching UUIDs from HuBMAP API...")

# Resolve each descendant CODEX ID in turn, keeping the results in row order
fetched_uuids = []
for codex_id in tqdm(df['Found_CODEX_IDs'], desc="Fetching UUIDs"):
    fetched_uuids.append(get_last_uuid(codex_id))
df['last_uuid'] = fetched_uuids

# Report how many lookups produced a UUID
has_uuid = df['last_uuid'].notna()
print(f"\n✓ UUID mapping complete")
print(f"  Total IDs: {len(df)}")
print(f"  With UUIDs: {has_uuid.sum()}")
print(f"  Missing UUIDs: {(~has_uuid).sum()}")

Fetching UUIDs from HuBMAP API...


Fetching UUIDs:   0%|          | 0/24 [00:00<?, ?it/s]

Fetching UUIDs: 100%|██████████| 24/24 [00:27<00:00,  1.15s/it]


✓ UUID mapping complete
  Total IDs: 24
  With UUIDs: 24
  Missing UUIDs: 0





In [7]:
# Preview up to 23 rows of the table, which now carries the fetched 'last_uuid' column
df.head(23)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs,last_uuid
0,HBM244.TJLK.223,HBM339.LBCC.963,d2f39eb7da856f5ff344c2e907611b76
1,HBM267.BZKT.867,HBM946.WMTC.283,b6e00c907020456af2942d6c1576b100
2,HBM337.FSXL.564,HBM279.RTXC.523,586b77e11e6183de4363fe7a9385282f
3,HBM342.FSLD.938,HBM496.ZJFC.554,c4b216fbc950f8cdda0d261e585a2f3c
4,HBM355.JDLK.244,HBM626.KXRZ.238,0008a49ac06f4afd886be81491a5a926
5,HBM374.LLKS.325,HBM443.TZCQ.232,68c78cbcbcd6d4d7aec0b9792f4cceee
6,HBM389.PKHL.936,HBM863.FDNH.844,79ec9402362b276b9e12c0f596dbac4f
7,HBM427.SMGB.866,HBM898.LWCS.878,19184e64b152cd9977f56785da9495fd
8,HBM432.LLCF.677,HBM573.GQRD.788,730f81640c6573165c6bab8f82356b34
9,HBM498.TCSV.345,HBM455.PWQW.883,29b597eb148281332c8512b1ef38d580


In [8]:
# Directory that will receive the manifest file; it is created when absent
out_dir = "/u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data"

if os.path.exists(out_dir):
    print(f"✓ Directory exists: {out_dir}")
else:
    print(f"Creating directory: {out_dir}")
    os.makedirs(out_dir, exist_ok=True)

✓ Directory exists: /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data


In [9]:
# File locations (relative to a dataset's root) and the assets server address

# Two candidate locations for the expression image; both are tried per dataset
primary_expr_path, alternative_expr_path = (
    "pipeline_output/expr/reg001_expr.ome.tiff",
    "stitched/expressions/reg1_stitched_expressions.ome.tiff",
)
config_path = "pipelineConfig.json"

# Root URL of the HuBMAP assets server
ASSETS_BASE_URL = "https://assets.hubmapconsortium.org"

# Echo the settings, one per line
print(
    "Configuration:",
    f"  Primary path: {primary_expr_path}",
    f"  Alternative path: {alternative_expr_path}",
    f"  Config path: {config_path}",
    sep="\n",
)

def check_url_exists(url, timeout=10):
    """
    Check if a file exists on HuBMAP assets server.

    Uses a GET request with a byte range (``Range: bytes=0-1``) and
    ``stream=True`` to avoid downloading large files. The response is always
    closed before returning so the underlying connection is released.

    Args:
        url: Full URL of the file to probe.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the server answers with status 200, 206 or 416; False for any
        other status or if the request raises.
    """
    try:
        response = requests.get(
            url,
            timeout=timeout,
            stream=True,
            headers={'Range': 'bytes=0-1'},
            allow_redirects=True
        )
    except Exception:
        # Best-effort probe: any request failure is reported as "not available"
        return False

    try:
        # 200 = OK, 206 = Partial Content, 416 = Range Not Satisfiable (file exists)
        return response.status_code in (200, 206, 416)
    finally:
        # With stream=True the body is never read, so close explicitly;
        # otherwise the connection stays checked out for every probed URL.
        response.close()

print("✓ Configuration complete")

Configuration:
  Primary path: pipeline_output/expr/reg001_expr.ome.tiff
  Alternative path: stitched/expressions/reg1_stitched_expressions.ome.tiff
  Config path: pipelineConfig.json
✓ Configuration complete


In [10]:
# Probe the HuBMAP assets server for every dataset and collect manifest entries

manifest_lines = []
found_count = 0
not_found_count = 0
missing_uuid_count = 0

print(f"Checking file availability for {len(df)} datasets...")
print("-" * 60)

for idx, row in df.iterrows():
    hubmap_id, uuid = row['Found_CODEX_IDs'], row['last_uuid']

    print(f"\n[{idx+1}/{len(df)}] {hubmap_id}")

    # No UUID means no asset URL can be built for this dataset
    if pd.isna(uuid):
        print(f"  ✗ No UUID found")
        missing_uuid_count += 1
        continue

    print(f"  UUID: {uuid}")

    # Candidate URLs for the expression image
    primary_url = f"{ASSETS_BASE_URL}/{uuid}/{primary_expr_path}"
    alternative_url = f"{ASSETS_BASE_URL}/{uuid}/{alternative_expr_path}"

    # Stays None unless one of the two candidates is reachable
    expr_file_to_use = None

    print(f"  Checking primary path...", end=" ")
    primary_found = check_url_exists(primary_url)
    print(f"✓ Found" if primary_found else f"✗ Not found")

    if primary_found:
        expr_file_to_use = primary_expr_path
    else:
        # The alternative location is only probed when the primary one is absent
        print(f"  Checking alternative path...", end=" ")
        alternative_found = check_url_exists(alternative_url)
        print(f"✓ Found" if alternative_found else f"✗ Not found")
        if alternative_found:
            expr_file_to_use = alternative_expr_path

    if expr_file_to_use is None:
        not_found_count += 1
        continue

    # Manifest entries are keyed by HuBMAP ID (not UUID): expression file, then config
    manifest_lines.extend([
        f"{hubmap_id} /{expr_file_to_use}",
        f"{hubmap_id} /{config_path}",
    ])
    found_count += 1

# Print summary
summary_rule = "=" * 60
print("\n" + summary_rule)
print(f"Summary:")
print(f"  ✓ {found_count} datasets with files available")
print(f"  ✗ {not_found_count} datasets with missing files")
print(f"  ✗ {missing_uuid_count} datasets with missing UUIDs")
print(f"  Total manifest lines: {len(manifest_lines)}")
print(summary_rule)

Checking file availability for 24 datasets...
------------------------------------------------------------

[1/24] HBM339.LBCC.963
  UUID: d2f39eb7da856f5ff344c2e907611b76
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[2/24] HBM946.WMTC.283
  UUID: b6e00c907020456af2942d6c1576b100
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[3/24] HBM279.RTXC.523
  UUID: 586b77e11e6183de4363fe7a9385282f
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[4/24] HBM496.ZJFC.554
  UUID: c4b216fbc950f8cdda0d261e585a2f3c
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[5/24] HBM626.KXRZ.238
  UUID: 0008a49ac06f4afd886be81491a5a926
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[6/24] HBM443.TZCQ.232
  UUID: 68c78cbcbcd6d4d7aec0b9792f4cceee
  Checking primary path... ✓ Found

[7/24] HBM863.FDNH.844
  UUID: 79ec9402362b276b9e12c0f596dbac4f
  Checking pr

In [None]:
# Save manifest to file
manifest_path = os.path.join(out_dir, "manifest_spleen.txt")

# Write one entry per line, terminating every line (including the last) with a
# newline so line-oriented tools do not drop or mis-read the final entry.
# The encoding is explicit so the file does not depend on the platform default.
with open(manifest_path, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in manifest_lines)

print(f"✓ Manifest saved to: {manifest_path}")
print(f"  Total lines: {len(manifest_lines)}")
print(f"\nTo download files, run:")
print(f"  hubmap-clt transfer {manifest_path} -d /path/to/destination/")

✓ Manifest saved to: /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data/manifest.txt
  Total lines: 48

To download files, run:
  hubmap-clt transfer /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/spleen-codex-ufl/data/manifest.txt -d /path/to/destination/


In [12]:
# Show the first 20 manifest entries, numbered, between two rules
preview_rule = "=" * 70
print(preview_rule)
print("MANIFEST PREVIEW (first 20 lines):")
print(preview_rule)

# Read the whole manifest back from disk
with open(manifest_path, "r") as f:
    lines = f.readlines()

for i, line in enumerate(lines[:20], 1):
    print(f"{i:3}. {line.strip()}")

# Mention how many entries were left out of the preview
if len(lines) > 20:
    print(f"\n... and {len(lines) - 20} more lines")

print(preview_rule)

MANIFEST PREVIEW (first 20 lines):
  1. HBM339.LBCC.963 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  2. HBM339.LBCC.963 /pipelineConfig.json
  3. HBM946.WMTC.283 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  4. HBM946.WMTC.283 /pipelineConfig.json
  5. HBM279.RTXC.523 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  6. HBM279.RTXC.523 /pipelineConfig.json
  7. HBM496.ZJFC.554 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  8. HBM496.ZJFC.554 /pipelineConfig.json
  9. HBM626.KXRZ.238 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 10. HBM626.KXRZ.238 /pipelineConfig.json
 11. HBM443.TZCQ.232 /pipeline_output/expr/reg001_expr.ome.tiff
 12. HBM443.TZCQ.232 /pipelineConfig.json
 13. HBM863.FDNH.844 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 14. HBM863.FDNH.844 /pipelineConfig.json
 15. HBM898.LWCS.878 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 16. HBM898.LWCS.878 /pipelineConfig.json
 17. HBM573.GQRD.78