# HuBMAP Manifest Creation

This notebook creates a manifest file for downloading HuBMAP CODEX datasets.

**Process:**
1. Load CSV with HuBMAP IDs
2. Fetch UUIDs for descendant datasets
3. Check which files are available on HuBMAP assets server
4. Generate manifest file for download

**Usage:**
- Update `csv_path` in the "Load CSV file" cell (the first code cell after the imports)
- Update `out_dir` in the "Configure output directory" cell
- Run all cells sequentially

In [2]:
# Import required libraries
import numpy as np 
import pandas as pd 
import os
import requests
import re
from tqdm import tqdm
import warnings

# Silence every warning category for the rest of the session so library
# warnings do not clutter cell output.
# NOTE(review): this also hides deprecation/runtime warnings that may matter —
# consider narrowing the filter to a specific category or module.
warnings.filterwarnings("ignore")

In [3]:
# Load CSV file with HuBMAP IDs
# CSV should have columns: 'Input_HuBMAP_ID' and 'Found_CODEX_IDs'
csv_path = '/u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data/descendant_hubmapID_thymus_01.csv'
df = pd.read_csv(csv_path)

# Fail fast with a clear message if the column that the later cells read is
# absent, instead of surfacing a bare KeyError in the UUID-fetching cell.
if 'Found_CODEX_IDs' not in df.columns:
    raise ValueError(
        f"{csv_path} has no 'Found_CODEX_IDs' column; found columns: {list(df.columns)}"
    )

print(f"Loaded {len(df)} records from {csv_path}")

Loaded 12 records from /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data/descendant_hubmapID_thymus_01.csv


In [4]:
# Inspect data structure: column names, non-null counts and dtypes of the
# frame loaded from the CSV.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Input_HuBMAP_ID  12 non-null     object
 1   Found_CODEX_IDs  12 non-null     object
dtypes: object(2)
memory usage: 324.0+ bytes


In [5]:
# Preview data: display up to the first 10 rows of the loaded frame
df.head(10)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM979.JTRM.429,HBM632.JSNP.578
1,HBM892.HBJW.994,HBM792.GKBV.697
2,HBM757.VHCK.858,HBM887.SHVF.747
3,HBM288.XSQZ.633,HBM373.LDGF.766
4,HBM769.NBQL.534,HBM465.HZHH.676
5,HBM654.KTJK.968,HBM597.KZXW.469
6,HBM597.QSZV.956,HBM893.CCKX.496
7,HBM374.DMXZ.352,HBM857.ZBDC.975
8,HBM339.XXWC.842,HBM545.WPDT.262
9,HBM588.FHDS.363,HBM356.BVRR.244


In [6]:
# Fetch UUIDs for descendant CODEX datasets
# The Found_CODEX_IDs contain the actual datasets with the files we want to download

def get_last_uuid(hubmap_id, timeout=10):
    """Fetch UUID for a HuBMAP ID from the API.

    The response body is scanned for every ``"uuid": "<value>"`` pair and the
    LAST match is returned (hence the function name).

    Args:
        hubmap_id: HuBMAP ID string such as "HBM632.JSNP.578". Values that are
            not strings (None, NaN from an empty CSV cell) or that are blank
            are rejected without issuing a request.
        timeout: Seconds to wait for the entity API before giving up.

    Returns:
        The last UUID string found in the response, or None when the ID is
        invalid, the request fails, the status is not 200, or no UUID is
        present in the body.
    """
    # A missing CSV cell arrives as float NaN; interpolating it would query
    # ".../entities/nan", so reject anything that is not a usable string.
    if not isinstance(hubmap_id, str) or not hubmap_id.strip():
        print(f"  Skipping invalid HuBMAP ID: {hubmap_id!r}")
        return None
    hubmap_id = hubmap_id.strip()

    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = f"{base_url}{hubmap_id}"
    try:
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            # Report the status so a failed lookup is distinguishable from a
            # response that simply contained no UUID.
            print(f"  HTTP {r.status_code} fetching UUID for {hubmap_id}")
            return None
        uuids = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', r.text)
        return uuids[-1] if uuids else None
    except Exception as e:
        # Best-effort lookup: report the problem and let the caller record a
        # missing UUID rather than aborting the whole batch.
        print(f"  Error fetching UUID for {hubmap_id}: {e}")
        return None

print("Fetching UUIDs from HuBMAP API...")

# Resolve one UUID per CODEX ID, in row order, with a progress bar.
resolved_uuids = []
for codex_id in tqdm(df['Found_CODEX_IDs'], desc="Fetching UUIDs"):
    resolved_uuids.append(get_last_uuid(codex_id))
df['last_uuid'] = resolved_uuids

# Show results
has_uuid = df['last_uuid'].notna()
print("\n✓ UUID mapping complete")
print(f"  Total IDs: {len(df)}")
print(f"  With UUIDs: {has_uuid.sum()}")
print(f"  Missing UUIDs: {(~has_uuid).sum()}")

Fetching UUIDs from HuBMAP API...


Fetching UUIDs:   0%|          | 0/12 [00:00<?, ?it/s]

Fetching UUIDs: 100%|██████████| 12/12 [00:02<00:00,  5.50it/s]


✓ UUID mapping complete
  Total IDs: 12
  With UUIDs: 12
  Missing UUIDs: 0





In [7]:
# Preview the frame again now that the `last_uuid` column has been added
# (head(23) returns at most the first 23 rows).
df.head(23)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs,last_uuid
0,HBM979.JTRM.429,HBM632.JSNP.578,f3a7a9ab7797d7d217e7bab92bc24d87
1,HBM892.HBJW.994,HBM792.GKBV.697,1bafd9eca3a630ebe851c8d2fe8cfa6a
2,HBM757.VHCK.858,HBM887.SHVF.747,d4e9ec618924a8d43cfe1e67c38c1447
3,HBM288.XSQZ.633,HBM373.LDGF.766,43213991a54ce196d406707ffe2e86bd
4,HBM769.NBQL.534,HBM465.HZHH.676,822c9163d3be9b427dd0830f69a12305
5,HBM654.KTJK.968,HBM597.KZXW.469,323989e9d095fb7dce303602298adc99
6,HBM597.QSZV.956,HBM893.CCKX.496,37d06bb991afa2beb7b9460e746247ad
7,HBM374.DMXZ.352,HBM857.ZBDC.975,0223a8b1c1acc1495d2f3934397b5a4b
8,HBM339.XXWC.842,HBM545.WPDT.262,6f74609152fe7fd81974b7452112d2e2
9,HBM588.FHDS.363,HBM356.BVRR.244,d0fc985c048c4bfb026838466891c350


In [8]:
# Configure output directory for manifest file
out_dir = "/u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data"

# Report whether the target already exists; create it (and parents) otherwise.
if os.path.exists(out_dir):
    print(f"✓ Directory exists: {out_dir}")
else:
    print(f"Creating directory: {out_dir}")
    os.makedirs(out_dir, exist_ok=True)

✓ Directory exists: /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data


In [9]:
# Configure file paths and checking function

# HuBMAP assets server base URL
ASSETS_BASE_URL = "https://assets.hubmapconsortium.org"

# Dataset-relative locations of the expression image (two possible layouts,
# both are tried for each dataset) and of the pipeline configuration file.
primary_expr_path = "pipeline_output/expr/reg001_expr.ome.tiff"
alternative_expr_path = "stitched/expressions/reg1_stitched_expressions.ome.tiff"
config_path = "pipelineConfig.json"

# Echo the settings so the run log records which paths were probed.
print("Configuration:")
for setting_label, setting_value in (
    ("Primary path", primary_expr_path),
    ("Alternative path", alternative_expr_path),
    ("Config path", config_path),
):
    print(f"  {setting_label}: {setting_value}")

def check_url_exists(url, timeout=10):
    """
    Check if a file exists on HuBMAP assets server.
    Uses GET request with byte range to avoid downloading large files.

    Args:
        url: Full URL of the file to probe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the server answers 200, 206 or 416; False for any other
        status or when the request raises (timeout, connection error, ...).
    """
    try:
        # With stream=True the connection stays checked out until the body is
        # consumed or the response is closed; the context manager closes it so
        # repeated probes do not leak pooled connections.
        with requests.get(
            url,
            timeout=timeout,
            stream=True,
            headers={'Range': 'bytes=0-1'},
            allow_redirects=True,
        ) as response:
            # 200 = OK, 206 = Partial Content, 416 = Range Not Satisfiable (file exists)
            return response.status_code in (200, 206, 416)
    except Exception:
        # Best-effort probe: any failure is reported as "not available".
        return False

# Confirm that the path constants and check_url_exists() are now defined.
print("✓ Configuration complete")

Configuration:
  Primary path: pipeline_output/expr/reg001_expr.ome.tiff
  Alternative path: stitched/expressions/reg1_stitched_expressions.ome.tiff
  Config path: pipelineConfig.json
✓ Configuration complete


In [10]:
# Check which files are available on HuBMAP assets server
#
# For every dataset row: skip it if no UUID was resolved, otherwise probe the
# candidate expression-file locations in order and record the first one that
# exists. Each dataset with an expression file contributes two manifest lines:
# the expression file and the pipeline config file.
# NOTE(review): the config file is added without probing the server for it.

manifest_lines = []
found_count = 0
not_found_count = 0
missing_uuid_count = 0

# Candidate expression files, tried in this order; the first hit wins.
expr_candidates = [
    ("primary", primary_expr_path),
    ("alternative", alternative_expr_path),
]

total_datasets = len(df)
print(f"Checking file availability for {total_datasets} datasets...")
print("-" * 60)

# Count positions with enumerate rather than deriving them from the index
# label, so the [n/total] progress stays correct (and does not raise) when
# df carries a filtered, re-ordered or non-integer index.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    hubmap_id = row['Found_CODEX_IDs']
    uuid = row['last_uuid']

    print(f"\n[{position}/{total_datasets}] {hubmap_id}")

    # Skip if UUID is missing
    if pd.isna(uuid):
        print("  ✗ No UUID found")
        missing_uuid_count += 1
        continue

    print(f"  UUID: {uuid}")

    # Check which expression file exists
    expr_file_to_use = None
    for label, relative_path in expr_candidates:
        print(f"  Checking {label} path...", end=" ")
        if check_url_exists(f"{ASSETS_BASE_URL}/{uuid}/{relative_path}"):
            print("✓ Found")
            expr_file_to_use = relative_path
            break
        print("✗ Not found")

    if expr_file_to_use is None:
        not_found_count += 1
        continue

    # Add to manifest (use HuBMAP ID, not UUID)
    manifest_lines.append(f"{hubmap_id} /{expr_file_to_use}")
    manifest_lines.append(f"{hubmap_id} /{config_path}")
    found_count += 1

# Print summary
print("\n" + "=" * 60)
print("Summary:")
print(f"  ✓ {found_count} datasets with files available")
print(f"  ✗ {not_found_count} datasets with missing files")
print(f"  ✗ {missing_uuid_count} datasets with missing UUIDs")
print(f"  Total manifest lines: {len(manifest_lines)}")
print("=" * 60)

Checking file availability for 12 datasets...
------------------------------------------------------------

[1/12] HBM632.JSNP.578
  UUID: f3a7a9ab7797d7d217e7bab92bc24d87
  Checking primary path... ✓ Found

[2/12] HBM792.GKBV.697
  UUID: 1bafd9eca3a630ebe851c8d2fe8cfa6a
  Checking primary path... ✓ Found

[3/12] HBM887.SHVF.747
  UUID: d4e9ec618924a8d43cfe1e67c38c1447
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[4/12] HBM373.LDGF.766
  UUID: 43213991a54ce196d406707ffe2e86bd
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[5/12] HBM465.HZHH.676
  UUID: 822c9163d3be9b427dd0830f69a12305
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[6/12] HBM597.KZXW.469
  UUID: 323989e9d095fb7dce303602298adc99
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[7/12] HBM893.CCKX.496
  UUID: 37d06bb991afa2beb7b9460e746247ad
  Checking primary path... ✗ Not found
  Checking altern

In [None]:
# Save manifest to file
manifest_path = os.path.join(out_dir, "manifest_thymus.txt")

# An empty manifest is still written, but say so explicitly so the transfer
# command below is not run against a file with nothing in it by accident.
if not manifest_lines:
    print("⚠ No manifest entries were collected; the manifest file will be empty")

# Write one newline-terminated entry per line (so line-oriented tools see the
# final entry) using an explicit encoding instead of the platform default.
with open(manifest_path, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in manifest_lines)

print(f"✓ Manifest saved to: {manifest_path}")
print(f"  Total lines: {len(manifest_lines)}")
print(f"\nTo download files, run:")
print(f"  hubmap-clt transfer {manifest_path} -d /path/to/destination/")

✓ Manifest saved to: /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data/manifest.txt
  Total lines: 24

To download files, run:
  hubmap-clt transfer /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/thymus-codex-ufl/data/manifest.txt -d /path/to/destination/


In [33]:
# Preview manifest contents
banner = "=" * 70
print(banner)
print("MANIFEST PREVIEW (first 20 lines):")
print(banner)

# Read the whole manifest once; everything below works on the in-memory list.
with open(manifest_path, "r") as manifest_file:
    manifest_rows = list(manifest_file)

# Numbered listing of the leading rows, with surrounding whitespace removed.
line_no = 0
for row in manifest_rows[:20]:
    line_no += 1
    print(f"{line_no:3}. {row.strip()}")

# Mention how many rows were left out of the preview, if any.
remaining = len(manifest_rows) - 20
if remaining > 0:
    print(f"\n... and {remaining} more lines")

print(banner)

MANIFEST PREVIEW (first 20 lines):
  1. HBM632.JSNP.578 /pipeline_output/expr/reg001_expr.ome.tiff
  2. HBM632.JSNP.578 /pipelineConfig.json
  3. HBM792.GKBV.697 /pipeline_output/expr/reg001_expr.ome.tiff
  4. HBM792.GKBV.697 /pipelineConfig.json
  5. HBM887.SHVF.747 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  6. HBM887.SHVF.747 /pipelineConfig.json
  7. HBM373.LDGF.766 /stitched/expressions/reg1_stitched_expressions.ome.tiff
  8. HBM373.LDGF.766 /pipelineConfig.json
  9. HBM465.HZHH.676 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 10. HBM465.HZHH.676 /pipelineConfig.json
 11. HBM597.KZXW.469 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 12. HBM597.KZXW.469 /pipelineConfig.json
 13. HBM893.CCKX.496 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 14. HBM893.CCKX.496 /pipelineConfig.json
 15. HBM857.ZBDC.975 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 16. HBM857.ZBDC.975 /pipelineConfig.json
 17. HBM545.WPDT.262 /stitched/ex