# HuBMAP Manifest Creation

This notebook creates a manifest file for downloading HuBMAP CODEX datasets.

**Process:**
1. Load CSV with HuBMAP IDs
2. Fetch UUIDs for descendant datasets
3. Check which files are available on the HuBMAP assets server
4. Generate manifest file for download

**Usage:**
- Update the CSV file path (`csv_path`) in cell 2
- Update the output directory (`out_dir`) in cell 7
- Run all cells sequentially

In [3]:
# Import required libraries
import numpy as np 
import pandas as pd 
import os
import requests
import re
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

In [12]:
# Load CSV file with HuBMAP IDs
# CSV should have columns: 'Input_HuBMAP_ID' and 'Found_CODEX_IDs'
csv_path = '/u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/intestine-codex-stanford/data/descendant_hubmapID_intestine.csv'
df = pd.read_csv(csv_path)

# Fail fast with a clear message: every later cell indexes
# df['Found_CODEX_IDs'] directly, so a wrong or renamed column would otherwise
# surface as a bare KeyError far from the cause.
if 'Found_CODEX_IDs' not in df.columns:
    raise ValueError(
        f"{csv_path} has no 'Found_CODEX_IDs' column "
        f"(columns found: {list(df.columns)})"
    )

print(f"Loaded {len(df)} records from {csv_path}")

# Rows without a descendant ID cannot be resolved to a UUID; surface them here
# so the later "Missing UUIDs" count is not a surprise.
n_missing_ids = int(df['Found_CODEX_IDs'].isna().sum())
if n_missing_ids:
    print(f"  Note: {n_missing_ids} record(s) have no Found_CODEX_IDs value")

Loaded 64 records from /u/sbdubey/CLI_HUBMAP/hra-deepcell-experiments/scripts/intestine-codex-stanford/data/descendant_hubmapID_intestine.csv


In [13]:
# Inspect data structure: column names, dtypes, and non-null counts per column
# (a non-null count below the row count reveals missing IDs)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Input_HuBMAP_ID  64 non-null     object
 1   Found_CODEX_IDs  63 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


In [14]:
# Preview the first 10 rows of the loaded ID table
df.head(10)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM235.VKNJ.237,HBM953.LMWQ.235
1,HBM238.GTNW.259,HBM443.XPDK.549
2,HBM242.LSCK.393,HBM292.FCMS.497
3,HBM284.SBPR.357,HBM777.TLQL.487
4,HBM285.VFDT.966,HBM795.GWKV.825
5,HBM288.BBKK.828,HBM776.FNRJ.959
6,HBM295.SWJJ.888,HBM352.MDZF.598
7,HBM297.MZZX.824,HBM423.MMGW.744
8,HBM334.QWFV.953,HBM889.KDGM.632
9,HBM353.NZVQ.793,HBM599.BFHP.494


In [7]:
# Fetch UUIDs for descendant CODEX datasets
# The Found_CODEX_IDs contain the actual datasets with the files we want to download

def get_last_uuid(hubmap_id):
    """Fetch the UUID for a HuBMAP ID from the HuBMAP entity API.

    The response body is scanned for every ``"uuid": "<hex>"`` pair and the
    last match is returned.

    Parameters
    ----------
    hubmap_id : str
        HuBMAP ID such as ``"HBM953.LMWQ.235"``. Missing values (``None``,
        ``NaN``) and blank strings are accepted and yield ``None`` without
        contacting the API.

    Returns
    -------
    str or None
        The last UUID found in the response text, or ``None`` when the ID is
        missing, the request raises, the status code is not 200, or the
        response contains no UUID.
    """
    # Rows without a descendant ID arrive from pandas as NaN; formatting that
    # into the URL would issue a pointless request for ".../entities/nan".
    if pd.isna(hubmap_id):
        return None
    hubmap_id = str(hubmap_id).strip()
    if not hubmap_id:
        return None

    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = f"{base_url}{hubmap_id}"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            uuids = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', r.text)
            return uuids[-1] if uuids else None
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller treat the
        # dataset as having no UUID rather than aborting the whole run.
        print(f"  Error fetching UUID for {hubmap_id}: {e}")
        return None

print("Fetching UUIDs from HuBMAP API...")
# Resolve every descendant ID in order, with a progress bar, and store the
# results as a new column aligned with the existing rows.
codex_ids_with_progress = tqdm(df['Found_CODEX_IDs'], desc="Fetching UUIDs")
df['last_uuid'] = list(map(get_last_uuid, codex_ids_with_progress))

# Summarise how many IDs could be resolved
uuid_column = df['last_uuid']
print("\n✓ UUID mapping complete")
print(f"  Total IDs: {len(df)}")
print(f"  With UUIDs: {uuid_column.notna().sum()}")
print(f"  Missing UUIDs: {uuid_column.isna().sum()}")

Fetching UUIDs from HuBMAP API...


Fetching UUIDs:   0%|          | 0/64 [00:00<?, ?it/s]

Fetching UUIDs: 100%|██████████| 64/64 [01:00<00:00,  1.07it/s]


✓ UUID mapping complete
  Total IDs: 64
  With UUIDs: 63
  Missing UUIDs: 1





In [8]:
# Show the first 23 rows, now including the fetched `last_uuid` column
df.head(23)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs,last_uuid
0,HBM235.VKNJ.237,HBM953.LMWQ.235,10c0c11280c00f324259fe38e2291ee4
1,HBM238.GTNW.259,HBM443.XPDK.549,8b21db2002a5179b03532d183a4885eb
2,HBM242.LSCK.393,HBM292.FCMS.497,6bdd149dc47782aefdd0e23599708183
3,HBM284.SBPR.357,HBM777.TLQL.487,550f3ef14b113c24fd21b8b0750bf078
4,HBM285.VFDT.966,HBM795.GWKV.825,aa54aad994ca8a64fa52b3f3945c01b7
5,HBM288.BBKK.828,HBM776.FNRJ.959,7b2b9029035d46c4ef8306fa82c8e58e
6,HBM295.SWJJ.888,HBM352.MDZF.598,01510a4fb90fd303bd48c4cd51cdd14c
7,HBM297.MZZX.824,HBM423.MMGW.744,3e800f0cd138b989b935fb94e7938617
8,HBM334.QWFV.953,HBM889.KDGM.632,768b7adb649959b6a4e354867595032d
9,HBM353.NZVQ.793,HBM599.BFHP.494,289c13f8ec809a7ed138262eb2a6d946


In [9]:
# Directory that will receive the manifest file (created if absent)
out_dir = "/u/sbdubey"

if os.path.exists(out_dir):
    print(f"✓ Directory exists: {out_dir}")
else:
    print(f"Creating directory: {out_dir}")
    os.makedirs(out_dir, exist_ok=True)

✓ Directory exists: /u/sbdubey


In [10]:
# Dataset-relative file locations and the assets server root

# Two candidate locations for the expression image; each dataset is probed
# for the primary one first, then the alternative.
primary_expr_path = "pipeline_output/expr/reg001_expr.ome.tiff"
alternative_expr_path = "stitched/expressions/reg1_stitched_expressions.ome.tiff"
config_path = "pipelineConfig.json"

# Root URL of the HuBMAP assets server
ASSETS_BASE_URL = "https://assets.hubmapconsortium.org"

# Echo the configured paths in a single write
print(
    "Configuration:\n"
    f"  Primary path: {primary_expr_path}\n"
    f"  Alternative path: {alternative_expr_path}\n"
    f"  Config path: {config_path}"
)

def check_url_exists(url, timeout=10):
    """
    Check if a file exists on HuBMAP assets server.

    Sends a GET request for only the first two bytes (``Range: bytes=0-1``)
    in streaming mode, so the body of a large file is never downloaded.

    Parameters
    ----------
    url : str
        Full URL of the file to probe.
    timeout : int or float, optional
        Timeout in seconds passed to ``requests.get`` (default 10).

    Returns
    -------
    bool
        ``True`` when the server answers 200, 206 or 416; ``False`` for any
        other status code or when the request raises (timeouts and
        connection errors are therefore reported as "not found").
    """
    try:
        # With stream=True the connection stays open until the body is read or
        # the response is closed; the context manager closes it right away so
        # repeated probes do not accumulate open connections.
        with requests.get(
            url,
            timeout=timeout,
            stream=True,
            headers={'Range': 'bytes=0-1'},
            allow_redirects=True,
        ) as response:
            # 200 = OK, 206 = Partial Content, 416 = Range Not Satisfiable (file exists)
            return response.status_code in (200, 206, 416)
    except Exception:
        return False

print("✓ Configuration complete")

Configuration:
  Primary path: pipeline_output/expr/reg001_expr.ome.tiff
  Alternative path: stitched/expressions/reg1_stitched_expressions.ome.tiff
  Config path: pipelineConfig.json
✓ Configuration complete


In [11]:
# Check which files are available on HuBMAP assets server
#
# For every dataset with a UUID, probe the primary expression path and, if it
# is absent, the alternative one. Datasets with a reachable expression file
# contribute two manifest lines: the expression file and the pipeline config.

manifest_lines = []
found_count = 0
not_found_count = 0
missing_uuid_count = 0

total = len(df)
print(f"Checking file availability for {total} datasets...")
print("-" * 60)

# Count positions with enumerate instead of the index label: `idx + 1` is only
# a valid progress counter while df keeps its default 0..n-1 integer index.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    hubmap_id = row['Found_CODEX_IDs']
    uuid = row['last_uuid']

    print(f"\n[{position}/{total}] {hubmap_id}")

    # Skip if UUID is missing
    if pd.isna(uuid):
        print("  ✗ No UUID found")
        missing_uuid_count += 1
        continue

    print(f"  UUID: {uuid}")

    # Build URLs to check, in order of preference
    primary_url = f"{ASSETS_BASE_URL}/{uuid}/{primary_expr_path}"
    alternative_url = f"{ASSETS_BASE_URL}/{uuid}/{alternative_expr_path}"
    candidates = (
        ("primary", primary_url, primary_expr_path),
        ("alternative", alternative_url, alternative_expr_path),
    )

    # Use the first expression file that exists; later candidates are only
    # probed when the earlier ones are missing.
    expr_file_to_use = None
    for label, candidate_url, candidate_path in candidates:
        print(f"  Checking {label} path...", end=" ")
        if check_url_exists(candidate_url):
            expr_file_to_use = candidate_path
            print("✓ Found")
            break
        print("✗ Not found")

    if expr_file_to_use is None:
        not_found_count += 1
        continue

    # Add to manifest (use HuBMAP ID, not UUID)
    manifest_lines.append(f"{hubmap_id} /{expr_file_to_use}")
    manifest_lines.append(f"{hubmap_id} /{config_path}")
    found_count += 1

# Print summary
print("\n" + "=" * 60)
print("Summary:")
print(f"  ✓ {found_count} datasets with files available")
print(f"  ✗ {not_found_count} datasets with missing files")
print(f"  ✗ {missing_uuid_count} datasets with missing UUIDs")
print(f"  Total manifest lines: {len(manifest_lines)}")
print("=" * 60)

Checking file availability for 64 datasets...
------------------------------------------------------------

[1/64] HBM953.LMWQ.235
  UUID: 10c0c11280c00f324259fe38e2291ee4
  Checking primary path... ✓ Found

[2/64] HBM443.XPDK.549
  UUID: 8b21db2002a5179b03532d183a4885eb
  Checking primary path... ✓ Found

[3/64] HBM292.FCMS.497
  UUID: 6bdd149dc47782aefdd0e23599708183
  Checking primary path... ✓ Found

[4/64] HBM777.TLQL.487
  UUID: 550f3ef14b113c24fd21b8b0750bf078
  Checking primary path... ✓ Found

[5/64] HBM795.GWKV.825
  UUID: aa54aad994ca8a64fa52b3f3945c01b7
  Checking primary path... ✓ Found

[6/64] HBM776.FNRJ.959
  UUID: 7b2b9029035d46c4ef8306fa82c8e58e
  Checking primary path... ✓ Found

[7/64] HBM352.MDZF.598
  UUID: 01510a4fb90fd303bd48c4cd51cdd14c
  Checking primary path... ✓ Found

[8/64] HBM423.MMGW.744
  UUID: 3e800f0cd138b989b935fb94e7938617
  Checking primary path... ✓ Found

[9/64] HBM889.KDGM.632
  UUID: 768b7adb649959b6a4e354867595032d
  Checking primary path... ✓

In [None]:
# Write the collected manifest lines to disk, one entry per line
manifest_path = os.path.join(out_dir, "manifest_intestine.txt")
manifest_text = "\n".join(manifest_lines)

with open(manifest_path, "w") as f:
    f.write(manifest_text)

# Report where the manifest went and how to use it
report = [
    f"✓ Manifest saved to: {manifest_path}",
    f"  Total lines: {len(manifest_lines)}",
    "\nTo download files, run:",
    f"  hubmap-clt transfer {manifest_path} -d /path/to/destination/",
]
print("\n".join(report))

✓ Manifest saved to: /u/sbdubey/manifest.txt
  Total lines: 126

To download files, run:
  hubmap-clt transfer /u/sbdubey/manifest.txt -d /path/to/destination/


In [13]:
# Show the head of the manifest file that was just written
banner = "=" * 70
print(banner)
print("MANIFEST PREVIEW (first 20 lines):")
print(banner)

with open(manifest_path, "r") as f:
    lines = f.readlines()

    # Number the previewed lines starting at 1, right-aligned to width 3
    for number, text in enumerate(lines[:20], start=1):
        print(f"{number:3}. {text.strip()}")

    # Mention how much was left out when the file is longer than the preview
    remaining = len(lines) - 20
    if remaining > 0:
        print(f"\n... and {remaining} more lines")

print(banner)

MANIFEST PREVIEW (first 20 lines):
  1. HBM953.LMWQ.235 /pipeline_output/expr/reg001_expr.ome.tiff
  2. HBM953.LMWQ.235 /pipelineConfig.json
  3. HBM443.XPDK.549 /pipeline_output/expr/reg001_expr.ome.tiff
  4. HBM443.XPDK.549 /pipelineConfig.json
  5. HBM292.FCMS.497 /pipeline_output/expr/reg001_expr.ome.tiff
  6. HBM292.FCMS.497 /pipelineConfig.json
  7. HBM777.TLQL.487 /pipeline_output/expr/reg001_expr.ome.tiff
  8. HBM777.TLQL.487 /pipelineConfig.json
  9. HBM795.GWKV.825 /pipeline_output/expr/reg001_expr.ome.tiff
 10. HBM795.GWKV.825 /pipelineConfig.json
 11. HBM776.FNRJ.959 /pipeline_output/expr/reg001_expr.ome.tiff
 12. HBM776.FNRJ.959 /pipelineConfig.json
 13. HBM352.MDZF.598 /pipeline_output/expr/reg001_expr.ome.tiff
 14. HBM352.MDZF.598 /pipelineConfig.json
 15. HBM423.MMGW.744 /pipeline_output/expr/reg001_expr.ome.tiff
 16. HBM423.MMGW.744 /pipelineConfig.json
 17. HBM889.KDGM.632 /pipeline_output/expr/reg001_expr.ome.tiff
 18. HBM889.KDGM.632 /pipelineConfig.json
 19. HBM599