# HuBMAP Manifest Creation

This notebook creates a manifest file for downloading HuBMAP CODEX datasets.

**Process:**
1. Load CSV with HuBMAP IDs
2. Fetch UUIDs for descendant datasets
3. Check which files are available on the HuBMAP assets server
4. Generate manifest file for download

**Usage:**
- Update the CSV file path (`csv_path`) in code cell 2
- Update the output directory (`out_dir`) in code cell 7
- Run all cells sequentially

In [11]:
# Import required libraries
import numpy as np 
import pandas as pd 
import os
import requests
import re
from tqdm import tqdm
import warnings

# Silence every warning for the rest of the session: the "ignore" action is
# installed with no category or module restriction.
warnings.filterwarnings("ignore")

In [12]:
# Load CSV file with HuBMAP IDs
# CSV should have columns: 'Input_HuBMAP_ID' and 'Found_CODEX_IDs'
csv_path = '/u/sbdubey/descendant_hubmapID_lymph_01.csv'
df = pd.read_csv(csv_path)

# Fail fast with a readable message if the column that the UUID lookup and the
# manifest loop read is absent, instead of a bare KeyError several cells later.
if 'Found_CODEX_IDs' not in df.columns:
    raise ValueError(
        f"{csv_path} has no 'Found_CODEX_IDs' column; found columns: {list(df.columns)}"
    )

print(f"Loaded {len(df)} records from {csv_path}")

Loaded 23 records from /u/sbdubey/descendant_hubmapID_lymph_01.csv


In [13]:
# Inspect data structure: print the index range, column names,
# non-null counts, dtypes and memory usage of the loaded table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Input_HuBMAP_ID  23 non-null     object
 1   Found_CODEX_IDs  23 non-null     object
dtypes: object(2)
memory usage: 500.0+ bytes


In [14]:
# Preview data: display the first 10 rows as the cell's output
df.head(10)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs
0,HBM723.BZKF.992,HBM332.QJCH.296
1,HBM927.KDXB.445,HBM435.FHMP.758
2,HBM283.GNGR.785,HBM387.QZQF.984
3,HBM483.KDBW.482,HBM673.TNZM.339
4,HBM457.JZFF.434,HBM443.FDHZ.888
5,HBM754.WKLP.262,HBM938.TNNT.879
6,HBM834.ZFVJ.978,HBM522.BSZT.385
7,HBM746.ZTBR.275,HBM622.JXWQ.554
8,HBM866.TGNJ.847,HBM268.NKXB.243
9,HBM958.GGVQ.546,HBM992.RHJW.288


In [15]:
# Fetch UUIDs for descendant CODEX datasets
# The Found_CODEX_IDs contain the actual datasets with the files we want to download

def get_last_uuid(hubmap_id):
    """Fetch the UUID for a HuBMAP ID from the entity API.

    The response body is parsed as JSON and the entity's own top-level
    ``"uuid"`` field is returned, so the result does not depend on the order
    in which the server serializes keys or on ``"uuid"`` fields of nested
    objects. If the body is not a JSON object with a non-empty string
    ``"uuid"``, the function falls back to the last ``"uuid": "<hex>"`` match
    found in the raw response text.

    Args:
        hubmap_id: HuBMAP ID appended to the entity API URL
            (e.g. ``"HBM332.QJCH.296"``).

    Returns:
        The UUID string, or None when the request raises, the status code is
        not 200, or no UUID can be found in the response.
    """
    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = f"{base_url}{hubmap_id}"
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            # Report the failure so a missing UUID can be diagnosed from the log.
            print(f"  Error fetching UUID for {hubmap_id}: HTTP {r.status_code}")
            return None

        # Preferred path: structured parse, top-level field only.
        try:
            payload = r.json()
        except ValueError:
            payload = None
        if isinstance(payload, dict):
            own_uuid = payload.get("uuid")
            if isinstance(own_uuid, str) and own_uuid:
                return own_uuid

        # Fallback: scan the raw text and keep the last match.
        uuids = re.findall(r'"uuid"\s*:\s*"([a-f0-9\-]+)"', r.text)
        return uuids[-1] if uuids else None
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole batch.
        print(f"  Error fetching UUID for {hubmap_id}: {e}")
        return None

print("Fetching UUIDs from HuBMAP API...")

# One API lookup per descendant CODEX ID, in row order, behind a progress bar
fetched_uuids = []
for codex_id in tqdm(df['Found_CODEX_IDs'], desc="Fetching UUIDs"):
    fetched_uuids.append(get_last_uuid(codex_id))
df['last_uuid'] = fetched_uuids

# Report how many lookups produced a UUID
has_uuid = df['last_uuid'].notna()
print("\n✓ UUID mapping complete")
print(f"  Total IDs: {len(df)}")
print(f"  With UUIDs: {has_uuid.sum()}")
print(f"  Missing UUIDs: {(~has_uuid).sum()}")

Fetching UUIDs from HuBMAP API...


Fetching UUIDs: 100%|██████████| 23/23 [00:03<00:00,  5.91it/s]


✓ UUID mapping complete
  Total IDs: 23
  With UUIDs: 23
  Missing UUIDs: 0





In [16]:
# Display the first 23 rows, which now include the fetched last_uuid column
df.head(23)

Unnamed: 0,Input_HuBMAP_ID,Found_CODEX_IDs,last_uuid
0,HBM723.BZKF.992,HBM332.QJCH.296,34f12dab3a66443d4d844cf8d2c88b43
1,HBM927.KDXB.445,HBM435.FHMP.758,6b63f2b54b33f7a9140585fe2a63ea6f
2,HBM283.GNGR.785,HBM387.QZQF.984,89d059d815ce9c7858b4b47263ddc6ef
3,HBM483.KDBW.482,HBM673.TNZM.339,0188cbfed79e0714b0e6ea6466ab8977
4,HBM457.JZFF.434,HBM443.FDHZ.888,608f6ca28ad2e6317ed8f1577bd51014
5,HBM754.WKLP.262,HBM938.TNNT.879,4fc4e11da541a395a165016b7a3d5124
6,HBM834.ZFVJ.978,HBM522.BSZT.385,c6cf01203e651c120529bf29baad9c7b
7,HBM746.ZTBR.275,HBM622.JXWQ.554,13831dc529085f18ba34e7d29bd41db4
8,HBM866.TGNJ.847,HBM268.NKXB.243,130df85d80c6e9adcbfdf8e374bfa163
9,HBM958.GGVQ.546,HBM992.RHJW.288,214f19e2921c5d4108d905b9d30da556


In [17]:
# Configure output directory for manifest file
out_dir = "/u/sbdubey"

# Test with isdir rather than exists: a regular file sitting at this path must
# not be reported as a usable directory. In that case makedirs raises
# FileExistsError here instead of the failure surfacing when the manifest is
# written.
if os.path.isdir(out_dir):
    print(f"✓ Directory exists: {out_dir}")
else:
    print(f"Creating directory: {out_dir}")
    os.makedirs(out_dir, exist_ok=True)

✓ Directory exists: /u/sbdubey


In [18]:
# Configure file paths and checking function

# Candidate locations of the expression image inside a dataset
# (both are tried for each dataset), plus the pipeline config file
primary_expr_path = "pipeline_output/expr/reg001_expr.ome.tiff"
alternative_expr_path = "stitched/expressions/reg1_stitched_expressions.ome.tiff"
config_path = "pipelineConfig.json"

# HuBMAP assets server base URL
ASSETS_BASE_URL = "https://assets.hubmapconsortium.org"

# Echo the configured paths so the run log records them
print("Configuration:")
for label, value in (
    ("Primary path", primary_expr_path),
    ("Alternative path", alternative_expr_path),
    ("Config path", config_path),
):
    print(f"  {label}: {value}")

def check_url_exists(url, timeout=10):
    """
    Check if a file exists on HuBMAP assets server.

    Sends a streamed GET request asking only for bytes 0-1, so a large file
    is not downloaded, and always closes the response so the connection is
    released rather than left open with an unread body.

    Args:
        url: Full URL of the file to probe.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the final status code (after redirects) is 200, 206 or 416;
        False for any other status code or if the request raises.
    """
    try:
        response = requests.get(
            url,
            timeout=timeout,
            stream=True,
            headers={'Range': 'bytes=0-1'},
            allow_redirects=True
        )
        try:
            # 200 = OK, 206 = Partial Content, 416 = Range Not Satisfiable (file exists)
            return response.status_code in [200, 206, 416]
        finally:
            # With stream=True the body is never read here, so the connection
            # is only released when the response is closed explicitly.
            response.close()
    except Exception:
        # Best effort: any failure to reach the server is treated as "not available".
        return False

print("✓ Configuration complete")

Configuration:
  Primary path: pipeline_output/expr/reg001_expr.ome.tiff
  Alternative path: stitched/expressions/reg1_stitched_expressions.ome.tiff
  Config path: pipelineConfig.json
✓ Configuration complete


In [19]:
# Check which files are available on HuBMAP assets server

# Accumulators for the manifest and for the summary printed at the end
manifest_lines = []
found_count = 0
not_found_count = 0
missing_uuid_count = 0

# Expression-file candidates in priority order, each paired with the progress
# message printed just before it is probed
expr_candidates = (
    (primary_expr_path, "  Checking primary path..."),
    (alternative_expr_path, "  Checking alternative path..."),
)

print(f"Checking file availability for {len(df)} datasets...")
print("-" * 60)

for idx, row in df.iterrows():
    hubmap_id = row['Found_CODEX_IDs']
    uuid = row['last_uuid']

    print(f"\n[{idx+1}/{len(df)}] {hubmap_id}")

    # Without a UUID there is no assets URL to probe
    if pd.isna(uuid):
        print("  ✗ No UUID found")
        missing_uuid_count += 1
        continue

    print(f"  UUID: {uuid}")

    # Probe the candidates in order and keep the first one the server has
    expr_file_to_use = None
    for rel_path, progress_msg in expr_candidates:
        print(progress_msg, end=" ")
        if check_url_exists(f"{ASSETS_BASE_URL}/{uuid}/{rel_path}"):
            print("✓ Found")
            expr_file_to_use = rel_path
            break
        print("✗ Not found")

    if expr_file_to_use is None:
        not_found_count += 1
        continue

    # Manifest entries are keyed by HuBMAP ID, not UUID: one line for the
    # expression image and one for the pipeline config
    manifest_lines.extend([
        f"{hubmap_id} /{expr_file_to_use}",
        f"{hubmap_id} /{config_path}",
    ])
    found_count += 1

# Print summary
rule = "=" * 60
print("\n" + rule)
print("Summary:")
print(f"  ✓ {found_count} datasets with files available")
print(f"  ✗ {not_found_count} datasets with missing files")
print(f"  ✗ {missing_uuid_count} datasets with missing UUIDs")
print(f"  Total manifest lines: {len(manifest_lines)}")
print(rule)

Checking file availability for 23 datasets...
------------------------------------------------------------

[1/23] HBM332.QJCH.296
  UUID: 34f12dab3a66443d4d844cf8d2c88b43
  Checking primary path... ✓ Found

[2/23] HBM435.FHMP.758
  UUID: 6b63f2b54b33f7a9140585fe2a63ea6f
  Checking primary path... ✓ Found

[3/23] HBM387.QZQF.984
  UUID: 89d059d815ce9c7858b4b47263ddc6ef
  Checking primary path... ✓ Found

[4/23] HBM673.TNZM.339
  UUID: 0188cbfed79e0714b0e6ea6466ab8977
  Checking primary path... ✓ Found

[5/23] HBM443.FDHZ.888
  UUID: 608f6ca28ad2e6317ed8f1577bd51014
  Checking primary path... ✓ Found

[6/23] HBM938.TNNT.879
  UUID: 4fc4e11da541a395a165016b7a3d5124
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[7/23] HBM522.BSZT.385
  UUID: c6cf01203e651c120529bf29baad9c7b
  Checking primary path... ✗ Not found
  Checking alternative path... ✓ Found

[8/23] HBM622.JXWQ.554
  UUID: 13831dc529085f18ba34e7d29bd41db4
  Checking primary path... ✗ Not found
  C

In [None]:
# Write the manifest: one entry per line, no newline after the last entry
manifest_path = os.path.join(out_dir, "manifest_lymph.txt")
manifest_text = "\n".join(manifest_lines)

with open(manifest_path, "w") as f:
    f.write(manifest_text)

# Confirm what was written and show the follow-up download command
print(
    f"✓ Manifest saved to: {manifest_path}",
    f"  Total lines: {len(manifest_lines)}",
    "\nTo download files, run:",
    f"  hubmap-clt transfer {manifest_path} -d /path/to/destination/",
    sep="\n",
)

✓ Manifest saved to: /u/sbdubey/manifest.txt
  Total lines: 46

To download files, run:
  hubmap-clt transfer /u/sbdubey/manifest.txt -d /path/to/destination/


In [21]:
# Read the manifest back from disk and show its first 20 lines, numbered from 1
banner = "=" * 70
print(banner)
print("MANIFEST PREVIEW (first 20 lines):")
print(banner)

with open(manifest_path, "r") as f:
    lines = f.readlines()

for number, entry in enumerate(lines[:20], start=1):
    print(f"{number:3}. {entry.strip()}")

# Mention how many lines were left out of the preview, if any
hidden = len(lines) - 20
if hidden > 0:
    print(f"\n... and {hidden} more lines")

print(banner)

MANIFEST PREVIEW (first 20 lines):
  1. HBM332.QJCH.296 /pipeline_output/expr/reg001_expr.ome.tiff
  2. HBM332.QJCH.296 /pipelineConfig.json
  3. HBM435.FHMP.758 /pipeline_output/expr/reg001_expr.ome.tiff
  4. HBM435.FHMP.758 /pipelineConfig.json
  5. HBM387.QZQF.984 /pipeline_output/expr/reg001_expr.ome.tiff
  6. HBM387.QZQF.984 /pipelineConfig.json
  7. HBM673.TNZM.339 /pipeline_output/expr/reg001_expr.ome.tiff
  8. HBM673.TNZM.339 /pipelineConfig.json
  9. HBM443.FDHZ.888 /pipeline_output/expr/reg001_expr.ome.tiff
 10. HBM443.FDHZ.888 /pipelineConfig.json
 11. HBM938.TNNT.879 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 12. HBM938.TNNT.879 /pipelineConfig.json
 13. HBM522.BSZT.385 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 14. HBM522.BSZT.385 /pipelineConfig.json
 15. HBM622.JXWQ.554 /stitched/expressions/reg1_stitched_expressions.ome.tiff
 16. HBM622.JXWQ.554 /pipelineConfig.json
 17. HBM268.NKXB.243 /stitched/expressions/reg1_stitched_expressions.ome.ti