In [1]:
import os
import json
import yaml
import shutil
import xml.etree.ElementTree as ET
from pathlib import Path
from tifffile import TiffFile

# Define paths

In [None]:
# Root directory that is scanned for dataset sub-directories; each
# sub-directory is expected to hold a pipelineConfig.json and an OME-TIFF.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
input_root = "/teradata/sbdubey/deepcell-experiments-data/intestine-codex-stanford/data-original/"
# Root directory that receives one sub-directory per dataset, containing the
# generated YAML config and the renamed copy of the OME-TIFF.
output_root = "/teradata/sbdubey/deepcell-experiments-data/intestine-codex-stanford/input-data/"

# Function to extract HubMAP ID

In [3]:
def get_hubmap_id(dir_name):
    """Return the HubMAP ID encoded at the start of a dataset directory name.

    Everything before the first '-' is taken as the dotted ID
    (e.g. ``HBM233.GTZN.466``) and returned with the dots removed
    (``HBM233GTZN466``). A name without '-' is used in full.
    """
    dotted_id, _, _ = dir_name.partition('-')
    return ''.join(dotted_id.split('.'))

# Function to extract MPP from OME-TIFF

In [4]:
def extract_mpp(tiff_path):
    """Read the X pixel size of an OME-TIFF and return it in microns per pixel.

    The OME-XML is parsed and the first element (in document order) that
    carries a ``PhysicalSizeX`` attribute is used. The unit is read from the
    ``PhysicalSizeXUnit`` attribute of that same element.

    Args:
        tiff_path: Path to the OME-TIFF file.

    Returns:
        ``PhysicalSizeX`` converted to microns and rounded to 5 decimal
        places, or ``None`` when the file has no OME metadata or no element
        with a ``PhysicalSizeX`` attribute.

    Raises:
        ValueError: If the ``PhysicalSizeX`` value is not a valid number.
        xml.etree.ElementTree.ParseError: If the OME-XML is not well-formed.
    """
    with TiffFile(tiff_path) as tif:
        ome_xml = tif.ome_metadata

    # ome_metadata can be None/empty (e.g. a TIFF without OME-XML); the
    # previous string search crashed with TypeError in that case.
    if not ome_xml:
        return None

    # Multipliers that convert each recognised unit to microns. The micron
    # spellings are written as escapes so the literals cannot be corrupted by
    # an encoding round-trip of this file.
    microns_per_unit = {
        '\u00b5m': 1,   # MICRO SIGN + 'm'
        '\u03bcm': 1,   # GREEK SMALL LETTER MU + 'm'
        'um': 1,
        'mm': 1000,
        'm': 1000000,
    }

    for element in ET.fromstring(ome_xml).iter():
        raw_size = element.attrib.get('PhysicalSizeX')
        if raw_size is None:
            continue

        physical_size_x = float(raw_size)
        # The OME schema defaults the unit to microns when it is omitted.
        unit = element.attrib.get('PhysicalSizeXUnit', '\u00b5m')

        if unit == 'nm':
            mpp = physical_size_x / 1000
        else:
            # Unrecognised units are treated as microns, as before
            # (best-effort fallback).
            mpp = physical_size_x * microns_per_unit.get(unit, 1)

        return round(mpp, 5)

    return None

# Function to extract channel names from OME-TIFF

In [5]:
def extract_channels_from_tiff(tiff_path):
    """Return the channel names listed in an OME-TIFF's OME-XML metadata.

    ``Channel`` elements are matched by their local tag name, so the lookup
    works for any OME schema namespace rather than only the 2016-06 one
    (a different schema version previously produced an empty list).

    Args:
        tiff_path: Path to the OME-TIFF file.

    Returns:
        A list with the ``Name`` attribute of every ``Channel`` element, in
        document order. A channel without a ``Name`` attribute contributes
        ``None``.

    Raises:
        ValueError: If the file carries no OME-XML metadata.
        xml.etree.ElementTree.ParseError: If the OME-XML is not well-formed.
    """
    with TiffFile(tiff_path) as tif:
        ome_xml = tif.ome_metadata

    if not ome_xml:
        raise ValueError(f"No OME-XML metadata found in {tiff_path}")

    root = ET.fromstring(ome_xml)

    # Tags look like '{namespace}Channel'; compare only the part after '}'.
    return [
        element.attrib.get("Name")
        for element in root.iter()
        if isinstance(element.tag, str)
        and element.tag.rsplit('}', 1)[-1] == "Channel"
    ]

# Updated function to create YAML config

In [6]:
def create_yaml_config(json_path, tiff_path, hubmap_id):
    """Assemble the YAML configuration dictionary for one dataset.

    The nucleus and cell channel names are read from the ``report.reg1``
    section of the pipeline JSON; their positions, the full marker list and
    the MPP value come from the OME-TIFF.

    Args:
        json_path: Path to the pipeline configuration JSON.
        tiff_path: Path to the dataset's OME-TIFF.
        hubmap_id: Identifier used to build the ``image_path`` entry.

    Returns:
        A dict with the keys ``image_path``, ``use_wsi``, ``MPP``,
        ``channels`` and ``markers`` (in that order).

    Raises:
        ValueError: If the nucleus or cell channel named in the JSON is not
            among the TIFF's channels.
    """
    with open(json_path, 'r') as handle:
        pipeline_config = json.load(handle)

    # Segmentation channel names as declared in the pipeline JSON
    region = pipeline_config['report']['reg1']
    nucleus_channel = region['nucleus_channel']
    cell_channel = region['cell_channel']

    # Channel names as actually stored in the image
    channel_names = extract_channels_from_tiff(tiff_path)

    # Resolve each segmentation channel to its position in the image
    if nucleus_channel not in channel_names:
        raise ValueError(f"Nucleus channel '{nucleus_channel}' not found in TIFF channels")
    nucleus_idx = channel_names.index(nucleus_channel)

    if cell_channel not in channel_names:
        raise ValueError(f"Cell channel '{cell_channel}' not found in TIFF channels")
    cell_idx = channel_names.index(cell_channel)

    mpp = extract_mpp(tiff_path)

    # One marker entry per image channel, numbered by position
    markers = []
    for position, marker_name in enumerate(channel_names):
        markers.append({'name': marker_name, 'number': position})

    segmentation_channels = [
        {'name': nucleus_channel, 'number': nucleus_idx},
        {'name': cell_channel, 'number': cell_idx},
    ]

    return {
        'image_path': f'{hubmap_id}.ome.tiff',
        'use_wsi': True,
        'MPP': mpp,
        'channels': segmentation_channels,
        'markers': markers,
    }

# Main processing loop

In [None]:
# Create output root if it doesn't exist
os.makedirs(output_root, exist_ok=True)

# Expression-image filenames to look for, in order of preference.
# NOTE: other datasets may use different filenames -- extend this tuple if so.
tiff_candidates = ('reg001_expr.ome.tiff', 'reg1_stitched_expressions.ome.tif')

# Process each dataset directory; sorted because os.listdir order is arbitrary
input_dirs = sorted(
    d for d in os.listdir(input_root)
    if os.path.isdir(os.path.join(input_root, d))
)

# (directory name, reason) for every dataset that could not be processed
failed = []

for dir_name in input_dirs:
    print(f"Processing: {dir_name}")

    # Get HubMAP ID
    hubmap_id = get_hubmap_id(dir_name)

    # Define paths
    input_dir = os.path.join(input_root, dir_name)
    json_path = os.path.join(input_dir, 'pipelineConfig.json')

    # Pick the first candidate TIFF that exists in this dataset directory
    tiff_path = next(
        (
            candidate
            for candidate in (os.path.join(input_dir, name) for name in tiff_candidates)
            if os.path.exists(candidate)
        ),
        None,
    )
    if tiff_path is None:
        print("  ERROR: No TIFF file found")
        failed.append((dir_name, "No TIFF file found"))
        continue

    # The output directory is named after the first 13 characters of the ID
    output_dir = os.path.join(output_root, hubmap_id[:13])

    try:
        # Build the config first: if this fails, nothing is written and no
        # empty output directory is left behind for the dataset.
        yaml_data = create_yaml_config(json_path, tiff_path, hubmap_id)

        os.makedirs(output_dir, exist_ok=True)

        # Write YAML file
        yaml_path = os.path.join(output_dir, f'{hubmap_id}_config.yaml')
        with open(yaml_path, 'w') as f:
            yaml.dump(yaml_data, f, default_flow_style=False, sort_keys=False)

        # Copy TIFF file with new name
        new_tiff_path = os.path.join(output_dir, f'{hubmap_id}.ome.tiff')
        shutil.copy2(tiff_path, new_tiff_path)

        print(f"  Created: {hubmap_id}")
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next dataset
        print(f"  ERROR: {e}")
        failed.append((dir_name, str(e)))


print("\nProcessing complete!")
if failed:
    print(f"{len(failed)} of {len(input_dirs)} directories failed:")
    for failed_dir, reason in failed:
        print(f"  {failed_dir}: {reason}")

Processing: HBM233.GTZN.466-b38730b2633e0b088619f9bcd514ba13
  Created: HBM233GTZN466
Processing: HBM244.TVNH.734-168050d20802e0c0d91fd5f51ce550da
  Created: HBM244TVNH734
Processing: HBM245.NHMB.685-bc68fe67a089ab19c1449de6d0703d71
  Created: HBM245NHMB685
Processing: HBM253.MXKW.373-ac169bbda02d0c2832c01f70375ff6dc
  Created: HBM253MXKW373
Processing: HBM292.FCMS.497-6bdd149dc47782aefdd0e23599708183
  Created: HBM292FCMS497
Processing: HBM293.LGZW.236-c92332aa7e244be5bad1c27c80fcd343
  Created: HBM293LGZW236
Processing: HBM334.RPTP.997-47b8410d1c51b23e7fb1a721c53a493f
  Created: HBM334RPTP997
Processing: HBM352.MDZF.598-01510a4fb90fd303bd48c4cd51cdd14c
  Created: HBM352MDZF598
Processing: HBM396.FNQW.543-87922a42fa8bc7ab29a4d2d5374afbb4
  Created: HBM396FNQW543
Processing: HBM398.SWKV.256-ff77fcae7f6d9b5b7b8741c282677eef
  Created: HBM398SWKV256
Processing: HBM423.MMGW.744-3e800f0cd138b989b935fb94e7938617
  Created: HBM423MMGW744
Processing: HBM423.QJJR.545-b98ca5a13b6b7482fe7acbeeb1

# Verify output

In [8]:
# Gather the sub-directories that exist under the output root, sorted by name
output_dirs = sorted(
    entry
    for entry in os.listdir(output_root)
    if os.path.isdir(os.path.join(output_root, entry))
)
print(f"Total directories created: {len(output_dirs)}")

# Show the contents of the first few directories as a spot check
print("\nFirst 5 directories:")
for sub_dir in output_dirs[:5]:
    contents = os.listdir(os.path.join(output_root, sub_dir))
    print(f"  {sub_dir}: {contents}")

Total directories created: 63

First 5 directories:
  HBM233GTZN466: ['HBM233GTZN466_config.yaml', 'HBM233GTZN466.ome.tiff']
  HBM244TVNH734: ['HBM244TVNH734_config.yaml', 'HBM244TVNH734.ome.tiff']
  HBM245NHMB685: ['HBM245NHMB685_config.yaml', 'HBM245NHMB685.ome.tiff']
  HBM253MXKW373: ['HBM253MXKW373_config.yaml', 'HBM253MXKW373.ome.tiff']
  HBM292FCMS497: ['HBM292FCMS497_config.yaml', 'HBM292FCMS497.ome.tiff']
