# 01 - Data Preparation for Benchmarking

**Goal**: Download, validate, and format public IMC datasets for comparison between Steinbock and our pipeline.

## Datasets
- **Bodenmiller Example**: Small tutorial dataset (Zenodo record 5949116)
- **High-res Kidney**: Tissue-matched high-resolution IMC dataset (Zenodo DOI 10.5281/zenodo.17077712)

## Tasks
1. Download datasets from Zenodo
2. Verify data integrity (file formats, channel counts)
3. Create metadata CSV (ROI names, conditions, timepoints)
4. Generate summary statistics
5. Prepare for both pipelines (Steinbock .txt format, our pipeline format)

In [None]:
import os
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
import hashlib

# Add project root to path
# The root is taken to be two directory levels above the current working
# directory (Path() resolves relative to the cwd). It is inserted at the front
# of sys.path so the `src` package imported below is looked up there first.
project_root = Path().resolve().parent.parent
sys.path.insert(0, str(project_root))

# Import utilities
from src.utils.helpers import ROIMetadata

## Configuration

In [None]:
# Dataset to prepare: 'bodenmiller_example' or 'highres_kidney_2025'
DATASET_NAME = 'bodenmiller_example'

# Directory layout: <project_root>/benchmarks/data/<DATASET_NAME>
benchmark_dir = project_root.joinpath('benchmarks')
data_dir = benchmark_dir.joinpath('data')
dataset_path = data_dir.joinpath(DATASET_NAME)

# Echo the resolved locations so a wrong working directory is spotted early
print(
    f"Benchmark directory: {benchmark_dir}",
    f"Dataset path: {dataset_path}",
    f"Dataset exists: {dataset_path.exists()}",
    sep="\n",
)

## Step 1: Download Dataset

**Manual step required** (Zenodo doesn't allow programmatic bulk download without an API key):

### For Bodenmiller Example Dataset
1. Visit: https://zenodo.org/records/5949116
2. Download all `.txt` files (IMC raw data)
3. Place in: `benchmarks/data/bodenmiller_example/`

### For High-res Kidney Dataset
1. Visit: https://doi.org/10.5281/zenodo.17077712
2. Download kidney tissue samples (`.txt` or `.mcd` format)
3. Place in: `benchmarks/data/highres_kidney_2025/`

**Alternative**: Use download script:
```bash
cd ../../
./benchmarks/scripts/download_datasets.sh bodenmiller_example
```

## Step 2: Verify Data Integrity

In [None]:
def find_imc_files(directory: Path) -> List[Path]:
    """Collect every ``.txt`` file below ``directory``, sorted by path.

    The search is recursive, so files in nested sub-directories are included.
    Matching is by extension only; no check is made that a file really holds
    IMC data.
    """
    txt_files = list(directory.rglob('*.txt'))
    txt_files.sort()
    return txt_files

def load_imc_txt(file_path: Path) -> Tuple[np.ndarray, List[str]]:
    """Load an IMC .txt export and return its pixel data and channel names.

    The file is read as a table: the first line holds the tab-separated column
    names and every following line is one numeric row (one pixel). Columns
    named ``Start_push``, ``End_push``, ``Pushes_duration``, ``X``, ``Y`` and
    ``Z`` are treated as acquisition bookkeeping; all other columns are
    reported as channels.
    NOTE(review): this assumes the usual IMC text-export layout -- confirm for
    datasets produced by other tools.

    Args:
        file_path: Path to the .txt file to read.

    Returns:
        data: ``(height, width, channels)`` array when the table has ``X`` and
            ``Y`` columns (pixels without a row stay 0). Without coordinate
            columns the rows cannot be placed on a grid, so a 2-D
            ``(n_rows, channels)`` array is returned instead.
        channel_names: List of channel/marker names, in file column order.

    Raises:
        ValueError: If the header line is missing, the file has no data rows,
            the row width does not match the header, or a coordinate is
            negative.
    """
    acquisition_columns = ('Start_push', 'End_push', 'Pushes_duration', 'X', 'Y', 'Z')

    # The column names live on the first line only; everything after is data.
    with open(file_path, 'r') as f:
        header = f.readline().strip()
    if not header:
        raise ValueError(f"{file_path}: missing header line")

    columns = [name.strip() for name in header.split('\t')]
    channel_idx = [i for i, name in enumerate(columns) if name not in acquisition_columns]
    channel_names = [columns[i] for i in channel_idx]

    # Skip exactly the one header row; ndmin=2 keeps a single-pixel file 2-D.
    table = np.loadtxt(file_path, skiprows=1, ndmin=2)
    if table.size == 0:
        raise ValueError(f"{file_path}: no pixel rows found after the header")
    if table.shape[1] != len(columns):
        raise ValueError(
            f"{file_path}: header names {len(columns)} columns "
            f"but data rows have {table.shape[1]}"
        )

    values = table[:, channel_idx]
    if 'X' not in columns or 'Y' not in columns:
        return values, channel_names

    x = table[:, columns.index('X')].astype(int)
    y = table[:, columns.index('Y')].astype(int)
    if x.min() < 0 or y.min() < 0:
        # Negative indices would silently wrap around to the far edge.
        raise ValueError(f"{file_path}: negative pixel coordinates")

    # Scatter each row to its (row=Y, col=X) position in the image grid.
    data = np.zeros((y.max() + 1, x.max() + 1, len(channel_names)), dtype=table.dtype)
    data[y, x] = values

    return data, channel_names

def compute_file_checksum(file_path: Path) -> str:
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in 4096-byte blocks so it never has to be held in memory
    in full. The digest is used for integrity checking only.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

# Discover the IMC .txt files available for the selected dataset
imc_files = find_imc_files(dataset_path)
print(f"Found {len(imc_files)} IMC .txt files")

if imc_files:
    # Show a short preview rather than the whole listing
    preview_lines = [f"  - {path.name}" for path in imc_files[:5]]
    print("\nFirst 5 files:", *preview_lines, sep="\n")
else:
    print("\n⚠️  No .txt files found. Please download dataset first (see Step 1).")

In [None]:
# Validate files and collect metadata
file_metadata = []

for imc_file in imc_files:
    # Per-file boundary: a file that cannot be read is reported and skipped so
    # the remaining files are still validated.
    try:
        data, channels = load_imc_txt(imc_file)
        checksum = compute_file_checksum(imc_file)

        # Only the two leading axes describe the image; a trailing channel
        # axis (if present) must not be counted as pixels.
        image_shape = (data.shape[0], data.shape[1])

        metadata = {
            'filename': imc_file.name,
            'roi_name': imc_file.stem,  # Filename without extension
            'file_size_mb': imc_file.stat().st_size / (1024 * 1024),
            'n_channels': len(channels),
            'image_shape': image_shape,
            'n_pixels': int(image_shape[0] * image_shape[1]),
            'checksum': checksum,
            'channels': channels
        }
        file_metadata.append(metadata)

    except Exception as e:
        print(f"Error loading {imc_file.name}: {e}")

# Convert to DataFrame for analysis (the per-file channel lists are left out
# so each row stays scalar-valued)
metadata_df = pd.DataFrame([{k: v for k, v in m.items() if k != 'channels'}
                            for m in file_metadata])

if metadata_df.empty:
    # describe() raises on a DataFrame without columns, so report instead.
    print("\n⚠️  No files could be loaded; skipping summary statistics.")
else:
    print(f"\n✅ Successfully loaded {len(metadata_df)} files")
    print(f"\nSummary Statistics:")
    print(metadata_df.describe())

In [None]:
# Compare the channel sets of all loaded ROIs
all_channels = [set(entry['channels']) for entry in file_metadata]
if all_channels:
    common_channels = set.intersection(*all_channels)
    unique_channels = set.union(*all_channels)
else:
    # Nothing loaded: both sets are empty
    common_channels = set()
    unique_channels = set()

print(
    f"Channel Analysis:",
    f"  Common to all ROIs: {len(common_channels)} channels",
    f"  Unique across all ROIs: {len(unique_channels)} channels",
    sep="\n",
)

if len(common_channels) >= len(unique_channels):
    print("\n✅ All ROIs have consistent channel set")
else:
    # At least one ROI lacks a channel that another ROI has
    print("\n⚠️  Warning: Not all ROIs have same channel set")
    print("  Missing channels per ROI:")
    for entry in file_metadata:
        missing = unique_channels - set(entry['channels'])
        if not missing:
            continue
        print(f"    {entry['roi_name']}: {missing}")

print(f"\nChannel List:")
for position, channel_name in enumerate(sorted(common_channels), start=1):
    print(f"  {position:2d}. {channel_name}")

## Step 3: Create Metadata CSV

Create a metadata table that maps each ROI name to its experimental conditions.

In [None]:
# Extract experimental metadata from filenames if possible
# Common IMC naming: <Study>_<Sample>_<ROI>_<Acquisition>.txt

def parse_roi_name(filename: str) -> Dict[str, str]:
    """Attempt to parse experimental info from filename.

    This is dataset-specific and may need adjustment. No parsing rule is
    implemented yet: only ``roi_name`` (the filename without its extension) is
    derived, and every experimental field is returned as ``'unknown'`` so it
    can be filled in by hand afterwards.

    Example filename stem: ``IMC_241218_Alun_ROI_D7_M2_03_26``.

    Args:
        filename: File name (or path) of the ROI's .txt file.

    Returns:
        Dict with keys ``roi_name``, ``condition``, ``timepoint``,
        ``sample_id`` and ``region``.
    """
    # Remove extension
    roi_name = Path(filename).stem

    # Generic fallback
    return {
        'roi_name': roi_name,
        'condition': 'unknown',
        'timepoint': 'unknown',
        'sample_id': 'unknown',
        'region': 'unknown'
    }

# Build one metadata row per successfully loaded file
loaded_filenames = [entry['filename'] for entry in file_metadata]
roi_metadata_list = list(map(parse_roi_name, loaded_filenames))
roi_metadata_df = pd.DataFrame(roi_metadata_list)

# Preview the first rows so the parsed values can be checked by eye
print("Parsed ROI Metadata:")
print(roi_metadata_df.head(10))

print(
    "\n⚠️  Review parsed metadata and manually correct if needed",
    "   Save corrected metadata to: benchmarks/data/<dataset>/metadata.csv",
    sep="\n",
)

In [None]:
# Write the parsed metadata next to the data as a template for manual editing
metadata_output = dataset_path.joinpath('metadata_parsed.csv')
roi_metadata_df.to_csv(metadata_output, index=False)
print(f"✅ Saved metadata to: {metadata_output}")

# Remind the user of the manual follow-up work
follow_up = (
    "\nNext steps:",
    "1. Review metadata_parsed.csv",
    "2. Manually correct condition/timepoint/sample_id columns",
    "3. Rename to metadata.csv when complete",
)
print(*follow_up, sep="\n")

## Step 4: Generate Summary Statistics

In [None]:
# Create comprehensive dataset summary
if file_metadata:
    # image_shape is stored in array order: index 0 is rows (height),
    # index 1 is columns (width).
    heights = [int(m['image_shape'][0]) for m in file_metadata]
    widths = [int(m['image_shape'][1]) for m in file_metadata]
    image_dimensions = {
        'min_width': min(widths),
        'max_width': max(widths),
        'min_height': min(heights),
        'max_height': max(heights),
    }
else:
    # Nothing was loaded: keep the keys but mark the values as unavailable
    # (written as null in the JSON file).
    image_dimensions = {
        'min_width': None,
        'max_width': None,
        'min_height': None,
        'max_height': None,
    }

dataset_summary = {
    'dataset_name': DATASET_NAME,
    'n_rois': len(file_metadata),
    'n_channels': len(common_channels),
    'channel_names': sorted(common_channels),
    'total_size_mb': sum(m['file_size_mb'] for m in file_metadata),
    'image_dimensions': image_dimensions,
    'total_pixels': int(sum(m['n_pixels'] for m in file_metadata)),
    'file_checksums': {m['filename']: m['checksum'] for m in file_metadata}
}

# Save summary
summary_output = dataset_path / 'dataset_summary.json'
with open(summary_output, 'w') as f:
    json.dump(dataset_summary, f, indent=2)

print(f"Dataset Summary:")
print(f"  ROIs: {dataset_summary['n_rois']}")
print(f"  Channels: {dataset_summary['n_channels']}")
print(f"  Total size: {dataset_summary['total_size_mb']:.1f} MB")
print(f"  Image dimensions: {dataset_summary['image_dimensions']}")
print(f"\n✅ Saved summary to: {summary_output}")

## Step 5: Prepare for Both Pipelines

### Steinbock Format
Steinbock expects:
- `.txt` files in `img/` directory (already have this)
- Optional `panel.csv` with channel metadata
- Run via Docker wrapper script

### Our Pipeline Format
Our pipeline expects:
- `.txt` files (same format)
- `metadata.csv` with ROI annotations
- `config.json` with analysis parameters

In [None]:
# Create Steinbock panel.csv
# NOTE(review): rows are emitted in alphabetical channel order and 'channel'
# holds a 1-based index -- confirm both against the panel format and channel
# ordering that Steinbock expects before running it.
panel_data = []
for i, channel in enumerate(sorted(common_channels), 1):
    # Attempt to parse metal and target from channel name
    # Format often: <Metal><Mass>Di_<Target> or <Target>_<Metal><Mass>
    # Split at the FIRST 'Di' only, so a target that itself contains 'Di' is
    # kept whole instead of being cut at its own 'Di'.
    prefix, marker, remainder = channel.partition('Di')
    if marker:
        metal = prefix + 'Di'
        target = remainder.strip('_')
    else:
        # No metal tag recognisable: use a positional placeholder
        metal = f"Channel{i}"
        target = channel

    panel_data.append({
        'channel': i,
        'name': channel,
        'metal': metal,
        'target': target,
        'keep': 1  # Keep all channels initially
    })

panel_df = pd.DataFrame(panel_data)
panel_output = dataset_path / 'panel.csv'
panel_df.to_csv(panel_output, index=False)

print(f"✅ Created Steinbock panel.csv: {panel_output}")
print(panel_df.head(10))

In [None]:
# Create benchmark config for our pipeline
# The values below are starting defaults written to a JSON file that is meant
# to be reviewed and edited before the pipeline is run.
benchmark_config = {
    "dataset_name": DATASET_NAME,
    "data_directory": str(dataset_path),
    "output_directory": str(benchmark_dir / 'our_outputs' / DATASET_NAME),

    "preprocessing": {
        "arcsinh_cofactor": 5,
        "background_subtraction": False
    },

    "segmentation": {
        "method": "slic",
        "scales_um": [10, 20, 40],
        "dna_channels": ["Ir191Di", "Ir193Di"],  # Update based on actual panel
        "compactness": 0.1,
        "sigma": 1.0
    },

    "spatial_analysis": {
        "k_neighbors": 10,
        "n_permutations": 500,
        "alpha": 0.05
    },

    "clustering": {
        "method": "kmeans",
        "n_clusters_range": [5, 20],
        "optimize": True
    },

    "output_formats": {
        "roi_results": "hdf5",
        "summary": "json",
        "spatial_graphs": "parquet"
    }
}

config_output = benchmark_dir / 'configs' / f'{DATASET_NAME}_config.json'
# parents=True: also create benchmarks/ itself when it does not exist yet,
# instead of failing with FileNotFoundError.
config_output.parent.mkdir(parents=True, exist_ok=True)
with open(config_output, 'w') as f:
    json.dump(benchmark_config, f, indent=2)

print(f"✅ Created benchmark config: {config_output}")
print("\n⚠️  Review and update:")
print("   - DNA channels (based on actual panel)")
print("   - Cell type gating thresholds (if needed)")
print("   - Output paths")

## Summary and Next Steps

In [None]:
# Final report: what was prepared, which files were written, what to do next
banner = "=" * 60
print(banner, "DATA PREPARATION COMPLETE", banner, sep="\n")

dataset_report = [
    f"\nDataset: {DATASET_NAME}",
    f"  Location: {dataset_path}",
    f"  ROIs: {len(file_metadata)}",
    f"  Channels: {len(common_channels)}",
    f"  Total size: {dataset_summary['total_size_mb']:.1f} MB",
]
print("\n".join(dataset_report))

created_files_report = [
    "\nCreated Files:",
    f"  ✅ {metadata_output.name} - ROI metadata (review and rename)",
    f"  ✅ {summary_output.name} - Dataset summary with checksums",
    f"  ✅ {panel_output.name} - Steinbock panel file",
    f"  ✅ {config_output.name} - Our pipeline config",
]
print("\n".join(created_files_report))

next_steps_report = [
    "\nNext Steps:",
    "  1. Review and correct metadata_parsed.csv → rename to metadata.csv",
    "  2. Update benchmark config (DNA channels, parameters)",
    "  3. Run Steinbock pipeline:",
    f"     cd {project_root}",
    f"     ./benchmarks/scripts/run_steinbock_docker.sh {dataset_path}",
    "  4. Run our pipeline:",
    f"     python run_analysis.py --config {config_output}",
    "  5. Proceed to comparison notebook: 04_quantitative_comparison.ipynb",
]
print("\n".join(next_steps_report))

print("\n" + banner)