In [1]:
import gc
import os
import re

import cv2
import numpy as np
import pandas as pd
import tifffile
from matplotlib import pyplot as plt

In [2]:
# ==================== Settings ====================#

# Dataset identifier; the input and output directories are derived from it
dataset = "MERSCOPE_WT_1"
data_path = f"../data/{dataset}/"
output_path = f"../output/{dataset}/"

# Transformation parameters
# pixel_size converts between pixel and physical units (pixel areas are
# multiplied by pixel_size ** 2 further down to obtain μm²).
pixel_size = 0.10799861
# The shifts are truncated toward zero to whole pixels by int().
x_shift, y_shift = int(-266.1734), int(180.2510)

In [3]:
# All DAPI images, ordered by z-index.
# Later cells use a file's position in this list as its z-layer
# (`enumerate(files)` is matched against `global_z`), so runs of digits are
# compared as integers: a plain string sort would place "z10" before "z2".
dapi_dir = data_path + "raw_data/DAPI_images/"
files = sorted(
    (name for name in os.listdir(dapi_dir) if name.startswith("mosaic")),
    # re.split with a capturing group alternates text / digits, so odd
    # positions are always the digit runs.
    key=lambda name: [
        int(token) if position % 2 else token
        for position, token in enumerate(re.split(r"(\d+)", name))
    ],
)
files

['mosaic_DAPI_z0.tif',
 'mosaic_DAPI_z1.tif',
 'mosaic_DAPI_z2.tif',
 'mosaic_DAPI_z3.tif',
 'mosaic_DAPI_z4.tif',
 'mosaic_DAPI_z5.tif',
 'mosaic_DAPI_z6.tif']

In [None]:
# Load the transcript table and keep only the columns used in this notebook
transcript_columns = ["cell_id", "gene", "global_x", "global_y", "global_z"]
transcripts = pd.read_csv(data_path + "raw_data/transcripts.csv")[transcript_columns].copy()
transcripts.head()

In [None]:
# Gene lists used by the analyses below

# Full gene panel ("genes" column of genes.csv)
all_genes = pd.read_csv(data_path + "processed_data/genes.csv")["genes"].tolist()

# Hard-coded granule marker genes
granule_markers = [
    "Camk2a", "Cplx2", "Slc17a7", "Ddn", "Syp",
    "Map1a", "Shank1", "Syn1", "Gria1", "Gria2",
    "Cyfip2", "Vamp2", "Bsn", "Slc32a1", "Nfasc",
    "Syt1", "Tubb3", "Nav1", "Shank3", "Mapt",
]

# Negative controls ("Gene" column of negative_controls.csv)
nc_markers = pd.read_csv(data_path + "processed_data/negative_controls.csv")["Gene"].tolist()

In [None]:
# # Main operations on transcripts

# # Compute DAPI pixel coordinates
# transcripts["row"] = (transcripts["global_y"] / pixel_size).astype(int) + y_shift
# transcripts["col"] = (transcripts["global_x"] / pixel_size).astype(int) + x_shift

# # Add default overlap column
# transcripts["overlaps_nucleus"] = 0

# # Update labels in place
# global_ratio = []

# for j, fname in enumerate(files):
    
#     # Load DAPI image
#     img = tifffile.imread(f"raw_data/DAPI_images/{fname}")
#     img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    
#     # Threshold and dilate
#     th = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 49, -1)
#     th = cv2.dilate(th, np.ones((3, 3), np.uint8), iterations=2)

#     # Save resized visualization (optional)
#     th_small = cv2.resize(th, (3500, 5000), interpolation=cv2.INTER_AREA)
#     cv2.imwrite(f"intermediate_data/images/z_{j}_small.png", th_small)
    
#     # Select transcripts in this z-layer
#     trans_z_mask = transcripts["global_z"] == j
#     trans_z = transcripts[trans_z_mask].copy()
#     row_vals = trans_z["row"].astype(int).values
#     col_vals = trans_z["col"].astype(int).values

#     # Avoid out-of-bounds indexing
#     height, width = th.shape
#     valid = (row_vals >= 0) & (row_vals < height) & (col_vals >= 0) & (col_vals < width)
#     row_valid = row_vals[valid]
#     col_valid = col_vals[valid]
    
#     # Assign in-nucleus labels
#     overlaps = np.zeros(len(trans_z), dtype=int)
#     overlaps[valid] = (th[row_valid, col_valid] != 0).astype(int)

#     # Update main DataFrame in-place
#     transcripts.loc[trans_z.index, "overlaps_nucleus"] = overlaps

#     # Track global ratio
#     global_ratio.append(overlaps.mean())
#     print(f"Iteration {j+1}: {np.sum(row_vals != row_valid)} row mismatches, {np.sum(col_vals != col_valid)} column mismatches, {overlaps.mean():.2%} in-nucleus")

In [None]:
# # Final labeled transcripts
# transcripts = transcripts[["cell_id", "overlaps_nucleus", "gene", "global_x", "global_y", "global_z"]].copy()
# transcripts["global_z"] *= 1.5
# transcripts = transcripts.rename(columns = {"gene": "target"})
# transcripts.to_parquet("intermediate_data/transcripts.parquet")
# transcripts.head()

In [4]:
# ==================== Initial Nuclei Mask Analysis (No Dilation) ====================#
# Streams the DAPI z-layers one at a time and builds four per-pixel summaries:
#   non_zero_count     - number of layers that are non-zero after adaptive thresholding
#   mip_accumulator    - maximum of the normalized layers (uint8)
#   mean_accumulator   - mean of the normalized layers, truncated to uint8
#   median_accumulator - median of the normalized layers, truncated to uint8
#
# Memory: every normalized layer is uint8, so the maximum, the running sum and the
# median are computed on integer data instead of float32 copies (4 bytes per pixel
# per copy). Integer max / floor-division / median give the same values as float
# accumulation followed by truncation to uint8. The median still needs every layer
# resident (1 byte per pixel per layer); only its temporaries are bounded, by
# working on row slabs.
# Assumes all layers share the shape of the first one.

MEDIAN_CHUNK_ROWS = 1024  # rows per slab when computing the median projection


def _load_dapi_uint8(fname):
    """Read one DAPI layer from the DAPI image folder and min-max rescale it to uint8 (0-255)."""
    raw = tifffile.imread(os.path.join(data_path, "raw_data/DAPI_images", fname))
    return cv2.normalize(raw, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)


def _threshold_nuclei(img_u8):
    """Gaussian adaptive threshold (block size 49, C = -1) applied to each layer below."""
    return cv2.adaptiveThreshold(img_u8, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 49, -1)


if not files:
    raise FileNotFoundError(f"No DAPI images found under {data_path}raw_data/DAPI_images/")

# Read all DAPI images from different z-layers (memory-efficient incremental processing)
print(f"Reading all DAPI images from {len(files)} z-layers...\n")
num_layers = len(files)

# First pass: get image dimensions
print("First pass: Getting image dimensions...")
first_img = _load_dapi_uint8(files[0])
height, width = first_img.shape
print(f"Image shape: {height} x {width}\n")

# Initialize accumulators from the first layer.
# The running sum needs more than 8 bits: uint16 holds up to 257 layers of 8-bit values.
sum_dtype = np.uint16 if num_layers * 255 <= np.iinfo(np.uint16).max else np.uint32
non_zero_count = (_threshold_nuclei(first_img) > 0).astype(np.uint8)
mip_accumulator = first_img.copy()
layer_sum = first_img.astype(sum_dtype)
layers_u8 = [first_img]  # kept only for the median projection

# Process remaining images incrementally (one at a time to save memory)
print("Processing images incrementally to save memory...")
for i, fname in enumerate(files[1:], start=1):
    print(f"  Processing layer {i+1}/{len(files)}: {fname}")
    img = _load_dapi_uint8(fname)

    # Update accumulators in place (no per-layer float copies)
    non_zero_count += (_threshold_nuclei(img) > 0)
    np.maximum(mip_accumulator, img, out=mip_accumulator)
    layer_sum += img
    layers_u8.append(img)

    del img
    if i % 2 == 0:  # Force garbage collection every 2 images
        gc.collect()

# Finalize the mean: integer floor division equals truncating the float mean
np.floor_divide(layer_sum, num_layers, out=layer_sum)
mean_accumulator = layer_sum.astype(np.uint8)
del layer_sum

# Median projection, computed slab by slab so that the stacked copy and the
# float64 result of np.median only ever cover MEDIAN_CHUNK_ROWS rows.
print("Computing median projection...")
median_accumulator = np.empty((height, width), dtype=np.uint8)
for row_start in range(0, height, MEDIAN_CHUNK_ROWS):
    row_stop = min(row_start + MEDIAN_CHUNK_ROWS, height)
    slab = np.stack([layer[row_start:row_stop] for layer in layers_u8], axis=0)
    median_accumulator[row_start:row_stop] = np.median(slab, axis=0).astype(np.uint8)
    del slab
del layers_u8, first_img
gc.collect()

print(f"Completed processing all {len(files)} images\n")

# Stacking strategies. Each entry maps a display name to a zero-argument
# callable ("mask") that builds a 0/255 uint8 mask from the pre-computed
# accumulators, plus a human-readable description. The accumulators are looked
# up when the callable runs, not when this dict is built.


def _count_mask(min_layers):
    """Return a callable producing the mask of pixels set in at least `min_layers` thresholded layers."""
    def build():
        return (non_zero_count >= min_layers).astype(np.uint8) * 255
    return build


def _threshold_projection(projection):
    """Apply the Gaussian adaptive threshold (block size 49, C = -1) to a projection image."""
    return cv2.adaptiveThreshold(
        projection, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 49, -1
    )


strategies = {}

# Vote-based strategies: pixel is kept when enough layers agree
for min_layers, layer_label in [(1, "1 layer"), (2, "2 layers"), (3, "3 layers"), (4, "4 layers")]:
    strategies[f"At least {layer_label}"] = {
        "mask": _count_mask(min_layers),
        "description": f"Pixel is 1 if non-zero in at least {layer_label}",
    }

# Projection-based strategies: collapse the stack first, then threshold
strategies["Maximum Intensity Projection (MIP)"] = {
    "mask": lambda: _threshold_projection(mip_accumulator),
    "description": "Take max across layers, then threshold",
}
strategies["Mean Intensity Projection"] = {
    "mask": lambda: _threshold_projection(mean_accumulator),
    "description": "Take mean across layers, then threshold",
}
strategies["Median Intensity Projection"] = {
    "mask": lambda: _threshold_projection(median_accumulator),
    "description": "Take median across layers, then threshold",
}

# Size of the saved previews (computed once, since all images have the same size):
# the shorter image side is mapped to 5000 px and the other side is scaled by the
# same factor, truncated to an integer.
if height >= width:
    scale_factor = 5000 / width
    new_width, new_height = 5000, int(height * scale_factor)
else:
    scale_factor = 5000 / height
    new_height, new_width = 5000, int(width * scale_factor)

# Directory that receives the preview images and summary tables
output_dir = os.path.join(data_path, "intermediate_data")
os.makedirs(output_dir, exist_ok=True)

# Process each strategy: build its merged mask, measure the external contours,
# print and record summary statistics, and save a downsampled preview of the mask.


def _print_stat_block(header, values, decimals):
    """Print mean / median / min / max / std of `values` under `header` with `decimals` decimals."""
    print(f"\n{header}")
    for label, stat in (("Mean", np.mean), ("Median", np.median), ("Min", np.min),
                        ("Max", np.max), ("Std", np.std)):
        print(f"  {label}: {stat(values):.{decimals}f}")


results = []
for strategy_name, strategy_info in strategies.items():
    print("="*80)
    print(f"STRATEGY: {strategy_name}")
    print(f"Description: {strategy_info['description']}")
    print("="*80)

    # Generate merged mask
    merged_mask = strategy_info["mask"]()

    # Detect individual nuclei masks using findContours (outer contours only)
    contours, _ = cv2.findContours(merged_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Per-contour measurements; contours with zero area are dropped.
    # `nuclei_areas_pixels` stays a plain list because it is read again outside this loop.
    nuclei_areas_pixels = [area for area in map(cv2.contourArea, contours) if area > 0]
    nuclei_areas_um2 = [area * (pixel_size ** 2) for area in nuclei_areas_pixels]
    # Approximate radius assuming circular shape: area = π * r^2
    nuclei_radii_um = [np.sqrt(area / np.pi) for area in nuclei_areas_um2]

    if nuclei_areas_pixels:
        print(f"Number of detected nuclei: {len(nuclei_areas_pixels)}")
        _print_stat_block("Area statistics (pixels²):", nuclei_areas_pixels, 1)
        _print_stat_block("Area statistics (μm²):", nuclei_areas_um2, 2)
        _print_stat_block("Radius statistics (μm, approximated as circle):", nuclei_radii_um, 2)
        summary = {
            "mean_area_pixels": np.mean(nuclei_areas_pixels),
            "median_area_pixels": np.median(nuclei_areas_pixels),
            "mean_area_um2": np.mean(nuclei_areas_um2),
            "median_area_um2": np.median(nuclei_areas_um2),
            "mean_radius_um": np.mean(nuclei_radii_um),
            "median_radius_um": np.median(nuclei_radii_um),
        }
    else:
        print("No nuclei detected!")
        summary = {
            "mean_area_pixels": 0,
            "median_area_pixels": 0,
            "mean_area_um2": 0,
            "median_area_um2": 0,
            "mean_radius_um": 0,
            "median_radius_um": 0,
        }

    # Store results
    results.append({
        "strategy": strategy_name,
        "num_nuclei": len(nuclei_areas_pixels),
        **summary,
    })

    # Save downsampled merged mask.
    # The file name is held in `mask_path` so the notebook-level `output_path`
    # setting is not overwritten by this loop.
    merged_mask_downsampled = cv2.resize(merged_mask, (new_width, new_height), interpolation=cv2.INTER_AREA)
    safe_filename = strategy_name.lower().replace(" ", "_").replace("(", "").replace(")", "").replace(",", "")
    mask_path = os.path.join(output_dir, f"merged_mask_{safe_filename}_downsampled.png")
    # cv2.imwrite signals failure through its return value rather than an exception
    if cv2.imwrite(mask_path, merged_mask_downsampled):
        print(f"\nDownsampled merged mask saved to: {mask_path}")
    else:
        print(f"\nWARNING: could not write downsampled merged mask to: {mask_path}")
    print(f"Downsampled size: {new_height} x {new_width} pixels\n")

    # Free memory after processing each strategy
    del merged_mask, merged_mask_downsampled, contours
    gc.collect()

# Clean up large accumulators
del non_zero_count, mip_accumulator, mean_accumulator, median_accumulator
gc.collect()

# Summary of all strategies: one row per strategy, printed and written to CSV
results_df = pd.DataFrame(results)

banner = "=" * 80
print(banner)
print("SUMMARY OF ALL STRATEGIES")
print(banner)
print(results_df.to_string(index=False))
print("\n" + banner)

# Persist the table next to the saved mask previews
summary_path = os.path.join(output_dir, "stacking_strategies_summary.csv")
results_df.to_csv(summary_path, index=False)
print(f"Summary saved to: {summary_path}")


Reading all DAPI images from 7 z-layers...

First pass: Getting image dimensions...
Image shape: 104161 x 70528

Processing images incrementally to save memory...
  Processing layer 2/7: mosaic_DAPI_z1.tif


: 

In [20]:
# Median area (pixels²) of the detected contours larger than 1000 pixels².
# NOTE(review): `nuclei_areas_pixels` is not defined in this cell; the strategy
# loop above reassigns it on every iteration, so this reads whichever strategy
# ran last - confirm that is the intended one.
large_nuclei_areas = [area for area in nuclei_areas_pixels if area > 1000]
np.median(large_nuclei_areas)

1839.5

In [None]:
# ==================== Dilation Benchmarking ====================#
# For each (kernel size, iterations) pair: dilate the thresholded DAPI mask of
# every z-layer, collect the contour areas of the dilated mask, and measure the
# fraction of transcripts that fall outside it ("extrasomatic").
#
# Each DAPI layer is read and thresholded once and then reused for all settings;
# per-setting statistics are tallied as counts instead of per-transcript lists.

# Benchmark settings
kernel_sizes = [3, 5]
iterations_list = [2, 4, 6, 8, 10]
benchmark_settings = [(k, n) for k in kernel_sizes for n in iterations_list]

# Transcript -> DAPI pixel coordinates. Precomputed "row"/"col" columns are used
# when present; otherwise they are derived from the global coordinates with the
# pixel size and shifts from the settings cell, so this cell also runs on a
# fresh kernel.
if {"row", "col"}.issubset(transcripts.columns):
    pixel_rows = transcripts["row"].astype(int).to_numpy()
    pixel_cols = transcripts["col"].astype(int).to_numpy()
else:
    pixel_rows = (transcripts["global_y"] / pixel_size).astype(int).to_numpy() + y_shift
    pixel_cols = (transcripts["global_x"] / pixel_size).astype(int).to_numpy() + x_shift

# Per-transcript layer and gene-group membership, computed once for all settings
transcript_layer = transcripts["global_z"].to_numpy()
is_granule = transcripts["gene"].isin(granule_markers).to_numpy()
is_nc = transcripts["gene"].isin(nc_markers).to_numpy()

# One tally per setting: contour areas plus (count, overlapping count) per gene group
tallies = {
    setting: {
        "areas": [],
        "n_all": 0, "hits_all": 0,
        "n_granule": 0, "hits_granule": 0,
        "n_nc": 0, "hits_nc": 0,
    }
    for setting in benchmark_settings
}


def _extrasomatic_fraction(hits, total):
    """Return 1 - hits/total, or 0 when there are no transcripts in the group."""
    return 1 - hits / total if total > 0 else 0


print("Starting dilation benchmarking...\n")

# Process each z-layer (list position is used as the z index of the layer)
for j, fname in enumerate(files):
    print(f"Processing layer {j+1}/{len(files)}: {fname}")

    # Load DAPI image and threshold it once for all settings
    img_path = os.path.join(data_path, "raw_data/DAPI_images", fname)
    img = tifffile.imread(img_path)
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    th = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 49, -1)
    del img  # only the binary mask is needed from here on

    # Transcripts belonging to this z-layer
    in_layer = transcript_layer == j
    row_vals = pixel_rows[in_layer]
    col_vals = pixel_cols[in_layer]
    granule_in_layer = is_granule[in_layer]
    nc_in_layer = is_nc[in_layer]
    n_granule_layer = int(granule_in_layer.sum())
    n_nc_layer = int(nc_in_layer.sum())

    # Avoid out-of-bounds indexing; out-of-bounds transcripts count as non-overlapping
    height, width = th.shape
    valid = (row_vals >= 0) & (row_vals < height) & (col_vals >= 0) & (col_vals < width)
    row_valid = row_vals[valid]
    col_valid = col_vals[valid]

    for kernel_size, iterations in benchmark_settings:
        tally = tallies[(kernel_size, iterations)]

        # Dilate the un-dilated threshold with the current settings
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        dilated = cv2.dilate(th, kernel, iterations=iterations)

        # Areas of the external contours; zero-area contours are dropped
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        tally["areas"].extend(area for area in map(cv2.contourArea, contours) if area > 0)

        # Overlap flag per transcript of this layer
        overlaps = np.zeros(len(row_vals), dtype=bool)
        overlaps[valid] = dilated[row_valid, col_valid] != 0

        tally["n_all"] += int(overlaps.size)
        tally["hits_all"] += int(overlaps.sum())
        tally["n_granule"] += n_granule_layer
        tally["hits_granule"] += int(overlaps[granule_in_layer].sum())
        tally["n_nc"] += n_nc_layer
        tally["hits_nc"] += int(overlaps[nc_in_layer].sum())

        del dilated, contours

    del th

# Store results (one row per setting, in settings order)
results = []
for kernel_size, iterations in benchmark_settings:
    tally = tallies[(kernel_size, iterations)]
    all_areas = tally["areas"]
    print(f"Testing: {kernel_size}x{kernel_size} kernel, {iterations} iterations")

    # Area statistics; the radius is derived from the average area
    if len(all_areas) > 0:
        avg_area_pixels = np.mean(all_areas)
        avg_area_um2 = avg_area_pixels * (pixel_size ** 2)
        # Approximate radius assuming circular shape: area = π * r^2
        avg_radius_um = np.sqrt(avg_area_um2 / np.pi)
    else:
        avg_area_pixels = 0
        avg_area_um2 = 0
        avg_radius_um = 0

    # Extrasomatic fractions (1 - overlap fraction)
    extrasomatic_all = _extrasomatic_fraction(tally["hits_all"], tally["n_all"])
    extrasomatic_granule = _extrasomatic_fraction(tally["hits_granule"], tally["n_granule"])
    extrasomatic_nc = _extrasomatic_fraction(tally["hits_nc"], tally["n_nc"])

    results.append({
        'kernel_size': kernel_size,
        'iterations': iterations,
        'avg_area_pixels': avg_area_pixels,
        'avg_area_um2': avg_area_um2,
        'avg_radius_um': avg_radius_um,
        'extrasomatic_all_genes': extrasomatic_all,
        'extrasomatic_granule_markers': extrasomatic_granule,
        'extrasomatic_negative_controls': extrasomatic_nc,
        'num_nuclei': len(all_areas),
        'num_transcripts_all': tally["n_all"],
        'num_transcripts_granule': tally["n_granule"],
        'num_transcripts_nc': tally["n_nc"]
    })

    print(f"  Average area: {avg_area_pixels:.1f} pixels² ({avg_area_um2:.2f} μm²), radius: {avg_radius_um:.2f} μm")
    print(f"  Extrasomatic fractions: All={extrasomatic_all:.2%}, Granule={extrasomatic_granule:.2%}, NC={extrasomatic_nc:.2%}\n")

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("BENCHMARK RESULTS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))
print("\n" + "="*80)

# Save results to CSV (create the target directory if no earlier cell did)
benchmark_csv_path = os.path.join(data_path, "intermediate_data", "dilation_benchmark_results.csv")
os.makedirs(os.path.dirname(benchmark_csv_path), exist_ok=True)
results_df.to_csv(benchmark_csv_path, index=False)
print(f"\nResults saved to: {os.path.join(data_path, 'intermediate_data', 'dilation_benchmark_results.csv')}")