# Configure Merge Module Params

This notebook should be used as a test for ensuring correct merge parameters before merge processing.
Cells marked with <font color='red'>SET PARAMETERS</font> contain crucial variables that need to be set according to your specific experimental setup and data organization.
Please review and modify these variables as needed before proceeding with the analysis.

## <font color='red'>SET PARAMETERS</font>

### Fixed parameters for merge processing

- `CONFIG_FILE_PATH`: Path to a Brieflow config file used during processing. Absolute or relative to where workflows are run from.

In [None]:
# Path to the Brieflow config file. This notebook reads it to locate inputs and
# rewrites it at the end with the added "merge" section.
CONFIG_FILE_PATH = "config/config.yml"

In [None]:
import warnings
from pathlib import Path
import yaml
import pandas as pd

from lib.shared.file_utils import get_filename
from lib.shared.configuration_utils import CONFIG_FILE_HEADER, convert_tuples_to_lists
from lib.merge.merge_utils import (
    plot_combined_tile_grid,
    plot_merge_example,
    preview_mask_transformations,
    align_metadata,
    find_closest_tiles,
    fast_merge_example,
)
from lib.merge.hash import hash_cell_locations, initial_alignment
from lib.merge.eval_alignment import plot_alignment_quality

## <font color='red'>SET PARAMETERS</font>

### Determine merge plate-well combos
- `MERGE_COMBO_DF_FP`: Path where the table of plate-well combinations used for merge is saved (tab-separated file).

In [None]:
# Tab-separated file the merge plate-well combinations are written to;
# the same path is stored in the config under merge -> merge_combo_fp.
MERGE_COMBO_DF_FP = "config/merge_combo.tsv"

In [None]:
# Load the Brieflow config file; it supplies the wildcard-combo paths read below
with open(CONFIG_FILE_PATH, "r") as config_file:
    config = yaml.safe_load(config_file)

# Read the SBS and phenotype wildcard tables referenced by the preprocess config
SBS_COMBO_FP = Path(config["preprocess"]["sbs_combo_fp"])
sbs_wildcard_combos = pd.read_csv(SBS_COMBO_FP, sep="\t")
PHENOTYPE_COMBO_FP = Path(config["preprocess"]["phenotype_combo_fp"])
phenotype_wildcard_combos = pd.read_csv(PHENOTYPE_COMBO_FP, sep="\t")

# Collect the distinct (plate, well) pairs of each modality for comparison
sbs_combos = set(zip(sbs_wildcard_combos["plate"], sbs_wildcard_combos["well"]))
phenotype_combos = set(
    zip(phenotype_wildcard_combos["plate"], phenotype_wildcard_combos["well"])
)

# Merging requires SBS and phenotype to cover exactly the same plate-well pairs
if sbs_combos == phenotype_combos:
    # Take the unique pairs from the SBS table in file order rather than
    # iterating the set: set iteration order can change between sessions,
    # which would reshuffle the rows of the saved TSV on every run.
    merge_wildcard_combos = (
        sbs_wildcard_combos[["plate", "well"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
else:
    warnings.warn(
        "SBS and PHENOTYPE do not have matching plate-well combinations. Merging requires identical sets."
    )
    # No valid combinations: write an empty table with the expected columns
    merge_wildcard_combos = pd.DataFrame(columns=["plate", "well"])

merge_wildcard_combos.to_csv(MERGE_COMBO_DF_FP, sep="\t", index=False)
merge_wildcard_combos

## <font color='red'>SET PARAMETERS</font>

### Parameters for testing merge module
- `TEST_PLATE`: Plate used for testing configuration 
- `TEST_WELL`: Well identifier used for testing configuration 

### Parameters for metadata extraction
- `SBS_METADATA_CYCLE`: Cycle number for extracting SBS data positions from the combined metadata file
- `SBS_METADATA_CHANNEL`: Optional channel filter for SBS metadata. Use this to filter the combined metadata file to a specific channel when multiple channels were acquired. If not specified, metadata will be automatically deduplicated by plate, well, and tile.
- `PH_METADATA_CHANNEL`: Optional channel filter for phenotype metadata. Use this to filter the combined metadata file to a specific channel when multiple channels were acquired. If not specified, metadata will be automatically deduplicated by plate, well, and tile.

In [None]:
# Plate and well used to build the metadata / info parquet filenames loaded below.
# Both must be set before running the following cells.
TEST_PLATE = None
TEST_WELL = None

# SBS metadata rows are always filtered to this cycle.
SBS_METADATA_CYCLE = 1
# Optional channel filters. When left as None, the corresponding metadata is
# deduplicated by plate, well, and tile instead of being filtered by channel.
SBS_METADATA_CHANNEL = None
PH_METADATA_CHANNEL = None

In [None]:
ROOT_FP = Path(config["all"]["root_fp"])

# ---- Phenotype metadata for the test plate/well ----
# Filename wildcards: plate and well always, channel only when a filter is set
ph_filename_params = dict(plate=TEST_PLATE, well=TEST_WELL)
if PH_METADATA_CHANNEL is not None:
    ph_filename_params.update(channel=PH_METADATA_CHANNEL)

ph_test_metadata_fp = ROOT_FP.joinpath(
    "preprocess",
    "metadata",
    "phenotype",
    get_filename(ph_filename_params, "combined_metadata", "parquet"),
)
ph_test_metadata = pd.read_parquet(ph_test_metadata_fp)

# Without a channel filter keep one row per plate/well/tile; otherwise keep the channel
if PH_METADATA_CHANNEL is None:
    ph_test_metadata = ph_test_metadata.drop_duplicates(subset=["plate", "well", "tile"])
else:
    ph_test_metadata = ph_test_metadata.loc[
        ph_test_metadata["channel"] == PH_METADATA_CHANNEL
    ]

# ---- SBS metadata for the test plate/well ----
sbs_filename_params = dict(plate=TEST_PLATE, well=TEST_WELL)
if SBS_METADATA_CHANNEL is not None:
    sbs_filename_params.update(channel=SBS_METADATA_CHANNEL)

sbs_test_metadata_fp = ROOT_FP.joinpath(
    "preprocess",
    "metadata",
    "sbs",
    get_filename(sbs_filename_params, "combined_metadata", "parquet"),
)
sbs_test_metadata = pd.read_parquet(sbs_test_metadata_fp)

# The cycle filter is always applied first; then either the channel filter
# or, when no channel is given, deduplication by plate/well/tile
sbs_test_metadata = sbs_test_metadata.loc[
    sbs_test_metadata["cycle"] == SBS_METADATA_CYCLE
]
if SBS_METADATA_CHANNEL is None:
    sbs_test_metadata = sbs_test_metadata.drop_duplicates(subset=["plate", "well", "tile"])
else:
    sbs_test_metadata = sbs_test_metadata.loc[
        sbs_test_metadata["channel"] == SBS_METADATA_CHANNEL
    ]

# ---- Phenotype info table (loaded only; no hashing happens in this cell) ----
phenotype_info_fp = ROOT_FP.joinpath(
    "phenotype",
    "parquets",
    get_filename({"plate": TEST_PLATE, "well": TEST_WELL}, "phenotype_info", "parquet"),
)
phenotype_info = pd.read_parquet(phenotype_info_fp)

# ---- SBS info table (loaded only; no hashing happens in this cell) ----
sbs_info_fp = ROOT_FP.joinpath(
    "sbs",
    "parquets",
    get_filename({"plate": TEST_PLATE, "well": TEST_WELL}, "sbs_info", "parquet"),
)
sbs_info = pd.read_parquet(sbs_info_fp)

# Show phenotype and SBS tile positions together in one plot
combined_tile_grid = plot_combined_tile_grid(ph_test_metadata, sbs_test_metadata)
combined_tile_grid.show()

## <font color='red'>SET PARAMETERS</font>

### Parameters for metadata alignment
Each microscope handles global coordinates differently. If the datasets were acquired on two different microscopes, the well metadata needs to be aligned.

`METADATA_ALIGN`: Whether to perform metadata alignment. Defaults `False`.

`ALIGNMENT_FLIP_X`: Flip images left-to-right (horizontal flip). Defaults `False`.

`ALIGNMENT_FLIP_Y`: Flip images up-down (vertical flip). Defaults `False`.

`ALIGNMENT_ROTATE_90`: Whether to rotate 90 degrees counterclockwise. Defaults `False`.

In [None]:
# When True, the next cell runs align_metadata with the flags below and
# re-plots the combined tile grid; when False the metadata is used as loaded.
METADATA_ALIGN = False
# Passed to align_metadata as flip_x / flip_y / rotate_90 and also written to the config.
ALIGNMENT_FLIP_X = False
ALIGNMENT_FLIP_Y = False
ALIGNMENT_ROTATE_90 = False

In [None]:
# Optionally flip/rotate the metadata coordinates; otherwise pass the metadata through
if not METADATA_ALIGN:
    # No alignment requested: downstream cells use the metadata exactly as loaded
    sbs_aligned, ph_aligned = sbs_test_metadata, ph_test_metadata
else:
    sbs_aligned, ph_aligned, transform_info = align_metadata(
        sbs_test_metadata,
        ph_test_metadata,
        flip_x=ALIGNMENT_FLIP_X,
        flip_y=ALIGNMENT_FLIP_Y,
        rotate_90=ALIGNMENT_ROTATE_90,
    )
    # Re-draw the combined tile grid so the effect of the transformation can be checked
    combined_tile_grid = plot_combined_tile_grid(ph_aligned, sbs_aligned)
    combined_tile_grid.show()

## <font color='red'>SET PARAMETERS</font>

### Parameters for testing merge processing

- `SBS_TEST_SITES`: SBS images (sites) used to compute suggested matches for `INITIAL_SITES`. We recommend using 6 sites distributed across the plate.

- `INITIAL_SITES`: Combinations of phenotype and SBS tiles used for configuring merge module parameters. Based on the best matches calculated below, set a list of `[tile, site]` pairs. We will load images for those sites to ensure that we can visualize cell patterns that correspond between two tiles that will make up our initial sites.
  - Ex. `[[1, 0], [231, 11], [560, 25], [943, 49], [1123, 53], [1357, 67]]`. 
 

In [None]:
# SBS sites to find phenotype matches for; must be set to an iterable of site ids
SBS_TEST_SITES = None

# For each SBS site, pair it with the first (top-ranked) phenotype tile returned
# by find_closest_tiles, stored as [phenotype_tile, sbs_site]
initial_sites = [
    [int(find_closest_tiles(sbs_aligned, ph_aligned, tile_id).iloc[0]["tile"]), tile_id]
    for tile_id in SBS_TEST_SITES
]

# Print the result in a form that can be pasted into INITIAL_SITES
print("\n" + "=" * 50)
print("Best matches:")
print("=" * 50)
print(f"INITIAL_SITES = {initial_sites}")

In [None]:
# List of [phenotype_tile, sbs_site] pairs used for the initial alignment;
# must be set before running the cells below (they iterate over it).
INITIAL_SITES = None

In [None]:
# Split the [phenotype_tile, sbs_site] pairs into one list per modality
phenotype_tiles = [pair[0] for pair in INITIAL_SITES]
sbs_tiles = [pair[1] for pair in INITIAL_SITES]

# Load the phenotype info table for the test well and hash its cell locations
phenotype_info_fp = ROOT_FP.joinpath(
    "phenotype",
    "parquets",
    get_filename({"plate": TEST_PLATE, "well": TEST_WELL}, "phenotype_info", "parquet"),
)
phenotype_info = pd.read_parquet(phenotype_info_fp)
phenotype_info_hash = hash_cell_locations(phenotype_info)

# Same for SBS; the hashed table's "tile" column is renamed to "site"
sbs_info_fp = ROOT_FP.joinpath(
    "sbs",
    "parquets",
    get_filename({"plate": TEST_PLATE, "well": TEST_WELL}, "sbs_info", "parquet"),
)
sbs_info = pd.read_parquet(sbs_info_fp)
sbs_info_hash = hash_cell_locations(sbs_info)
sbs_info_hash = sbs_info_hash.rename(columns={"tile": "site"})

# Run the initial alignment restricted to the chosen tile/site pairs
initial_alignment_df = initial_alignment(
    phenotype_info_hash,
    sbs_info_hash,
    initial_sites=INITIAL_SITES,
)
initial_alignment_df

## <font color='red'>SET PARAMETERS</font>

### Visualize gating strategy based on initial alignment

- `DET_RANGE`: Enforces valid transformation ratios between phenotype and SBS images.
  - **Using acquisition parameters (same microscope):**
    - Calculate magnification ratio: `M = Mag_phenotype / Mag_sbs` (e.g., 20X/10X = 2)
    - Calculate binning ratio: `B = Bin_sbs / Bin_phenotype` (e.g., 2x2/1x1 = 2)
    - Total difference factor = M × B
    - `DET_RANGE = [0.96/(M×B)², 1.04/(M×B)²]`
    - Example:
      - With 2× magnification difference (20X/10X) and 2× binning difference (2x2 vs 1x1)
      - Total difference factor = 2 × 2 = 4
      - `DET_RANGE = [0.96/16, 1.04/16] = [0.06, 0.065]`
  - **Alternative - Using calibration values (different microscopes):**
    - Get µm/pixel calibration from each image's metadata
    - `det = (cal_phenotype / cal_sbs)²`
    - `DET_RANGE = [det × 0.96, det × 1.04]`
  - **Tolerance values (0.96 and 1.04):**
    - These represent ±4% tolerance around the expected determinant
    - Accounts for small optical distortions, stage positioning errors, and variation in actual vs nominal magnification
    - Can be widened if alignments are being rejected (e.g., 0.9 to 1.1 for ±10% tolerance)
    - Tighter tolerances reject more false matches; wider tolerances accept more valid edge cases
  - Note: Image tile size does not affect the determinant calculation
- `SCORE`: Minimum alignment quality score, typically 0.1

In [None]:
# Gating parameters passed to plot_alignment_quality below and written to the
# config (SCORE always; DET_RANGE only for the non-stitch approach).
DET_RANGE = None
SCORE = None

In [None]:
# Visualize the initial alignments against the chosen determinant range and score gate
plot_alignment_quality(
    initial_alignment_df,
    det_range=DET_RANGE,
    score=SCORE,
    xlim=(0, 0.1),
    ylim=(0, 1),
)

## <font color='red'>SET PARAMETERS</font>

### Visualize cell matches based on initial alignment

- `THRESHOLD`: Maximum Euclidean distance between a phenotype point and its matched SBS point for the pair to be considered a valid match.

In [None]:
# Distance threshold passed to fast_merge_example below and written to the config.
THRESHOLD = None

In [None]:
# Run a fast merge example for every configured tile/site pair
for ph_tile, sbs_site in INITIAL_SITES:
    success = fast_merge_example(
        ph_tile,
        sbs_site,
        initial_alignment_df,
        phenotype_info,
        sbs_info,
        THRESHOLD,
    )
    if success:
        continue
    # The example reported failure for this pair: point the user at the alternatives
    print("  Try a different tile-site combination or proceed to stitch approach.")

## <font color='red'>SET PARAMETERS (OPTIONAL): STITCH APPROACH</font>

### Parameters for image stitching

If no successful initial sites can be configured or results show poor performance, try the stitch-based merge approach.

`STITCH`: Whether to merge using stitch approach. This approach stitches the images into wells before performing alignment, merge, and deduplication.

`MASK_TYPE`: Type of object to align.
- `"nuclei"` uses segmented nuclei masks.
- `"cells"` uses segmented cell masks.

### Parameters for image orientation
Each microscope handles individual tile coordinates differently for stitching. Adjust the following parameters until you obtain images that look right.

`FLIPUD`: Flip images upside-down (vertical flip). Defaults `False`.

`FLIPLR`: Flip images left-to-right (horizontal flip). Defaults `False`.

`ROT90`: Number of 90° rotations to apply to the image. For example, `ROT90 = 1` rotates the image 90° clockwise, `ROT90 = 2` rotates 180°, and so on.

`NUM_TILES_PHENO` & `NUM_TILES_SBS`: For testing purposes, number of tiles to display. Higher numbers may increase processing time but allow a larger view of the well.

**Eval Options:**
- `STITCHED_IMAGE`: Determines whether a stitched image will be produced for qc. **Note:** Setting this to True will significantly increase processing time but it is recommended on the first run.

In [None]:
# Selects the merge approach stored in the config: "stitch" when True, "fast" otherwise.
STITCH = False
# Mask type and orientation settings passed to preview_mask_transformations below.
MASK_TYPE = "nuclei"
FLIPUD = False
FLIPLR = False
ROT90 = 0
# Written to the config (stitch approach only) as "stitched_image".
STITCHED_IMAGE = False

In [None]:
# Number of phenotype tiles to preview
NUM_TILES_PHENO = None

print("Testing phenotype data:")
# Preview the phenotype masks with the configured orientation settings
ph_preview_kwargs = dict(
    mask_type=MASK_TYPE,
    num_tiles=NUM_TILES_PHENO,
    flipud=FLIPUD,
    fliplr=FLIPLR,
    rot90=ROT90,
)
ph_params = preview_mask_transformations(
    ph_test_metadata, ROOT_FP, "phenotype", **ph_preview_kwargs
)

In [None]:
# Number of SBS tiles to preview
NUM_TILES_SBS = None

print("\nTesting SBS data with same transformation:")
# Preview the SBS masks with the same orientation settings used for phenotype
sbs_preview_kwargs = dict(
    mask_type=MASK_TYPE,
    num_tiles=NUM_TILES_SBS,
    flipud=FLIPUD,
    fliplr=FLIPLR,
    rot90=ROT90,
)
sbs_params = preview_mask_transformations(
    sbs_test_metadata, ROOT_FP, "sbs", **sbs_preview_kwargs
)


### Set pixel size (optional)
Coordinate-based stitching converts stage coordinates (in micrometers) to pixel coordinates. If pixel size is not available in your image metadata you will have to set it manually below.

- `SBS_PIXEL_SIZE`: Pixel size (in μm/pixel) of SBS images.
- `PHENOTYPE_PIXEL_SIZE`: Pixel size (in μm/pixel) of phenotyping images.

In [None]:
# Report the pixel size stored in the metadata, if any, for each modality.
# Values are cast to a built-in float: .iloc[0] typically yields a NumPy scalar,
# which the plain yaml.dump used when saving the config would not write as a
# simple number.

# For SBS
if "pixel_size_x" in sbs_test_metadata.columns:
    SBS_PIXEL_SIZE = float(sbs_test_metadata["pixel_size_x"].iloc[0])
    print(f"SBS pixel size found in metadata: {SBS_PIXEL_SIZE:.6f} μm/pixel")
else:
    print("No pixel_size_x found in SBS metadata.")
    # List the available columns to help locate the pixel size manually
    print(f"SBS columns: {list(sbs_test_metadata.columns)}")

# For Phenotype
if "pixel_size_x" in ph_test_metadata.columns:
    PHENOTYPE_PIXEL_SIZE = float(ph_test_metadata["pixel_size_x"].iloc[0])
    print(f"Phenotype pixel size found in metadata: {PHENOTYPE_PIXEL_SIZE:.6f} μm/pixel")
else:
    print("No pixel_size_x found in phenotype metadata.")
    # List the available columns to help locate the pixel size manually
    print(f"\nPhenotype columns: {list(ph_test_metadata.columns)}")

In [None]:
# Manual pixel sizes (μm/pixel), written to the config for the stitch approach.
# NOTE: running this cell overwrites any value the previous cell read from the metadata.
# NOTE(review): presumably None means "use the pixel size from image metadata" downstream — confirm.
SBS_PIXEL_SIZE = None
PHENOTYPE_PIXEL_SIZE = None

## <font color='red'>SET PARAMETERS</font>

`SBS_DEDUP_PRIOR` & `PHENO_DEDUP_PRIOR`: Control how duplicate cell mappings are resolved through two sequential steps:

- Step 1: For each phenotype cell with multiple SBS matches, keeps the best SBS match
- Step 2: For each remaining SBS cell with multiple phenotype matches, keeps the best phenotype match

Each parameter is a `{"key": value}` dictionary where:

- **Keys**: Column names to sort by (e.g., distance, mapped_single_gene, fov_distance_0).
- **Values**: Sort direction (True = ascending, False = descending).
- **Order matters:** First column has highest priority, subsequent columns break ties.



**Example strategies:**
- `SBS_DEDUP_PRIOR = {"distance": True, "mapped_single_gene": False}`: Prioritize spatial accuracy first, then gene mapping quality.
- `SBS_DEDUP_PRIOR = {"mapped_single_gene": False, "distance": True}`: Prioritize single-gene assignments first, then spatial proximity.
- `PHENO_DEDUP_PRIOR = {"distance": True, "fov_distance_0": True}`: Prefer close phenotype matches near field-of-view center.

In [None]:
# Step 1: Deduplication of SBS matches
# Expected form: dict of {column name: ascending flag}; written to the config as "sbs_dedup_prior".
SBS_DEDUP_PRIOR = None
# Step 2: Deduplication of phenotype matches
# Expected form: dict of {column name: ascending flag}; written to the config as "pheno_dedup_prior".
PHENO_DEDUP_PRIOR = None

## Add merge parameters to config file

In [None]:
# Add merge section with the parameters shared by both approaches
# NOTE(review): MASK_TYPE and METADATA_ALIGN are set in this notebook but are not
# written to the config — confirm the pipeline does not need them.
config["merge"] = {
    "approach": "stitch" if STITCH else "fast",
    "merge_combo_fp": MERGE_COMBO_DF_FP,
    "sbs_metadata_cycle": SBS_METADATA_CYCLE,
    "score": SCORE,
    "threshold": THRESHOLD,
    "sbs_metadata_channel": SBS_METADATA_CHANNEL,
    "ph_metadata_channel": PH_METADATA_CHANNEL,
    "alignment_flip_x": ALIGNMENT_FLIP_X,
    "alignment_flip_y": ALIGNMENT_FLIP_Y,
    "alignment_rotate_90": ALIGNMENT_ROTATE_90,
    "sbs_dedup_prior": SBS_DEDUP_PRIOR,
    "pheno_dedup_prior": PHENO_DEDUP_PRIOR,
}

# Add approach-specific parameters
if STITCH:
    config["merge"].update({
        "stitched_image": STITCHED_IMAGE,
        "flipud": FLIPUD,
        "fliplr": FLIPLR,
        "rot90": ROT90,
        "sbs_pixel_size": SBS_PIXEL_SIZE,
        "phenotype_pixel_size": PHENOTYPE_PIXEL_SIZE,
    })
else:
    config["merge"].update({
        "initial_sites": INITIAL_SITES,
        "det_range": DET_RANGE,
    })

# Convert tuples to lists before dumping
safe_config = convert_tuples_to_lists(config)

# Serialize BEFORE opening the file: opening with "w" truncates it immediately,
# so a failure inside yaml.dump would otherwise leave the config with only the
# header and lose the existing settings.
config_yaml = yaml.dump(safe_config, default_flow_style=False, sort_keys=False)

# Write the updated configuration back with markdown-style comments
with open(CONFIG_FILE_PATH, "w") as config_file:
    config_file.write(CONFIG_FILE_HEADER)
    config_file.write(config_yaml)