# In [None]:
# Matching MATLAB-based segmentation processing with Feldman output

import re
from pathlib import Path

import numpy as np
import pandas as pd


def match_tile(geno_tile, pheno_tile, tol):
    """Pair genotype and phenotype rows of one tile by coordinate proximity.

    A genotype/phenotype pair is a candidate when both ``|x_geno - x_pheno|``
    and ``|y_geno - y_pheno|`` are at most ``tol``.  Candidates are ranked by
    Euclidean distance and accepted greedily, closest first, so every
    genotype row and every phenotype row appears in at most one match.

    Parameters
    ----------
    geno_tile, pheno_tile : pandas.DataFrame
        Rows belonging to a single tile.  Both must have numeric ``x`` and
        ``y`` columns.
    tol : float
        Maximum allowed absolute difference along each axis.

    Returns
    -------
    pandas.DataFrame
        One row per accepted pair, ordered by increasing distance.  Columns
        are the genotype columns followed by the phenotype columns (without
        its ``tile`` column); names present in both inputs, other than
        ``tile``, get the ``_geno`` / ``_pheno`` suffix.  A final ``diff``
        column holds the Euclidean distance.  Empty when nothing matches.
    """
    geno_x = geno_tile['x'].to_numpy(dtype=float)
    geno_y = geno_tile['y'].to_numpy(dtype=float)
    pheno_x = pheno_tile['x'].to_numpy(dtype=float)
    pheno_y = pheno_tile['y'].to_numpy(dtype=float)

    # Pairwise offsets on the coordinates only: n_geno x n_pheno floats,
    # instead of a cross join that copies every column for every pair.
    dx = geno_x[:, None] - pheno_x[None, :]
    dy = geno_y[:, None] - pheno_y[None, :]
    geno_pos, pheno_pos = np.nonzero((np.abs(dx) <= tol) & (np.abs(dy) <= tol))
    dist = np.sqrt(dx[geno_pos, pheno_pos] ** 2 + dy[geno_pos, pheno_pos] ** 2)

    # Stable sort: equal distances keep genotype-major order, so the greedy
    # assignment below gives the same answer on every run.
    ranking = np.argsort(dist, kind='stable')

    geno_taken = np.zeros(len(geno_tile), dtype=bool)
    pheno_taken = np.zeros(len(pheno_tile), dtype=bool)
    accepted = []
    for k in ranking:
        g, p = geno_pos[k], pheno_pos[k]
        if geno_taken[g] or pheno_taken[p]:
            continue
        geno_taken[g] = True
        pheno_taken[p] = True
        accepted.append(k)
    accepted = np.asarray(accepted, dtype=np.intp)

    # Build full rows only for accepted pairs.  Positional selection keeps
    # each column's dtype (row-wise iteration would upcast mixed rows).
    left = geno_tile.iloc[geno_pos[accepted]].reset_index(drop=True)
    right = pheno_tile.iloc[pheno_pos[accepted]].reset_index(drop=True)
    shared = (set(left.columns) & set(right.columns)) - {'tile'}
    left = left.rename(columns={c: f"{c}_geno" for c in shared})
    right = right.drop(columns='tile', errors='ignore').rename(
        columns={c: f"{c}_pheno" for c in shared}
    )

    matched = pd.concat([left, right], axis=1)
    matched['diff'] = dist[accepted]
    return matched


if __name__ == "__main__":
    # Local input/output locations (update these paths accordingly).
    geno_df = pd.read_csv("C:/Users/syede/OneDrive - IRCM/UdeM/Results/OPS Jun 2025/Feldman output/Genotype.csv")
    pheno_df = pd.read_csv("C:/Users/syede/OneDrive - IRCM/UdeM/Results/OPS Jun 2025/Feldman output/Phenotype_Quantification.csv")
    output_path    = "C:/Users/syede/OneDrive - IRCM/UdeM/Results/OPS Jun 2025/Feldman output/output/Matched_Phenotype_Genotype.csv"

    # --- Step 0: Add unique identifiers ---
    # Row positions of the source tables, carried into the matched output.
    geno_df['geno_id'] = geno_df.index
    pheno_df['pheno_id'] = pheno_df.index

    # --- Step 1: Rename coordinate columns to common names ---
    # Genotype: 'i_cell' becomes 'y' and 'j_cell' becomes 'x'.
    # NOTE(review): assumes i is the row (vertical) index and that both tables
    # use the same pixel coordinate frame -- confirm.
    geno_df = geno_df.rename(columns={"i_cell": "y", "j_cell": "x"})
    # Phenotype: 'CellCentroidX' / 'CellCentroidY' become 'x' / 'y'.
    pheno_df = pheno_df.rename(columns={"CellCentroidX": "x", "CellCentroidY": "y"})

    # --- Step 2: Expose the phenotype tile under the shared column name ---
    # Straight copy; presumably TileIndex uses the same numbering as the
    # genotype 'tile' column -- verify.
    pheno_df['tile'] = pheno_df['TileIndex']

    # --- Step 3: Identify unique tiles to process ---
    # Only tiles present in both datasets are processed.
    unique_tiles = sorted(set(geno_df['tile']).intersection(set(pheno_df['tile'])))

    tol = 5  # Tolerance in pixels
    final_matches_list = []

    # --- Step 4: Process each tile separately ---
    for tile in unique_tiles:
        geno_tile = geno_df[geno_df['tile'] == tile]
        pheno_tile = pheno_df[pheno_df['tile'] == tile]

        # Skip if no data in one of the files
        if geno_tile.empty or pheno_tile.empty:
            continue

        tile_matches = match_tile(geno_tile, pheno_tile, tol)
        if not tile_matches.empty:
            final_matches_list.append(tile_matches)

    # --- Step 5: Concatenate the matches from all tiles ---
    if final_matches_list:
        final_matches_df = pd.concat(final_matches_list, ignore_index=True)
    else:
        final_matches_df = pd.DataFrame()

    # Display the first few rows of the final merged output.
    print(final_matches_df.head())
    print(f"Total matched rows: {len(final_matches_df)}")

    # Save final matched data; create the output folder first so the save
    # cannot fail on a missing directory after all the matching work.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    final_matches_df.to_csv(output_path, index=False)
    print(f"Saved matched data to: {output_path}")
