In [7]:
import pandas as pd
import scipy.spatial.distance as sci_dist
from PIL import Image
import numpy as np
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
from matplotlib.path import Path
import os
import sys

# Make the project's helper module importable. The path is relative to the
# kernel's working directory, so start the notebook from its own folder.
tools_path = '../helperScripts/tools.py'
sys.path.append(os.path.dirname(os.path.expanduser(tools_path)))
import tools as tools

# Lift Pillow's pixel-count limit so very large images can be opened.
Image.MAX_IMAGE_PIXELS = None
np.random.seed(0)

roi_name = 'roi_2'

# Single root for every path below, so all of them agree on spelling and case
# (the region table previously used '/Users/YaelHeyman', the rest '/Users/yaelheyman').
imaging_root = '/Users/yaelheyman/RajLab Dropbox/Yael Heyman/SpatialBarcodes/ImagingData/2024-02-27_spatialbarcodes_SG_expression'

# Export folder of the ROI named above, and the per-region offset table.
data_folder = imaging_root + '/projects/2024-02-27_spatialbarcodes_expression/' + roi_name + '/exports'
region_data = imaging_root + '/processedData/Region_Data_with_Minimal_Values.csv'

# Files inside the export folder.
cell_by_gene_path = data_folder + '/cell_by_gene_matrix_dilate10_20240718_withbarcodes_clustering_10bcs_0.2thresh.csv'
transcripts_path = data_folder + '/decode_1_withcoordinates.csv'
coords_path = data_folder + '/segmentation_1_nuclei_cellattributes.csv'
# NOTE: derived once from the ROI above; later rebinding of data_folder does not move it.
output_folder = data_folder + '/processedData'

def calculate_polygon_area(vertices):
    """Return the area enclosed by a polygon using the shoelace formula.

    Parameters
    ----------
    vertices : array-like of shape (n, 2)
        Polygon vertices in traversal order. The ring may be open or already
        closed (first vertex repeated at the end); lists and arrays are both
        accepted.

    Returns
    -------
    float
        Absolute enclosed area. Empty input and degenerate polygons
        (fewer than three distinct vertices) yield 0.
    """
    # Coerce once so list input supports the 2-D slicing used below.
    vertices = np.asarray(vertices, dtype=float)
    if vertices.size == 0:
        return 0.0

    # Close the ring so the last edge (last vertex -> first vertex) is included.
    if not np.array_equal(vertices[0], vertices[-1]):
        vertices = np.vstack([vertices, vertices[0]])

    x = vertices[:, 0]
    y = vertices[:, 1]
    # Shoelace: half the absolute difference of the two cross-product sums.
    return abs(np.sum(x[:-1] * y[1:]) - np.sum(y[:-1] * x[1:])) / 2.0


def max_dist(coords, ids):
    """Return the largest pairwise distance and the polygon area for selected rows.

    Parameters
    ----------
    coords : ndarray
        Point coordinates, one row per point.
    ids : sequence of int
        Row positions in ``coords`` to analyse.

    Returns
    -------
    tuple
        ``(max_val, area_enclosed)``: the maximum entry of the square pairwise
        distance matrix of the selected rows, and the result of
        ``calculate_polygon_area`` on those rows taken in the order given by
        ``ids`` (the rows are not reordered into a hull first).
    """
    selected = coords[ids, :]
    pairwise = sci_dist.squareform(sci_dist.pdist(selected))
    return np.max(pairwise), calculate_polygon_area(selected)


def expand_polygon(vertices, expansion_distance):
    """Push every vertex away from the polygon centroid by a fixed distance.

    Parameters
    ----------
    vertices : array-like of shape (n, 2)
        Polygon vertices.
    expansion_distance : float
        Distance each vertex is moved along the centroid-to-vertex direction.

    Returns
    -------
    ndarray
        The displaced vertices, in the same order as the input. A vertex that
        coincides with the centroid has no direction and is returned unmoved.
    """
    center = np.mean(vertices, axis=0)

    def _push_outward(point):
        offset = point - center
        length = np.linalg.norm(offset)
        # A zero-length offset stays as-is, so the vertex does not move.
        unit = offset / length if length != 0 else offset
        return point + unit * expansion_distance

    return np.array([_push_outward(point) for point in vertices])


def in_poly(polygon_indices, all_points, r=300):
    """Find the points lying near a group of seed points, excluding the seeds.

    Parameters
    ----------
    polygon_indices : sequence of int
        Row positions in ``all_points`` that define the seed group.
    all_points : ndarray of shape (n, 2)
        Coordinates of every candidate point.
    r : float, default 300
        Search margin, in the same units as ``all_points``.

    Returns
    -------
    ndarray of int
        Sorted, unique row positions of the neighbouring points, never
        including any of ``polygon_indices``.

    Notes
    -----
    * No seeds: nothing can be a neighbour, so an empty array is returned.
    * One or two seeds: neighbours are the points within ``r`` of the seed
      (or of the midpoint of the two seeds).
    * Three or more seeds: neighbours are the points inside the seeds' convex
      hull after each hull vertex is pushed ``r / 2`` away from the hull
      centroid. ``ConvexHull`` raises for degenerate (e.g. collinear) seeds.
    """
    n_seeds = len(polygon_indices)
    if n_seeds == 0:
        return np.array([], dtype=int)

    polygon_points = all_points[polygon_indices]
    if n_seeds < 3:
        # The mean of a single point is the point itself, so one branch
        # covers both the 1-seed and the 2-seed case.
        center = np.mean(polygon_points, axis=0)
        distances = np.linalg.norm(all_points - center, axis=1)
        inside_indices = np.flatnonzero(distances <= r)
    else:
        hull = ConvexHull(polygon_points)
        hull_vertices = polygon_points[hull.vertices]
        expanded_vertices = expand_polygon(hull_vertices, r / 2)
        expanded_path = Path(expanded_vertices)
        # Vectorised containment test instead of one Python call per point.
        inside_indices = np.flatnonzero(expanded_path.contains_points(all_points))

    # Drop the seeds themselves; setdiff1d also sorts and de-duplicates.
    return np.setdiff1d(inside_indices, polygon_indices)


def _parse_barcodes(raw):
    """Return the set of cleaned barcode names held in one ``called_barcodes`` entry.

    Accepts an already-split list of tokens, a raw comma-separated string, or a
    missing value (``None`` / NaN). Brackets, quotes and surrounding whitespace
    are stripped from each token, and tokens that end up empty are discarded so
    that an entry such as ``"[]"`` does not contribute a blank barcode.
    """
    if isinstance(raw, str):
        tokens = raw.split(',')
    elif raw is None or (isinstance(raw, float) and raw != raw):
        tokens = []
    else:
        tokens = raw
    cleaned = (token.strip().strip("[]'") for token in tokens)
    return {name for name in cleaned if name}


def calc_bc_overlap(inside_indices, ids, cell_by_gene):
    """Compare the barcodes of reference cells with those of neighbouring cells.

    Parameters
    ----------
    inside_indices : iterable of int
        Row positions of the neighbouring cells to score.
    ids : iterable of int
        Row positions of the reference cells; their barcodes are pooled into
        one reference set.
    cell_by_gene : pandas.DataFrame
        Table with a ``called_barcodes`` column, addressed by position.

    Returns
    -------
    list of tuple(int, int)
        One ``(flag, shared)`` pair per entry of ``inside_indices``, in order.
        ``shared`` is the number of reference barcodes also present in the
        neighbour; ``flag`` is 1 when every reference barcode is present and 0
        otherwise. An empty reference set therefore yields ``(1, 0)`` for
        every neighbour.
    """
    # Look the column up once instead of once per cell.
    barcode_column = cell_by_gene['called_barcodes']

    ref_barcodes = set()
    for ref_id in ids:
        ref_barcodes |= _parse_barcodes(barcode_column.iloc[ref_id])

    overlap_info = []
    for i in inside_indices:
        shared = ref_barcodes & _parse_barcodes(barcode_column.iloc[i])
        all_found = 1 if len(shared) == len(ref_barcodes) else 0
        overlap_info.append((all_found, len(shared)))
    return overlap_info


In [8]:
# Despite its name, pixel2um is used as millimetres per pixel: the scaled pixel
# coordinate is added to the "Min X/Y (mm)" offsets and only then multiplied by 1000.
pixel2um = 107.11 / 1000000
# `region_data` is the CSV path on a fresh kernel and the loaded table once this
# cell has run; the guard stops a re-run from passing a DataFrame to read_csv.
if not isinstance(region_data, pd.DataFrame):
    region_data = pd.read_csv(region_data)

roi_frames = []  # one table per ROI, concatenated once after the loop
for j in range(1,4):  # Adjust range as needed
    # Dynamically construct the path
    data_folder = f"/Users/yaelheyman/RajLab Dropbox/Yael Heyman/SpatialBarcodes/ImagingData/2024-02-27_spatialbarcodes_SG_expression/projects/2024-02-27_spatialbarcodes_expression/roi_{j}/exports"
    cell_by_gene_path = os.path.join(data_folder, 'cell_by_gene_matrix_dilate10_20240718_withbarcodes_clustering_10bcs_0.2thresh.csv')
    print(data_folder)
    # Load the cell by gene data
    cell_by_gene = pd.read_csv(cell_by_gene_path, index_col=0)
    # Get reference X and Y from region_data as scalar values
    ref_x = region_data.loc[region_data['Region'] == f"Region {j}", "Min X (mm)"].iloc[0]
    ref_y = region_data.loc[region_data['Region'] == f"Region {j}", "Min Y (mm)"].iloc[0]

    # Shift each ROI by its region offset, then scale the result by 1000
    cell_by_gene['center_x'] = (cell_by_gene['center_x'] * pixel2um + ref_x)*1000
    cell_by_gene['center_y'] = (cell_by_gene['center_y'] * pixel2um + ref_y)*1000
    # Adjust the cell_id to make them unique across regions
    # NOTE(review): only unique while every per-ROI cell_id stays below 100000.
    cell_by_gene['cell_id'] = cell_by_gene['cell_id'] + j * 100000

    roi_frames.append(cell_by_gene)

# A single concat avoids re-copying the growing table on every iteration.
combined_df = pd.concat(roi_frames, ignore_index=True)

/Users/yaelheyman/RajLab Dropbox/Yael Heyman/SpatialBarcodes/ImagingData/2024-02-27_spatialbarcodes_SG_expression/projects/2024-02-27_spatialbarcodes_expression/roi_1/exports
/Users/yaelheyman/RajLab Dropbox/Yael Heyman/SpatialBarcodes/ImagingData/2024-02-27_spatialbarcodes_SG_expression/projects/2024-02-27_spatialbarcodes_expression/roi_2/exports
/Users/yaelheyman/RajLab Dropbox/Yael Heyman/SpatialBarcodes/ImagingData/2024-02-27_spatialbarcodes_SG_expression/projects/2024-02-27_spatialbarcodes_expression/roi_3/exports


In [9]:

# Cell centres as a plain (n_cells, 2) array, in the same row order as combined_df.
coords = combined_df[['center_x', 'center_y']].to_numpy()

# Convert called_barcodes to lists if not already.
# Strings are split on commas, existing lists are kept untouched so that
# re-running this cell does not wipe them, and anything else (e.g. NaN)
# becomes an empty list.
combined_df['called_barcodes'] = combined_df['called_barcodes'].apply(
    lambda x: x.split(',') if isinstance(x, str) else (x if isinstance(x, list) else [])
)




In [10]:


# Show the index of the combined table (the printed label still reads "cell_by_gene")
print("cell_by_gene indices:", combined_df.index)

# Map every distinct barcode_names value to the row positions that carry it
barcode_indices = {}
for index, barcode in enumerate(combined_df['barcode_names']):
    # NaN is the only value that differs from itself; file those rows under 'NaN'
    barcode_key = 'NaN' if barcode != barcode else barcode
    barcode_indices.setdefault(barcode_key, []).append(index)

# One list of row positions per barcode, in first-seen order
indices_list = [*barcode_indices.values()]
for ids in indices_list:
    print(combined_df['barcode_names'].iloc[ids] )

cell_by_gene indices: RangeIndex(start=0, stop=45951, step=1)
0        bc_006
721      bc_006
1157     bc_006
1977     bc_006
6503     bc_006
          ...  
42682    bc_006
43642    bc_006
44036    bc_006
44152    bc_006
44208    bc_006
Name: barcode_names, Length: 69, dtype: object
1        bc_076
578      bc_076
629      bc_076
927      bc_076
1025     bc_076
          ...  
40869    bc_076
43258    bc_076
43577    bc_076
43623    bc_076
43628    bc_076
Name: barcode_names, Length: 132, dtype: object
2    bc_019-bc_031-bc_044-bc_046-bc_054-bc_060-bc_0...
Name: barcode_names, dtype: object
3        bc_075
158      bc_075
312      bc_075
7539     bc_075
14195    bc_075
14735    bc_075
14736    bc_075
14775    bc_075
14795    bc_075
14806    bc_075
14926    bc_075
14973    bc_075
15062    bc_075
16029    bc_075
19634    bc_075
20860    bc_075
20912    bc_075
20962    bc_075
21038    bc_075
21437    bc_075
22880    bc_075
26904    bc_075
26921    bc_075
27078    bc_075
27145    bc_075
3

In [11]:
# Parameters
PLOT = False
r = 30  # Radius for in_poly function (same units as coords)
thresholds = range(2, 11)  # Barcode-count thresholds to scan over
all_results = []

# The neighbour search and the barcode comparison for a cell do not depend on
# the threshold, so run them once per eligible cell and reuse the result for
# every threshold instead of repeating the full search per (threshold, cell).
barcode_counts = combined_df['called_barcodes'].apply(len).to_numpy()
eligible_cells = (
    np.flatnonzero(barcode_counts > min(thresholds)) if len(thresholds) > 0 else []
)
neighbour_cache = {}  # cell position -> (inside_indices, overlap_info)
for i in map(int, eligible_cells):
    inside_indices = in_poly(polygon_indices=[i], all_points=coords, r=r)
    neighbour_cache[i] = (inside_indices, calc_bc_overlap(inside_indices, [i], combined_df))

# Iterate over different threshold values
for thresh in thresholds:
    # The same value selects the reference cells (more than `thresh` barcodes)
    # and the minimum number of shared barcodes counted below.
    overlap_threshold = thresh
    all_overlap_info = []

    # The cache was filled in ascending cell order, so results keep that order.
    for i, (inside_indices, overlap_info) in neighbour_cache.items():
        if barcode_counts[i] <= thresh:
            continue
        all_overlap_info.extend(overlap_info)

        if PLOT:

            plt.figure()
            plt.scatter(coords[i, 0], coords[i, 1], c='b')
            plt.text(coords[i, 0], coords[i, 1], combined_df['barcode_names'].iloc[i], color='b', fontsize=8, ha='right', va='bottom')

            plt.scatter(coords[inside_indices, 0], coords[inside_indices, 1], c='g')
            for j, (percent, count) in zip(inside_indices, overlap_info):
                plt.text(coords[j, 0], coords[j, 1], combined_df['barcode_names'].iloc[j], color='g', fontsize=8, ha='right', va='bottom')
                plt.text(coords[j, 0], coords[j, 1] + 5, f"{percent:.1f}% ({count})", color='r', fontsize=8, ha='right', va='bottom')

            plt.tight_layout()
            plt.show()

            above_threshold = sum(1 for a, count in overlap_info if count > overlap_threshold and a==0)
            print(f"above_threshold {above_threshold}")

    # Fraction of neighbours that share more than `overlap_threshold` barcodes
    # with the reference cell without containing all of them (flag == 0).
    above_threshold = sum(1 for a, count in all_overlap_info if count > overlap_threshold and a==0)
    total_counts = len(all_overlap_info)
    fraction_above_threshold = above_threshold / total_counts if total_counts > 0 else 0

    # Save the results for this threshold
    all_results.append((thresh, fraction_above_threshold))

    print(f"Threshold: {thresh}, Fraction of overlap counts above {overlap_threshold}: {fraction_above_threshold:.2f}")

# Collect the per-threshold fractions and write them to disk.
results_df = pd.DataFrame(all_results, columns=['Threshold', 'FractionAboveThreshold'])
# Create the output folder on first use so to_csv does not fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)
results_path = output_folder + '/overlap_results_by_threshold_unclustered_all_rois_sc_algo.csv'
results_df.to_csv(results_path, index=False)

# Report the file that was actually written.
print(f"Results saved to {results_path}")


Threshold: 2, Fraction of overlap counts above 2: 0.19
Threshold: 3, Fraction of overlap counts above 3: 0.16
Threshold: 4, Fraction of overlap counts above 4: 0.14
Threshold: 5, Fraction of overlap counts above 5: 0.11
Threshold: 6, Fraction of overlap counts above 6: 0.10
Threshold: 7, Fraction of overlap counts above 7: 0.09
Threshold: 8, Fraction of overlap counts above 8: 0.08
Threshold: 9, Fraction of overlap counts above 9: 0.07
Threshold: 10, Fraction of overlap counts above 10: 0.07
Results saved to overlap_results_by_threshold.csv
