In [None]:
import pandas as pd
import os
import numpy as np

# CUSTOM 1: user-specific settings -- edit these before running the notebook.
# Base directory of the data; a later cell appends the "new_data" sub-directory.
root_path = r"/path/to/data/dir"
# Name of the combined cells CSV.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
file_name = r"combined_cells.csv"


In [None]:
import math
import os
import warnings
from tqdm import tqdm

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas warnings that can point at real problems.
warnings.simplefilter("ignore")

# Later cells read the per-region CSVs from, and write outputs under, the
# "new_data" sub-directory of the configured data directory.  The guard keeps
# this cell idempotent: re-running it must not yield ".../new_data/new_data".
if os.path.basename(os.path.normpath(root_path)) != "new_data":
    root_path = os.path.join(root_path, "new_data")


Functions for computing, for each cell, the distance to its nearest anchor cell and the resulting cell-to-anchor links (edges).

In [None]:
# Relative [x, y] grid offsets covering a cell and its eight neighbours.
# The x offsets are deliberately ordered -1, 1, 0 (not sorted): this order
# decides the order in which neighbouring cells' targets are gathered, and
# therefore which target wins when two are equally close.
CELL_OFFSETS = [
    [step_x, step_y]
    for step_x in (-1, 1, 0)
    for step_y in (-1, 0, 1)
]

def squared_distance_3d(a, b):
    """Return the squared Euclidean distance between two 3D points.

    Only the first three components of ``a`` and ``b`` are read; no square
    root is taken, so the result is suitable for distance comparisons.
    """
    diff_x, diff_y, diff_z = (a[axis] - b[axis] for axis in range(3))
    return diff_x * diff_x + diff_y * diff_y + diff_z * diff_z

def get_closest(sources, source_indexes, targets, max_dist_squared):
    """Yield an edge for each source whose nearest target is within range.

    For every source position the targets are scanned linearly and the one
    with the smallest squared distance is kept.  Only targets strictly closer
    than ``max_dist_squared`` qualify; on equal distances the target that
    appears first in ``targets`` wins.

    Args:
        sources: Sequence of source positions, each indexable as [x, y, z].
        source_indexes: Sequence parallel to ``sources``; the entry for a
            source becomes the first element of its edge.
        targets: Re-iterable collection of target positions ([x, y, z]).
        max_dist_squared: Exclusive upper bound on the squared distance.

    Yields:
        list: The source's index followed by the elements of the source
        position and then of the closest target position, i.e.
        ``[source_index, sx, sy, sz, tx, ty, tz]``.  Sources without a
        qualifying target produce nothing.
    """
    for index, source in enumerate(sources):
        min_dist = max_dist_squared
        closest = None
        for target in targets:
            dist_squared = squared_distance_3d(source, target)
            if dist_squared < min_dist:
                min_dist = dist_squared
                closest = target
        # Compare against the None sentinel explicitly: a truthiness test
        # would conflate "no match" with a falsy position and fails outright
        # for array-like positions whose truth value is ambiguous.
        if closest is not None:
            # Unpacking (rather than list concatenation) also accepts tuples.
            yield [source_indexes[index], *source, *closest]

def add_to_cell(node, cells):
    """Register ``node`` in the two-level grid dictionary ``cells``.

    ``cells`` maps x cell index -> y cell index -> bucket, where a bucket is
    ``{'nodes': [...], 'positions': [...]}``.  Missing levels are created on
    demand; the node's ``'__index__'`` and ``'position'`` are appended to the
    bucket addressed by ``node['cell']``.
    """
    bucket = cells.setdefault(node['cell'][0], {}).setdefault(
        node['cell'][1], {'nodes': [], 'positions': []}
    )
    bucket['nodes'].append(node['__index__'])
    bucket['positions'].append(node['position'])
    

def distance_edges(nodes, type_field, target_type, max_dist):
    """Yield an edge from each non-target node to its nearest target node.

    Nodes are bucketed into a 2D grid (x/y only) whose cells are at least
    ``max_dist`` wide, so for every source only the targets in its own cell
    and the eight surrounding cells (``CELL_OFFSETS``) have to be examined.
    ``int()`` truncates toward zero, which makes cell 0 span
    (-max_dist, max_dist) on each axis; cells are never narrower than
    ``max_dist``, so the 3x3 neighbourhood still contains every target in
    range.

    Side effect: every node dict gains the keys ``'__index__'`` (its position
    in ``nodes``), ``'position'`` ([x, y, z], missing coordinates default to
    0) and ``'cell'`` ([cell_x, cell_y]).

    Args:
        nodes: Sequence of dicts; each must contain ``type_field``.
        type_field: Key holding the node's type.
        target_type: Nodes whose ``type_field`` equals this value are targets;
            all other nodes are sources.
        max_dist: Grid cell size and exclusive distance limit (non-zero).

    Yields:
        list: ``[source_index, sx, sy, sz, tx, ty, tz]`` as produced by
        ``get_closest`` for every source with a target closer than
        ``max_dist``.
    """
    source_cells = {}
    target_cells = {}

    for node_index, node in enumerate(nodes):
        position = [node.get('x', 0), node.get('y', 0), node.get('z', 0)]
        node['__index__'] = node_index
        node['position'] = position
        # Bucket by the defaulted position so a node lacking 'x' or 'y' is
        # placed consistently with its position instead of raising KeyError.
        node['cell'] = [int(position[0] / max_dist), int(position[1] / max_dist)]

        if node[type_field] == target_type:
            add_to_cell(node, target_cells)
        else:
            add_to_cell(node, source_cells)

    max_dist_squared = max_dist * max_dist
    for source_cell_x, source_cell_y_dict in source_cells.items():
        for source_cell_y, sources in source_cell_y_dict.items():
            # Gather the target positions of the 3x3 block around this cell.
            all_targets = []
            for offset_x, offset_y in CELL_OFFSETS:
                targets = target_cells.get(source_cell_x + offset_x, {}).get(source_cell_y + offset_y)
                if targets:
                    all_targets.extend(targets['positions'])
            if all_targets:
                yield from get_closest(sources['positions'], sources['nodes'], all_targets, max_dist_squared)

def calculate_nearest_endothelial_cell(nodes, type_field="Cell Type", target_type="Endothelial", max_dist=1000):
    """Collect the nearest-target edges for ``nodes`` and report progress.

    Thin driver around ``distance_edges``: it materialises the yielded edges
    into a list and prints a status dict roughly every 10% of ``len(nodes)``
    edges, followed by a final "complete" status.

    Args:
        nodes: Sequence of node dicts (mutated by ``distance_edges``).
        type_field: Key holding each node's type.
        target_type: Type value identifying the target (anchor) nodes.
        max_dist: Maximum search distance passed to ``distance_edges``.

    Returns:
        list: One ``[source_index, sx, sy, sz, tx, ty, tz]`` entry per source
        node that has a target within range, in the order they were produced.
    """
    total_nodes = len(nodes)
    # Clamp to 1: with fewer than 10 nodes the integer division gives 0 and
    # the modulo below would raise ZeroDivisionError on the first edge.
    report_step = max(1, total_nodes // 10)
    edges = []
    for index, edge in enumerate(distance_edges(nodes, type_field, target_type, max_dist)):
        if index % report_step == 0:
            percentage = round((index / total_nodes) * 100)
            print({'status': 'processing', 'percentage': percentage, 'node_index': edge[0]})
        edges.append(edge)
    print({'status': 'complete', 'percentage': 100})
    return edges


In [None]:
# Ensure the "outputs" directory exists under root_path; exist_ok=True makes
# this a no-op (instead of an error) when the directory is already there.
os.makedirs(os.path.join(root_path, "outputs"), exist_ok=True)

In [None]:
# Every CSV file directly inside root_path is treated as one region/dataset.
# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# processing order (and the log output) reproducible between runs.
unique_regions = sorted(x for x in os.listdir(root_path) if x.endswith(".csv"))

# All result files of this cell go to the "outputs" directory under root_path.
outputs_dir = os.path.join(root_path, "outputs")

for label in unique_regions:
    # --- Load -------------------------------------------------------------
    # Keep only the 2D coordinates and the cell type.  .copy() makes the
    # selection an independent frame so the coordinate assignments below do
    # not write into a slice of the parsed CSV (SettingWithCopyWarning).
    path = os.path.join(root_path, label)
    df_Region_1 = pd.read_csv(path)[["x", "y", "Cell Type"]].copy()

    # Scale coordinates from pixels to micrometres.
    micro_per_pixel = 1.0  # Assuming coordinates are already in micro meter
    scale = micro_per_pixel
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    # --- Split into endothelial (vessel) cells and all other cells ----------
    df_Region_1_vessel = df_Region_1.loc[df_Region_1["Cell Type"] == "Endothelial"]
    df_Region_1_immmune = df_Region_1.loc[df_Region_1["Cell Type"] != "Endothelial"]

    x_list = df_Region_1_immmune["x"].values.tolist()
    y_list = df_Region_1_immmune["y"].values.tolist()
    ct_list = df_Region_1_immmune["Cell Type"].values.tolist()
    xv_list = df_Region_1_vessel["x"].values.tolist()
    yv_list = df_Region_1_vessel["y"].values.tolist()

    print(len(x_list), len(y_list))
    print(len(xv_list), len(yv_list))

    # --- Nearest endothelial cell for every other cell ----------------------
    # One dict per row, in row order: the edge's cell_id is the row position.
    nodes = df_Region_1.to_dict(orient='records')
    print(len(nodes))
    edges = calculate_nearest_endothelial_cell(nodes, type_field="Cell Type", target_type="Endothelial", max_dist=1000)

    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print(len(edges))

    # Each edge is [cell_id, x, y, z, xv, yv, zv] (source cell, then vessel).
    edges_df = pd.DataFrame(edges, columns=["cell_id", "x", "y", "z", "xv", "yv", "zv"])

    # Index the nodes by their row position so it lines up with edge cell_id.
    nodes_df = df_Region_1.assign(cell_id=range(len(df_Region_1))).set_index('cell_id')

    # Attach the source cell's type to every edge (left join on cell_id).
    edges_df_new = pd.merge(edges_df, nodes_df[['Cell Type']], how='left', left_on='cell_id', right_index=True)

    # --- Diagnostics --------------------------------------------------------
    print(f"Number of rows in nodes_df: {len(nodes_df)}")
    print(f"Number of rows in edges_df: {len(edges_df)}")
    print(f"Number of rows in edges_df_new: {len(edges_df_new)}")

    print(f"Number of unique values in 'cell_id' column of edges_df_new: {edges_df_new['cell_id'].nunique()}")
    print(f"Number of unique values in 'cell_id' column of edges_df: {edges_df['cell_id'].nunique()}")
    print(f"Number of unique values in 'cell_id' column of nodes_df: {nodes_df.index.nunique()}")

    print(f"Number of unique values in 'Cell Type' column of nodes_df: {nodes_df['Cell Type'].nunique()}")
    print(f"Number of unique values in 'Cell Type' column of edges_df_new: {edges_df_new['Cell Type'].nunique()}")

    # --- Save (files for vitessce) ------------------------------------------
    # Strip the ".csv" extension; the remainder prefixes every output file.
    label = os.path.splitext(label)[0]

    # All nodes (scaled coordinates and cell type).
    df_Region_1.to_csv(
        os.path.join(outputs_dir, f"{label}-nodes.csv"),
        index=False,
    )

    # Endothelial (vessel) nodes only: coordinates.
    vessel_nodes_df = pd.DataFrame(
        {
            "x": xv_list,
            "y": yv_list,
        }
    )
    vessel_nodes_df.to_csv(
        os.path.join(outputs_dir, f"{label}-only-vessel-nodes.csv"),
        index=False,
    )

    # All remaining nodes: coordinates, cell type and a constant group label.
    non_vessel_nodes_df = pd.DataFrame(
        {
            "x": x_list,
            "y": y_list,
            "type": ct_list,
            "group": "Cell",
        }
    )
    non_vessel_nodes_df.to_csv(
        os.path.join(outputs_dir, f"{label}-only-non-vessel-nodes.csv"),
        index=False,
    )

    # Edges together with the source cell's type.
    edges_df_new.to_csv(
        os.path.join(outputs_dir, f"{label}-edges.csv"),
        index=False,
    )

    print(f"Final files for {label} written successfully")

In [None]:
# import pandas as pd

# # Read nodes.csv and edges.csv
# nodes_df = pd.read_csv('/u/yashjain/vitessce-files/vccf-data-cell-nodes/unpublished/lymphnode-codex-yale/LN00837-nodes.csv')
# edges_df = pd.read_csv('/u/yashjain/vitessce-files/vccf-data-cell-nodes/unpublished/lymphnode-codex-yale/LN00837-edges.csv', header=None)  # Assuming no header in the file

# # Assign column names to edges_df
# edges_df.columns = ['cell_id', 'x', 'y', 'z', 'x_target', 'y_target', 'z_target']

# # Add a new 'cell_id' column to nodes_df
# nodes_df['cell_id'] = range(len(nodes_df))

# # Set 'cell_id' column as index for nodes_df
# nodes_df.set_index('cell_id', inplace=True)

# # Merge nodes_df with edges_df based on the index (cell_id) with a left join
# edges_df_new = pd.merge(edges_df, nodes_df[['Cell Type']], how='left', left_on='cell_id', right_index=True)

# #Write edges_df to a csv file in current directory
# edges_df_new.to_csv('edges_ct.csv')

# display(edges_df_new)

Merge the nodes.csv and edges.csv files from CDE into a single DataFrame that includes the distance and the cell type for each node (cell).

In [None]:
data_status = "published"
dataset_name = "intestine-codex-stanford"
organ = "small-intestine"

data_dir = "outputs"  # Folder containing all nodes and edges files

raw_filedir = os.path.join(root_path, data_dir, data_status, dataset_name, organ)

dfs = []
# The edges files are read as header-less with these seven columns.
# NOTE(review): the "-edges.csv" files written by the region loop earlier in
# this notebook have a header row and an extra "Cell Type" column, so this
# cell presumably expects files in a different (CDE) format -- confirm.
column_names = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# row order of the merged frame reproducible between runs.
for filename in sorted(os.listdir(raw_filedir)):
    # Every CSV whose name does not contain "nodes" is treated as an edges file.
    if not filename.endswith('.csv') or 'nodes' in filename:
        continue

    # Text before the first '-' identifies the sample and names its nodes file.
    file_prefix = filename.split('-')[0]
    edges_file_path = os.path.join(raw_filedir, filename)
    nodes_file_path = os.path.join(raw_filedir, file_prefix + '-nodes.csv')

    edges_df = pd.read_csv(edges_file_path, header=None, names=column_names)
    nodes_df = pd.read_csv(nodes_file_path)

    edges_df['filename'] = file_prefix
    # Assumes the prefix looks like "<something>_<region>"; raises IndexError
    # when the prefix contains no underscore.
    edges_df['unique_region'] = file_prefix.split('_')[1]
    # Euclidean distance between the two end points of every edge.
    edges_df['distance'] = np.sqrt(
        (edges_df['x1'] - edges_df['x2']) ** 2
        + (edges_df['y1'] - edges_df['y2']) ** 2
        + (edges_df['z1'] - edges_df['z2']) ** 2
    )

    # Index the nodes by row position so it lines up with the edge cell_id.
    nodes_df = nodes_df.assign(cell_id=range(len(nodes_df))).set_index('cell_id')

    # Attach the source cell's type to every edge (left join on cell_id).
    edges_df_new = pd.merge(edges_df, nodes_df[['Cell Type']], how='left', left_on='cell_id', right_index=True)

    dfs.append(edges_df_new)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with a message that names the directory.
if not dfs:
    raise ValueError(f"No edges CSV files found in {raw_filedir}")

# Concatenate the per-file frames into a single DataFrame
merged_df_si_nodes_edges_merged = pd.concat(dfs, ignore_index=True)

# Display the merged DataFrame
display(merged_df_si_nodes_edges_merged)