In [1]:
import pandas as pd
import os
import numpy as np
import math
import os
import warnings
from tqdm import tqdm

warnings.simplefilter("ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Exemplary nodes.csv file for Intestine Dataset from Stanford (John Hickey)
# https://cdn.humanatlas.io/image-store/vccf-data-cell-nodes/published/intestine-codex-stanford/B012_Duodenum-nodes.csv

# Exemplary edges.csv file for Intestine Dataset from Stanford (John Hickey)
# https://cdn.humanatlas.io/image-store/vccf-data-cell-nodes/published/intestine-codex-stanford/B012_Duodenum-edges.csv

In [3]:
# read nodes.csv file as pandas dataframe. Nodes are x,y,z coordinates of each cell centroid in the image.
nodes_df = pd.read_csv("example-data-for-epic/B012_Duodenum-nodes.csv")

# read edges.csv file as pandas dataframe. Edges are defined as links between each cell and the nearest endothelial cell (blood vessel).
column_names = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
edges_df = pd.read_csv("example-data-for-epic/B012_Duodenum-edges.csv", header=None, names=column_names)

In [4]:
display(nodes_df.head(2))

Unnamed: 0,x,y,Cell Type
0,1086.83,4894.632,Cycling TA
1,1251.37,5079.956,Cycling TA


In [5]:
display(edges_df.head(2))

Unnamed: 0,cell_id,x1,y1,z1,x2,y2,z2
0,2453,984.642,2986.834,0,1333.64,3116.734,0
1,66,627.85,3520.29,0,955.198,3406.844,0


Compute distance values (in micrometers) and merge nodes.csv and edges.csv file from CDE into a single dataframe, including distances and cell types for each node (cell).

In [6]:
# calculate distance between two points
edges_df['distance'] = np.sqrt((edges_df['x1'] - edges_df['x2'])**2 + (edges_df['y1'] - edges_df['y2'])**2+ (edges_df['z1'] - edges_df['z2'])**2)

# Add a new 'cell_id' column to nodes_df
nodes_df['cell_id'] = range(len(nodes_df))

# Set 'cell_id' column as index for nodes_df
nodes_df.set_index('cell_id', inplace=True)

# Merge nodes_df with edges_df based on the index (cell_id) with a left join
combined_df = pd.merge(edges_df, nodes_df[['Cell Type']], how='left', left_on='cell_id', right_index=True)

# Display the merged DataFrame
display(combined_df.head(2))

Unnamed: 0,cell_id,x1,y1,z1,x2,y2,z2,distance,Cell Type
0,2453,984.642,2986.834,0,1333.64,3116.734,0,372.389063,Enterocyte
1,66,627.85,3520.29,0,955.198,3406.844,0,346.448709,CD66+ Enterocyte


In [7]:
# Convert combined dataframe into EPIC (Linkage Adjacency) format.
# Columns: schema id, first object id, second object id, distance value, distance unit, is_touching(optional), obj source file, obj mask name, protocol.

# Create a new DataFrame to store the EPIC format. Not adding the is_touching column.
epic_df = pd.DataFrame(columns=['schema_id', 'first_object_id', 'second_object_id', 'distance_value', 'distance_unit', 'obj_source_file', 'obj_mask_name', 'protocol'])

In [8]:
# Iterate through each cell_id in combined_df.
for index, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    cell_id = row['cell_id']

    # Get the row index from nodes_df where x2, y2 values are equal to x, y values in combined_df.
    row_index = nodes_df[(nodes_df['x'] == row['x2']) & (nodes_df['y'] == row['y2'])].index[0]

    # Add the initial cell_id and the row index value with distance value to epic_df.
    epic_df = epic_df.append({'schema_id': "Dummy ID", 'first_object_id': cell_id, 'second_object_id': row_index, 'distance_value': row['distance'], 'distance_unit': 'micrometers', 'obj_source_file': 'B012_Duodenum-nodes.csv', 'obj_mask_name': 'B012_Duodenum', 'protocol': 'Linkage Adjacency Dummy Value'}, ignore_index=True)


100%|██████████| 28901/28901 [02:20<00:00, 205.92it/s]


In [9]:

# Display the EPIC format DataFrame
display(epic_df.head(2))


Unnamed: 0,schema_id,first_object_id,second_object_id,distance_value,distance_unit,obj_source_file,obj_mask_name,protocol
0,Dummy ID,2453,21224,372.389063,micrometers,B012_Duodenum-nodes.csv,B012_Duodenum,Linkage Adjacency Dummy Value
1,Dummy ID,66,19678,346.448709,micrometers,B012_Duodenum-nodes.csv,B012_Duodenum,Linkage Adjacency Dummy Value


In [10]:
# Save the EPIC format DataFrame to a CSV file.
epic_df.to_csv('example-data-for-epic/B012_Duodenum-epic-format.csv', index=False)

In [11]:
# Some code to test the EPIC format DataFrame.
# Read a random row from epic_df and get the corresponding two rows from nodes_df based on first_object_id and second_object_id, both of which correspond to row index in nodes_df.
row = epic_df.sample()
# print row values
display(row)

first_object_id = row['first_object_id'].values[0]
second_object_id = row['second_object_id'].values[0]
display(nodes_df.loc[first_object_id])
display(nodes_df.loc[second_object_id])

# display the row from combined_df where cell_id is equal to first_object_id.
display(combined_df[combined_df['cell_id'] == first_object_id])

# display the row from combined_df where cell_id is equal to second_object_id.
display(combined_df[combined_df['cell_id'] == second_object_id])


Unnamed: 0,schema_id,first_object_id,second_object_id,distance_value,distance_unit,obj_source_file,obj_mask_name,protocol
6567,Dummy ID,31126,19607,79.488234,micrometers,B012_Duodenum-nodes.csv,B012_Duodenum,Linkage Adjacency Dummy Value


x            1416.776
y             4174.12
Cell Type          DC
Name: 31126, dtype: object

x               1486.056
y                4135.15
Cell Type    Endothelial
Name: 19607, dtype: object

Unnamed: 0,cell_id,x1,y1,z1,x2,y2,z2,distance,Cell Type
6567,31126,1416.776,4174.12,0,1486.056,4135.15,0,79.488234,DC


Unnamed: 0,cell_id,x1,y1,z1,x2,y2,z2,distance,Cell Type
