# Data Processing: Cell Distance Computation

> Compute cell-to-nearest-endothelial-cell distance distributions for all datasets in `data-processed-nodes-with-harmonized-cell-types`.

In [1]:
import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

# Lift the pandas display limits so DataFrames are rendered with every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Suppress all warnings.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Root data directory. Defaults to the original location, but can be redirected without
# editing the notebook by setting the HRA_CELL_DISTANCE_DATA_DIR environment variable.
basepath = os.environ.get("HRA_CELL_DISTANCE_DATA_DIR") or "/u/yashjain/hra-cell-distance-analysis/data"
# Sub-directories of `basepath`: input nodes, output edges, and generated figures.
data_filedir = "data-processed-nodes-with-harmonized-cell-types"
output_edge_dir = "data-processed-edges"
figures_output_dir = "generated-figures"

In [4]:
# Function to load your data
def load_data(path, edges=False):
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file.
        edges: When True, the file is read as a headerless edge file and its columns
            are named cell_id, x1, y1, z1, x2, y2, z2. When False (default), the
            file's own first row is used as the header.

    Returns:
        The loaded pandas DataFrame.
    """
    if not edges:
        return pd.read_csv(path)
    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    return pd.read_csv(path, header=None, names=edge_columns)

In [4]:
# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame.
# An additional column `Dataset` is added to identify the dataset name, which comes from the filename before the `-nodes.csv` suffix.
def read_all_datasets(basepath, data_filedir):
    """Read every `*-nodes.csv` file of a directory into one merged DataFrame.

    Args:
        basepath: Root data directory.
        data_filedir: Directory, relative to `basepath`, that contains the node files.

    Returns:
        A DataFrame holding the rows of all node files (index reset), with an extra
        `Dataset` column containing the file name without its `-nodes.csv` suffix.

    Raises:
        ValueError: If the directory contains no file ending with `-nodes.csv`.
    """
    nodes_suffix = "-nodes.csv"
    nodes_dir = os.path.join(basepath, data_filedir)
    all_files = []
    # os.listdir returns entries in arbitrary order; sorting makes the merged row order reproducible.
    for file in sorted(os.listdir(nodes_dir)):
        if not file.endswith(nodes_suffix):
            continue
        df = load_data(os.path.join(nodes_dir, file))
        # Strip only the trailing suffix so a name containing "-nodes.csv" elsewhere stays intact.
        df['Dataset'] = file[:-len(nodes_suffix)]
        all_files.append(df)

    print(f"Total number of files read: {len(all_files)}")
    if not all_files:
        # pd.concat would raise a generic "No objects to concatenate"; name the directory instead.
        raise ValueError(f"No files ending with '{nodes_suffix}' found in '{nodes_dir}'.")
    merged = pd.concat(all_files, ignore_index=True)
    print(f"Total number of cells in merged DataFrame: {len(merged)}")
    return merged

In [5]:
def create_directory(directory):
    """Create `directory` (with any missing parents) unless it already exists.

    Prints a message stating whether the directory was created or was already present.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

In [6]:
# Start from an empty destination directory: any existing edge output is deleted first.
edges_root = os.path.join(basepath, output_edge_dir)
if not os.path.exists(edges_root):
    print(f"Directory '{output_edge_dir}' does not exist and will be created.")
else:
    shutil.rmtree(edges_root)
    print(f"Directory '{output_edge_dir}' already exists and has been removed. New directory will be created.")
# The path is guaranteed to be absent at this point, so exist_ok=False cannot trip.
os.makedirs(edges_root, exist_ok=False)

Directory 'data-processed-edges' already exists and has been removed. New directory will be created.


In [7]:
# Function to compute edges from nodes.
def compute_edges_from_nodes(df, anchor_cell, threshold, type_field="Cell Type", report_progress=False):
    """Compute edges from a nodes DataFrame and return them as a DataFrame.

    The nodes are converted to a list of row dictionaries and handed to
    `calculate_nearest_endothelial_cell` (not defined in this notebook; presumably
    provided by the `_cde_compute_edges_from_nodes` star import - confirm).

    Args:
        df: Nodes DataFrame; must contain the `type_field` column.
        anchor_cell: Cell type value passed to the helper as `target_type`.
        threshold: Value passed to the helper as `max_dist`.
        type_field: Name of the column holding the cell type labels.
        report_progress: Forwarded to the helper unchanged.

    Returns:
        A DataFrame built from the helper's result. Nothing is written to disk here;
        saving is left to the caller.
    """
    node_records = df.to_dict(orient='records')
    nearest_links = calculate_nearest_endothelial_cell(
        node_records,
        type_field=type_field,
        target_type=anchor_cell,
        max_dist=threshold,
        report_progress=report_progress,
    )
    return pd.DataFrame(nearest_links)

## Iterate through all datasets and compute edges.

In [8]:
# Iterate through all dataset directories in `data_filedir`. For each dataset, report its
# cell type inventory, then write one headerless edge file per
# (nodes file, annotation level, endothelial anchor cell type) combination.

# Parameters for edge computation. They are identical for every dataset, so set them once.
type_field_list = ["Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"] # Skipping Original Cell Type as it is not a hierarchical level.
threshold = 200  # Forwarded as `max_dist`; units presumably match the node coordinates - TODO confirm.
report_progress = False  # Set to True to see progress updates.

# Cell type columns that are summarized for each dataset, in reporting order.
cell_type_columns = ['Original Cell Type', 'Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']
# Only files with this suffix are node files (same convention as `read_all_datasets`).
nodes_suffix = "-nodes.csv"

# os.listdir order is arbitrary; sorting keeps the processing order and log reproducible.
for dataset_dir in sorted(os.listdir(os.path.join(basepath, data_filedir))):
    dataset_path = os.path.join(basepath, data_filedir, dataset_dir)
    output_edge_path = os.path.join(basepath, output_edge_dir, dataset_dir)
    if not os.path.isdir(dataset_path):
        continue
    print(f"Processing dataset: {dataset_dir}")

    # The parent edge directory is wiped and recreated earlier in the notebook, so this
    # directory must not exist yet. exist_ok=False fails loudly rather than silently
    # mixing new edge files with stale ones from a previous run.
    os.makedirs(output_edge_path, exist_ok=False)

    # Read all node files of this dataset into a single DataFrame.
    df_all_data = read_all_datasets(basepath, os.path.join(data_filedir, dataset_dir))

    # Print the total number of unique cell types per cell type column.
    print("Total number of unique cell types per cell type annnotation level:")
    unique_cell_types = {column: df_all_data[column].nunique() for column in cell_type_columns}
    for cell_type, count in unique_cell_types.items():
        print(f"{cell_type}: {count}")

    # Per cell type column, collect the unique values whose name contains "endothelial"
    # (case-insensitive; missing values never match).
    endothelial_cell_types = {}
    for column in cell_type_columns:
        is_endothelial = df_all_data[column].str.contains("endothelial", case=False, na=False)
        endothelial_cell_types[column] = df_all_data.loc[is_endothelial, column].unique().tolist()

    print("\nEndothelial cell types per cell type annotation level:")
    for level, cell_types in endothelial_cell_types.items():
        print(f"\n{level}:")
        for cell in cell_types:
            print(f"  - {cell}")

    nodes_dir = dataset_path
    edges_dir = output_edge_path

    # Restrict the inputs to real node files. Any other CSV in the directory would not be
    # renamed by the suffix substitution below and would be written under its original name.
    node_filenames = sorted(name for name in os.listdir(nodes_dir) if name.endswith(nodes_suffix))

    # For each hierarchical level and each endothelial cell type found at that level,
    # compute edges for every node file and save them.
    for level, anchor_cell_types in endothelial_cell_types.items():
        if level not in type_field_list:
            print(f"Skipping {level} as it is not in type_field_list.")
            continue
        for anchor_cell_type in anchor_cell_types:
            print(f"Computing edges for {anchor_cell_type} at {level} level.")
            for filename in node_filenames:
                nodes = pd.read_csv(os.path.join(nodes_dir, filename))
                edges = compute_edges_from_nodes(nodes, anchor_cell_type, threshold, type_field=level, report_progress=report_progress)
                # <name>-nodes.csv -> <name>-<level>-<anchor cell type>-edges.csv
                edges_filename = f"{filename[:-len(nodes_suffix)]}-{level}-{anchor_cell_type}-edges.csv"
                edges.to_csv(os.path.join(edges_dir, edges_filename), index=False, header=False)

    print(f"Edges computed and saved to {edges_dir} for dataset {dataset_dir}.\n")

Processing dataset: intestine-codex-stanford
Total number of files read: 64
Total number of cells in merged DataFrame: 2512185
Total number of unique cell types per cell type annnotation level:
Original Cell Type: 25
Level Three Cell Type: 25
Level Two Cell Type: 17
Level One Cell Type: 5

Endothelial cell types per cell type annotation level:

Original Cell Type:
  - Endothelial

Level Three Cell Type:
  - endothelial cell of lymphatic vessel
  - endothelial cell

Level Two Cell Type:
  - endothelial cell of lymphatic vessel
  - endothelial cell

Level One Cell Type:
  - endothelial cell
Skipping Original Cell Type as it is not in type_field_list.
Computing edges for endothelial cell of lymphatic vessel at Level Three Cell Type level.
Computing edges for endothelial cell at Level Three Cell Type level.
Computing edges for endothelial cell of lymphatic vessel at Level Two Cell Type level.
Computing edges for endothelial cell at Level Two Cell Type level.
Computing edges for endothelial

## Compute distance values for all datasets.

In [12]:
def compute_euclidean_distance(x1, y1, z1, x2, y2, z2):
    """Return the straight-line distance between (x1, y1, z1) and (x2, y2, z2).

    Works element-wise when the coordinates are arrays or pandas Series.
    """
    dx = x1 - x2
    dy = y1 - y2
    dz = z1 - z2
    squared_length = dx**2 + dy**2 + dz**2
    return np.sqrt(squared_length)

def compute_and_write_distance_values_for_all_edges_in_a_dataset(basepath, output_edge_dataset_path):
    """Add a `distance` column to every `*-edges.csv` file of one dataset, in place.

    Each edge file is read, the Euclidean distance between (x1, y1, z1) and
    (x2, y2, z2) is computed per row, and the file is overwritten with a header row
    and the additional `distance` column.

    The function is safe to run more than once: a file that already starts with the
    header written by a previous run is read with that header and its distances are
    recomputed, instead of being misparsed as headerless data. An edge file without
    any rows is rewritten as a header-only file.

    Args:
        basepath: Root data directory.
        output_edge_dataset_path: Dataset edge directory, relative to `basepath`.
    """
    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    dataset_edge_dir = os.path.join(basepath, output_edge_dataset_path)
    file_count = 0
    for file in sorted(os.listdir(dataset_edge_dir)):
        if not file.endswith("-edges.csv"):
            continue
        file_path = os.path.join(dataset_edge_dir, file)
        print(f"Computing distances for file: {file_path}")

        # Freshly computed edge files have no header; files rewritten by this function do.
        with open(file_path) as handle:
            has_header = handle.readline().startswith("cell_id,")
        try:
            df = load_data(file_path, edges=not has_header)
        except pd.errors.EmptyDataError:
            # No edges were found for this file; keep the schema so later readers still work.
            df = pd.DataFrame(columns=edge_columns, dtype=float)

        df['distance'] = compute_euclidean_distance(df['x1'], df['y1'], df['z1'], df['x2'], df['y2'], df['z2'])
        df.to_csv(file_path, index=False, header=True)
        file_count += 1
    print(f"Total number of files processed for distance computation: {file_count}")

In [13]:
# Add distance values to the edge files of every dataset directory under `output_edge_dir`.
for dataset_dir in sorted(os.listdir(os.path.join(basepath, output_edge_dir))):
    output_edge_dataset_path = os.path.join(output_edge_dir, dataset_dir)
    # Only directories hold edge files; a stray file here would make os.listdir fail downstream.
    if not os.path.isdir(os.path.join(basepath, output_edge_dataset_path)):
        continue
    print(f"Processing distance computation for dataset: {dataset_dir}")
    compute_and_write_distance_values_for_all_edges_in_a_dataset(basepath, output_edge_dataset_path)

Processing distance computation for dataset: intestine-codex-stanford
Computing distances for file: /u/yashjain/hra-cell-distance-analysis/data/data-processed-edges/intestine-codex-stanford/B004_Ascending-Level Three Cell Type-endothelial cell of lymphatic vessel-edges.csv
Computing distances for file: /u/yashjain/hra-cell-distance-analysis/data/data-processed-edges/intestine-codex-stanford/B005_Ascending-Level Three Cell Type-endothelial cell of lymphatic vessel-edges.csv
Computing distances for file: /u/yashjain/hra-cell-distance-analysis/data/data-processed-edges/intestine-codex-stanford/B006_Ascending-Level Three Cell Type-endothelial cell of lymphatic vessel-edges.csv
Computing distances for file: /u/yashjain/hra-cell-distance-analysis/data/data-processed-edges/intestine-codex-stanford/B009_Right-Level Three Cell Type-endothelial cell of lymphatic vessel-edges.csv
Computing distances for file: /u/yashjain/hra-cell-distance-analysis/data/data-processed-edges/intestine-codex-stanfor