## Refined start node selection strategy through CytoTRACE2

In [1]:
import numpy as np
import scanpy as sc
import scvelo as scv
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append("..")
from tivelo.path.prune import graph_prune
from tivelo.path.start_node import get_start_node

In [2]:
# Dataset identifier and on-disk location of the processed AnnData file.
data_name = "dentategyrus"
data_path = "/lustre/project/Stat/s1155184322/datasets/velocity/10X43_1_processed.h5ad"
adata = sc.read(data_path)

# `group_key` names the adata.obs column holding cluster labels;
# `emb_key` names the embedding (presumably an adata.obsm key -- confirm).
group_key, emb_key = "clusters", "X_umap"

# Pairs of cluster names, one (source, target) tuple per line.
# NOTE(review): presumably the expected lineage transitions; not used by the
# cells visible in this notebook section.
cluster_edges = [
    ("nIPC", "Neuroblast"),
    ("Neuroblast", "Granule immature"),
    ("Granule immature", "Granule mature"),
    ("OPC", "OL"),
]

In [3]:
# Infer the start cluster with the two tivelo helpers imported above.
# NOTE(review): semantics inferred from the helper names and the logged output --
# graph_prune presumably returns the retained graph information plus weights, and
# get_start_node appears to run scVelo velocity / terminal-state estimation
# (mode="stochastic", 16 jobs) to pick a cluster from `group_key`. Confirm
# against the tivelo documentation.
select_info, select_weight = graph_prune(adata, group_key, emb_key)
start_node = get_start_node(adata, select_info, group_key, njobs=16, mode="stochastic")

computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 16/128 cores)


  0%|          | 0/2930 [00:00<?, ?cells/s]

    finished (0:00:09) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing terminal states
    identified 4 regions of root cells and 5 regions of end points .
    finished (0:00:00) --> added
    'root_cells', root cells of Markov diffusion process (adata.obs)
    'end_points', end points of Markov diffusion process (adata.obs)


In [4]:
# Display the start node returned by get_start_node.
start_node

'nIPC'

Now we change `start_node` to a different cluster, "Astrocytes".

In [5]:
# Manually override the inferred start node with another cluster label
# (presumably so the CytoTRACE2-based correction later in the notebook has
# something to correct -- confirm intent).
start_node = "Astrocytes"

Here we run the following code with cytotrace2_py==1.1.0.1 in a separate environment. Note that CytoTRACE2 must be given the raw dataset, without any gene filtering.

In [None]:
# NOTE: this cell is left commented out because it is meant to be run in a
# separate environment where cytotrace2_py is installed. It loads the raw
# dataset into `adata` and sets the keys used by the export cells.
# from cytotrace2_py.cytotrace2_py import *
# import os

# data_name = "DentateGyrus"
# data_path = "/lustre/project/Stat/s1155184322/datasets/velocity/raw_data/10X43_1.h5ad"
# adata = sc.read(data_path)
# group_key = "clusters"
# emb_key = "X_umap"

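Create the output directory if it doesn't exist, then write the expression matrix as a tab-separated text file (header row of cell IDs, followed by one row per gene)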

In [None]:
# Export the expression matrix as tab-separated text: a header row of cell IDs
# (with a leading empty field), then one row per gene holding the gene name
# followed by that gene's values cast to int.
# NOTE(review): `.A` on SciPy sparse matrices is deprecated in recent SciPy
# releases; `.toarray()` is the portable spelling -- confirm against the
# installed version before re-enabling this cell.
# os.makedirs(f"data/{data_name}", exist_ok=True)

# data = adata.X.T.A.astype(int)
# var_names = adata.var_names
# obs_index = adata.obs.index

# with open(f"data/{data_name}/{data_name}_data.txt", "w") as f:
#     f.write("\t" + "\t".join(obs_index) + "\n")
#     for var_name, row in zip(var_names, data):
#         f.write(var_name + "\t" + "\t".join(map(str, row)) + "\n")

Write the annotation file: a `phenotype` header line, then one line per cell with its ID and cluster label

In [None]:
# Write the annotation file: a single-field `phenotype` header, then one
# `cell_id<TAB>cluster_label` line per cell taken from adata.obs[group_key].
# with open(f"data/{data_name}/{data_name}_annotation.txt", "w") as f:
#     f.write("phenotype\n")
#     for index, phenotype in zip(adata.obs.index, adata.obs[group_key]):
#         f.write(f"{index}\t{phenotype}\n")

Run CytoTRACE2

In [None]:
# Run CytoTRACE2 on the two exported text files with species set to "mouse",
# passing cytotrace2_results/{data_name} as the output directory.
# input_path = f"data/{data_name}/{data_name}_data.txt"
# example_annotation_path = f"data/{data_name}/{data_name}_annotation.txt"
# example_species = "mouse"

# results =  cytotrace2(input_path,
#                      annotation_path=example_annotation_path,
#                      species=example_species,
#                      output_dir=f"cytotrace2_results/{data_name}",)

Read the results of CytoTRACE2

In [6]:
# Load the CytoTRACE2 output table: tab-separated, with the first column
# used as the row index.
cytotrace_results = pd.read_csv(
    "/users/s1155184322/projects/tutorial/cytotrace2/cytotrace2_results/DentateGyrus/cytotrace2_results.txt",
    index_col=0,
    sep="\t",
)

In [7]:
# Attach each cell's cluster label to the CytoTRACE2 table. pandas aligns the
# assigned Series on the index, so any row of `cytotrace_results` whose index
# value is absent from adata.obs gets NaN here.
# NOTE(review): assumes both indices hold the same cell IDs -- confirm.
cytotrace_results[group_key] = adata.obs[group_key]

In [8]:
# Median CytoTRACE2 score per cluster.
# Group by `group_key` (the same column name used when the labels were attached)
# rather than a hard-coded 'clusters', so changing `group_key` cannot break this cell.
# observed=True drops categories with no cells, which would otherwise appear
# as NaN medians when the label column is categorical.
median_potency = (
    cytotrace_results
    .groupby(group_key, observed=True)['CytoTRACE2_Score']
    .median()
)

In [9]:
# Clusters with the highest and lowest median CytoTRACE2 score.
# idxmax/idxmin skip NaN medians; the previous `.values.argmax()` /
# `.values.argmin()` both return the position of the first NaN when one is
# present, which would silently make max_node and min_node the same cluster.
max_node = median_potency.idxmax()
min_node = median_potency.idxmin()

Correct the start node according to potency score inferred by CytoTRACE2

In [10]:
# Keep the current start node only when it is one of the two potency extremes
# (highest or lowest median CytoTRACE2 score); otherwise fall back to the
# cluster with the highest median score.
# The previous condition joined the two inequalities with `or`, which is true
# whenever max_node != min_node, so the start node was always overwritten and
# the min_node check had no effect.
if start_node not in (max_node, min_node):
    start_node = max_node
start_node

'nIPC'