# Compute Kernel for CellRank 

**RNA velocity** [Bergen et al., 2021](https://www.embopress.org/doi/full/10.15252/msb.202110282)  
Beyond the scope of computational modeling, the statistical power of the methods depends on the curvature in the phase portrait since a lack of curvature challenges current models to distinguish whether an up- or down-regulation is occurring. The overall curvature of deviation from the steady-state line in the phase portrait is mostly impacted by the ratios of splicing to degradation rates (Box 1), **indicating that statistical inference is limited to genes where splicing is faster or comparable to degradation, while a small ratio would yield straight lines rather than an interpretable curvature.**

In [None]:
import scvelo as scv
import cellrank as cr
import palantir as pt
import scanpy as sc
from scipy import sparse
import numpy as np
import pandas as pd

import os

In [None]:
# rpy2: point R_HOME at the R installation of this conda environment.
# This is set before `rpy2.robjects` is imported further down in the notebook.
os.environ['R_HOME'] = '/home/fdeckert/bin/miniconda3/envs/p.3.8.12-FD20200109SPLENO/lib/R'

In [None]:
sc.settings.vector_friendly = False

# Identical figure defaults for scanpy and scVelo: size, save DPI, font size, no frame,
# and the same output directory for saved figures.
sc.set_figure_params(figsize=(2, 3), dpi_save=1200, fontsize=8, frameon=False)
sc.settings.figdir = 'result/figures/'

scv.set_figure_params(figsize=(2, 3), dpi_save=1200, fontsize=8, frameon=False)
scv.settings.figdir = 'result/figures/'

In [None]:
# Set the verbosity level of both libraries to 0.
sc.settings.verbosity = 0
scv.settings.verbosity = 0

In [None]:
# Project root: the relative paths used in this notebook ('result/figures/',
# 'data/object/...', 'plotting_global.R') resolve against this working directory.
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# Plotting: read the colour palettes defined in the project's R script.
import rpy2.robjects as robjects
color_load = robjects.r.source('plotting_global.R')

# Element 0 of the sourced result is used as a named list of named palettes.
# It is converted to {palette name: {entry name: first element of that entry}}.
palettes = color_load[0]
color = {
    palettes.names[i]: {key: palettes[i].rx2(key)[0] for key in palettes[i].names}
    for i in range(len(palettes))
}

# Import data 

In [None]:
# AnnData objects: the `.raw` slot of the pre-processed object is turned into a
# standalone AnnData; the velocyto object is read as stored.
adata_pp = sc.read_h5ad('data/object/pp.h5ad').raw.to_adata()
adata_v = sc.read_h5ad('data/object/velocyto.h5ad')

# Cell metadata and UMAP coordinates; the first CSV column becomes the row index.
obs = pd.read_csv('data/object/components/meta.csv', index_col=0)
umap = pd.read_csv('data/object/components/umap.csv', index_col=0)

# Genes present in both objects.
genes = adata_pp.var_names.intersection(adata_v.var_names)

In [None]:
def set_adata(adata, obs=obs, genes=genes, umap=umap):
    """Restrict ``adata`` to annotated cells and selected genes, then attach metadata and UMAP.

    Parameters
    ----------
    adata
        AnnData object to subset. The caller's object is not reassigned; the
        subset is returned.
    obs
        Metadata table indexed by cell name. Cells of ``adata`` that are not in
        this index are dropped. Defaults to the module-level ``obs`` bound when
        the function was defined.
    genes
        Gene names to keep. Defaults to the module-level ``genes``.
    umap
        Embedding table indexed by cell name. Defaults to the module-level
        ``umap``.

    Returns
    -------
    The subset object, with ``obs`` replaced by the metadata rows in cell order
    and ``obsm['X_umap']`` filled from ``umap`` in the same order.
    """
    # Cells: keep those that have a metadata row, then install the metadata
    # re-ordered to the remaining cells.
    annotated = adata.obs.index.isin(obs.index.tolist())
    adata = adata[annotated]
    obs = obs.reindex(adata.obs_names)
    adata.obs = obs

    # Genes: keep the requested variables only.
    gene_mask = adata.var_names.isin(genes)
    adata = adata[:, gene_mask]

    # Embedding: filter to the retained cells and align the rows to the cell order.
    coords = umap[umap.index.isin(obs.index.tolist())].reindex(adata.obs_names)
    adata.obsm['X_umap'] = coords.to_numpy()

    return adata

In [None]:
# Apply the same cell/gene selection, metadata and UMAP to both objects
# (set_adata uses its default obs / genes / umap tables).
adata_pp = set_adata(adata_pp)
adata_v = set_adata(adata_v)

# Set colors

In [None]:
def set_color(adata, categories):
    """Order categorical ``obs`` columns by the global palette and store matching colours.

    For every name in ``categories`` that is also a column of ``adata.obs``:
    the column is cast to a pandas categorical, its categories are reordered
    to follow the key order of ``color[name]`` (the module-level palette
    dict), and the corresponding colour values are written to
    ``adata.uns[name + '_colors']``. Palette keys that are not categories of
    the column are left out.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (DataFrame) and ``uns`` (mapping); both are
        modified in place.
    categories
        Candidate column names. Every name that is an ``obs`` column must be a
        key of ``color``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a column holds a category that has no entry in its palette.
    """
    obs_columns = set(adata.obs.columns)

    for category in categories:
        if category not in obs_columns:
            continue

        palette = color[category]
        column = pd.Series(adata.obs[category], dtype='category')

        # Fail with the offending names instead of the generic pandas message.
        missing = [level for level in column.cat.categories if level not in palette]
        if missing:
            raise ValueError(
                f"No color defined for categories {missing} of obs column {category!r}"
            )

        # One set per column gives O(1) membership tests. Filtering on the
        # column's categories is exactly what reorder_categories requires.
        levels = set(column.cat.categories)
        keys = [key for key in palette if key in levels]

        adata.obs[category] = column.cat.reorder_categories(keys)
        adata.uns[category+'_colors'] = np.array([palette.get(key) for key in keys], dtype=object)

In [None]:
# Set colors: try every palette name; set_color skips names that are not obs columns.
set_color(adata_pp, list(color.keys()))
set_color(adata_v, list(color.keys()))

# Combine adata

In [None]:
# The velocyto object becomes the main object; the pre-processed counts are attached as `.raw`.
adata = adata_v.copy()
adata_pp = adata_pp[adata.obs_names] # Sort adata_pp by adata obs names so that raw is in same order
adata.raw = adata_pp

# Store copy of adata

In [None]:
# Untouched copy of the combined object; the per-treatment sections below subset from it.
adata_tmp = adata.copy()

# Workflow for kernel pre-processing

In [None]:
#######################
### Velocity kernel ###
#######################
def vk_pp_workflow(adata, suffix='', n_neighbors=30, n_pcs=50, compute=False):
    """Build or load the CellRank velocity kernel for one cell subset.

    Parameters
    ----------
    adata
        AnnData object passed to the scVelo calls; only used when
        ``compute=True``. The calls are made on the passed object itself (no
        copy is taken here).
    suffix
        Appended to the kernel file name, e.g. ``'_prog_nacl'``.
    n_neighbors, n_pcs
        Forwarded to ``scv.pp.moments``.
    compute
        ``True`` runs the scVelo pipeline (filter/normalize, moments,
        dynamical model, velocity graph, latent time, velocity confidence),
        wraps the result in a ``VelocityKernel`` and writes it. ``False``
        reads the previously written kernel from ``<path>.pickle``.

    Returns
    -------
    The ``VelocityKernel``, either freshly built or loaded.
    """
    from cellrank.tl.kernels import VelocityKernel

    kernel_path = 'data/object/cellrank/kernel/vk'+suffix

    # Load path: nothing is computed, the pickled kernel is returned as stored.
    if not compute:
        return VelocityKernel.read(fname=kernel_path+'.pickle')

    # Pre-processing and first/second order moments
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=None, subset_highly_variable=False)
    scv.pp.moments(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_highly_variable=False)

    # Dynamical model, velocities and velocity graph
    scv.tl.recover_dynamics(adata)
    scv.tl.velocity(adata, mode='dynamical')
    scv.tl.velocity_graph(adata)

    # Per-cell summaries
    scv.tl.latent_time(adata, min_likelihood=0.1)
    scv.tl.velocity_confidence(adata)

    # CellRank kernel, stored together with its AnnData
    kernel = VelocityKernel(adata)
    kernel.write(kernel_path, write_adata=True)

    return kernel

In [None]:
###########################
### Connectivity kernel ###
###########################
def ck_pp_workflow(adata, suffix='', n_neighbors=30, n_pcs=50, compute=False):
    """Build or load the CellRank connectivity kernel for one cell subset.

    Parameters
    ----------
    adata
        AnnData object whose ``.raw`` slot holds the counts to start from;
        only used when ``compute=True``. The passed object is not modified,
        because the work is done on ``adata.raw.to_adata()``.
    suffix
        Appended to the kernel file name, e.g. ``'_prog_nacl'``.
    n_neighbors, n_pcs
        Forwarded to ``sc.pp.neighbors``. PCA is computed with at least
        ``n_pcs`` components (and never fewer than 50), so a request above 50
        no longer fails for lack of components.
    compute
        ``True`` filters genes, normalizes, log-transforms, runs PCA and the
        neighbour graph, builds a ``ConnectivityKernel`` and writes it.
        ``False`` reads the previously written kernel from ``<path>.pickle``.

    Returns
    -------
    The ``ConnectivityKernel``, either freshly built or loaded.
    """
    from cellrank.tl.kernels import ConnectivityKernel

    kernel_path = 'data/object/cellrank/kernel/ck'+suffix

    # Load path: return the pickled kernel as stored.
    if not compute:
        return ConnectivityKernel.read(fname=kernel_path+'.pickle')

    # Set raw data as default and keep an integer copy of it in `.raw`
    adata = adata.raw.to_adata()
    adata.X = adata.X.astype(int)
    adata.raw = adata

    # Filter genes
    sc.pp.filter_genes(adata, min_counts=10)

    # Normalize and log-transform (no scaling step)
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    # Dim reduction and neighbour graph. The component count follows n_pcs so
    # that sc.pp.neighbors always finds enough PCs; 50 remains the minimum.
    n_comps = 50 if n_pcs is None else max(50, n_pcs)
    sc.tl.pca(adata, n_comps=n_comps)
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)

    # CellRank kernel, stored together with its AnnData
    ck = ConnectivityKernel(adata)
    ck.write(kernel_path, write_adata=True)

    return ck

In [None]:
########################
### CytoTrace kernel ###
########################
def ctk_pp_workflow(adata, suffix='', compute=False, n_neighbors=30, n_pcs=50):
    """Build or load the CellRank CytoTRACE kernel for one cell subset.

    Parameters
    ----------
    adata
        AnnData object whose ``.raw`` slot holds the counts to start from;
        only used when ``compute=True``. The passed object is not modified,
        because the work is done on ``adata.raw.to_adata()``.
    suffix
        Appended to the kernel file name, e.g. ``'_prog_nacl'``.
    compute
        ``True`` filters genes, runs ``scv.pp.moments`` on the counts, builds
        a ``CytoTRACEKernel`` and writes it. ``False`` reads the previously
        written kernel from ``<path>.pickle``.
    n_neighbors, n_pcs
        Forwarded to ``scv.pp.moments``. The defaults are the values that were
        previously hard-coded; exposing them matches the other kernel
        workflows in this notebook.

    Returns
    -------
    The ``CytoTRACEKernel``, either freshly built or loaded.
    """
    from cellrank.tl.kernels import CytoTRACEKernel

    kernel_path = 'data/object/cellrank/kernel/ctk'+suffix

    # Load path: return the pickled kernel as stored.
    if not compute:
        return CytoTRACEKernel.read(fname=kernel_path+'.pickle')

    # Set raw data as default and keep an integer copy of it in `.raw`
    adata = adata.raw.to_adata()
    adata.X = adata.X.astype(int)
    adata.raw = adata

    # Filter genes
    sc.pp.filter_genes(adata, min_counts=10)

    # Imputation: scv.pp.moments works on 'spliced'/'unspliced' layers, so the
    # count matrix is registered under both names.
    adata.layers['spliced'] = adata.X
    adata.layers['unspliced'] = adata.X
    # NOTE(review): no PCA is computed in this function; 'X_pca' has to come
    # from the raw-derived object or be produced inside scv.pp.moments. Confirm.
    scv.pp.moments(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_rep='X_pca')

    # CellRank kernel, stored together with its AnnData
    ctk = CytoTRACEKernel(adata)
    ctk.write(kernel_path, write_adata=True)

    return ctk

In [None]:
#######################
### Palantir kernel ###
#######################
def pk_workflow(adata, suffix, s_genes, t_genes, knn=30, compute=False):
    """Build or load the CellRank pseudotime kernel based on Palantir pseudotime.

    Parameters
    ----------
    adata
        AnnData object whose ``.raw`` slot holds the counts to start from;
        only used when ``compute=True``. The passed object is not modified,
        because the work is done on ``adata.raw.to_adata()``.
    suffix
        Appended to the kernel file name, e.g. ``'_prog_nacl'``.
    s_genes
        Gene list scored with ``sc.tl.score_genes``; the highest-scoring cell
        is used as the Palantir start cell.
    t_genes
        Mapping ``lineage name -> gene list``. For each lineage the
        highest-scoring cell is used as a terminal state. The mapping is only
        read; it is not modified.
    knn
        Number of neighbours for ``sc.pp.neighbors`` and for the Palantir
        diffusion maps.
    compute
        ``True`` runs the full Palantir workflow, builds a
        ``PseudotimeKernel`` on ``obs['ppt_pseudotime']`` and writes it.
        ``False`` reads the previously written kernel from ``<path>.pickle``.

    Returns
    -------
    The ``PseudotimeKernel``, either freshly built or loaded.
    """
    from cellrank.tl.kernels import PseudotimeKernel

    kernel_path = 'data/object/cellrank/kernel/pk'+suffix

    # Load path: return the pickled kernel as stored.
    if not compute:
        return PseudotimeKernel.read(fname=kernel_path+'.pickle')

    # Set raw data as default
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Normalization and PCA
    sc.pp.normalize_per_cell(adata)
    pt.preprocess.log_transform(adata)
    sc.pp.pca(adata, n_comps=50, use_highly_variable=False) # Recommend to use PCS that explain 85% of the variance
    sc.pp.neighbors(adata, n_pcs=0, n_neighbors=knn) # That is the implementation as in run_diffusion_maps. Needs to be present in adata for pseudotime kernel

    # Compute start cell
    sc.tl.score_genes(adata, s_genes, score_name='s_score')
    s_id = adata.obs['s_score'].idxmax()

    # Compute terminal state cells. Results go into a new dict so the caller's
    # `t_genes` keeps its gene lists.
    t_cells = {}
    for lineage, marker_genes in t_genes.items():
        score_name = 't_score_'+ lineage.lower()
        sc.tl.score_genes(adata, marker_genes, score_name=score_name)
        t_cells[lineage] = adata.obs[score_name].idxmax()
    # Series of lineage names indexed by the chosen cell
    t_id = pd.Series(list(t_cells.keys()), list(t_cells.values()))

    # Compute diffusion components
    pca_projections = pd.DataFrame(adata.obsm['X_pca'], index=adata.obs_names)
    dm_res = pt.utils.run_diffusion_maps(pca_projections, n_components=5, knn=knn, alpha=0)

    # Low dimensional embedding of the data based on the eigen gap
    ms_data = pt.utils.determine_multiscale_space(dm_res, n_eigs=None)

    # Data imputation with MAGIC (only for visualization)
    adata.X = sparse.csr_matrix(adata.X)
    adata.layers['MAGIC_imputed_data'] = pt.utils.run_magic_imputation(adata, dm_res)

    # Running Palantir; branch probability columns are relabelled from cell id to lineage name
    pr_res = pt.core.run_palantir(ms_data, s_id, num_waypoints=500, terminal_states=t_id.index)
    pr_res.branch_probs.columns = t_id[pr_res.branch_probs.columns]

    # Combine adata with pseudotime
    adata.obs['ppt_pseudotime'] = pr_res.pseudotime

    # CellRank kernel, stored together with its AnnData
    pk = PseudotimeKernel(adata, time_key='ppt_pseudotime')
    pk.write(kernel_path, write_adata=True)

    return pk

# Compute embedding and pseudotime for progenitors

In [None]:
cell_type_prog = [
    
    'MastP',
    'MegP', 
    'MEP (1)', 
    'MEP (2)', 
    'MEP (3)',
    'MEP (4)', 
    'ProEB (1)',
    'ProEB (2)',
    'ProEB (3)',
    'ProEB (4)',
    'EB (1)',
    'EB (2)',
    'EB (3)', 
    'EB (4)',
    'EB (5)'
    
]

In [None]:
# Subset anndata: keep the progenitor cell types only (this cell applies no treatment filter).
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_prog)].copy()

In [None]:
# compute=False: load the kernels stored under the '_prog' suffix instead of recomputing.
# No velocity kernel is loaded in this section.
ck = ck_pp_workflow(adata=adata, suffix='_prog', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_prog', compute=False)
pk = pk_workflow(adata=adata, suffix='_prog', s_genes=['Procr', 'Cd34'], t_genes=dict(Ery=['Hba-a1', 'Hba-a2'], Meg=['Pf4', 'Itga2b', 'Gp1bb'], Mast=['Gzmb', 'Cma1']), compute=False)

In [None]:
# Collect both pseudotimes on the connectivity kernel's AnnData.
# `adata` is the same object as `ck.adata` from here on.
adata = ck.adata
adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']
adata.obs['ppt_pseudotime'] = pk.adata.obs['ppt_pseudotime']

In [None]:
# UMAP coloured by the two pseudotimes.
sc.pl.umap(adata, color=['ct_pseudotime', 'ppt_pseudotime'], frameon=False, wspace=0.5, ncols=7)

# Kernel Progenitor (NaCl) 

In [None]:
cell_type_prog = [
    
    'MastP',
    'MegP', 
    'MEP (1)', 
    'MEP (2)', 
    'MEP (3)',
    'MEP (4)', 
    'ProEB (1)',
    'ProEB (2)',
    'ProEB (3)',
    'ProEB (4)',
    'EB (1)',
    'EB (2)',
    'EB (3)', 
    'EB (4)',
    'EB (5)'
    
]

In [None]:
# Subset anndata: NaCl-treated cells from the stored full object, then progenitor cell types.
adata = adata_tmp[adata_tmp.obs['treatment']=='NaCl'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_prog)].copy()

In [None]:
# compute=False: load the kernels stored under the '_prog_nacl' suffix instead of recomputing.
vk = vk_pp_workflow(adata=adata, suffix='_prog_nacl', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_prog_nacl', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_prog_nacl', compute=False)
pk = pk_workflow(adata=adata, suffix='_prog_nacl', s_genes=['Procr', 'Cd34'], t_genes=dict(Ery=['Hba-a1', 'Hba-a2'], Meg=['Pf4', 'Itga2b', 'Gp1bb'], Mast=['Gzmb', 'Cma1']), compute=False)

In [None]:
# Copy both pseudotimes onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']
vk.adata.obs['ppt_pseudotime'] = pk.adata.obs['ppt_pseudotime']

In [None]:
# NOTE(review): 'root_cells', 'end_points', 'velocity_pseudotime' and 'latent_time' are not
# written in this section; they must already be in the loaded kernel's adata.obs. Confirm.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine', 'ct_pseudotime', 'ppt_pseudotime'], legend_loc='none', title='Velocity stream Ery (NaCl)', arrow_size=0.75)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime', 'ppt_pseudotime'], frameon=False, wspace=0.5, ncols=7, size=20)

# Kernel Progenitor (CpG) 

In [None]:
cell_type_prog = [
    
    'MastP',
    'MegP', 
    'MEP (1)', 
    'MEP (2)', 
    'MEP (3)',
    'MEP (4)', 
    'ProEB (1)',
    'ProEB (2)',
    'ProEB (3)',
    'ProEB (4)',
    'EB (1)',
    'EB (2)',
    'EB (3)', 
    'EB (4)',
    'EB (5)'
    
]    

In [None]:
# CpG-treated cells from the stored full object, then progenitor cell types.
adata = adata_tmp[adata_tmp.obs['treatment']=='CpG'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_prog)].copy()

In [None]:
# compute=False: load the kernels stored under the '_prog_cpg' suffix instead of recomputing.
vk = vk_pp_workflow(adata=adata, suffix='_prog_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_prog_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_prog_cpg', compute=False)
pk = pk_workflow(adata=adata, suffix='_prog_cpg', s_genes=['Procr', 'Cd34'], t_genes=dict(Ery=['Hba-a1', 'Hba-a2'], Meg=['Pf4', 'Itga2b', 'Gp1bb'], Mast=['Gzmb', 'Cma1']), compute=False)

In [None]:
# Copy both pseudotimes onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']
vk.adata.obs['ppt_pseudotime'] = pk.adata.obs['ppt_pseudotime']

In [None]:
# NOTE(review): 'root_cells', 'end_points', 'velocity_pseudotime' and 'latent_time' are not
# written in this section; they must already be in the loaded kernel's adata.obs. Confirm.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Ery (CpG)', arrow_size=0.75)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime', 'ppt_pseudotime'], frameon=False, wspace=0.5, ncols=6, size=20)

# Kernel myeloid (NaCl)

In [None]:
cell_type_m = [
    
    "cMo (1)", 
    "cMo (2)",
    "ncMo (1)", 
    "ncMo (2)",
    "PreRPM",
    "RPM", 
    "cDC1 (1)", 
    "cDC1 (2)",
    "cDC2 (1)",
    "cDC2 (2)", 
    "cDC2 (3)"
    
]

In [None]:
# NaCl-treated cells from the stored full object, then the myeloid cell types in cell_type_m.
adata = adata_tmp[adata_tmp.obs['treatment']=='NaCl'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_m)].copy()

In [None]:
# compute=False: load the kernels stored under the '_m_nacl' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_m_nacl', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_nacl', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_nacl', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): 'root_cells', 'end_points', 'velocity_pseudotime' and 'latent_time' are not
# written in this section; they must already be in the loaded kernel's adata.obs. Confirm.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (NaCl)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid (CpG)

In [None]:
cell_type_m = [
    
    "cMo (1)", 
    "cMo (2)",
    "ncMo (1)", 
    "ncMo (2)",
    "PreRPM",
    "RPM", 
    "cDC1 (1)", 
    "cDC1 (2)",
    "cDC2 (1)",
    "cDC2 (2)", 
    "cDC2 (3)"
    
]    

In [None]:
# CpG-treated cells from the stored full object, then the myeloid cell types in cell_type_m.
adata = adata_tmp[adata_tmp.obs['treatment']=='CpG'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_m)].copy()

In [None]:
# compute=False: load the kernels stored under the '_m_cpg' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): 'root_cells', 'end_points', 'velocity_pseudotime' and 'latent_time' are not
# written in this section; they must already be in the loaded kernel's adata.obs. Confirm.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (CpG)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid subset (NaCl)

In [None]:
cell_type_mo_subset = [
    
    "cDC2 (1)",
    "cMo (1)", 
    "cMo (2)",
    "ncMo (1)", 
    "PreRPM",
    "RPM"
    
]     

In [None]:
# NaCl-treated cells from the stored full object, then the cell types in cell_type_mo_subset.
adata = adata_tmp[adata_tmp.obs['treatment']=='NaCl'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_mo_subset)].copy()

In [None]:
# compute=False: load the kernels stored under the '_m_subset_nacl' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_m_subset_nacl', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_subset_nacl', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_subset_nacl', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): the plot title is the same string as in the full myeloid (NaCl) section,
# although this section plots the cell_type_mo_subset cells. Confirm this is intended.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (NaCl)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid subset (CpG)

In [None]:
cell_type_mo_subset = [
    
    "cDC2 (1)",
    "cMo (1)", 
    "cMo (2)",
    "ncMo (1)", 
    "PreRPM",
    "RPM"
    
]     

In [None]:
# CpG-treated cells from the stored full object, then the cell types in cell_type_mo_subset.
adata = adata_tmp[adata_tmp.obs['treatment']=='CpG'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_mo_subset)].copy()

In [None]:
# compute=False: load the kernels stored under the '_m_subset_cpg' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_m_subset_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_subset_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_subset_cpg', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): the plot title is the same string as in the full myeloid (CpG) section,
# although this section plots the cell_type_mo_subset cells. Confirm this is intended.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (CpG)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel monocytes (NaCl)

In [None]:
cell_type_mo = [
    
    "cMo (1)", 
    "cMo (2)",
    "PreRPM",
    "RPM"
    
]

In [None]:
# NaCl-treated cells from the stored full object, then the cell types in cell_type_mo.
adata = adata_tmp[adata_tmp.obs['treatment']=='NaCl'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_mo)].copy()

In [None]:
# compute=False: load the kernels stored under the '_mo_nacl' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_mo_nacl', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_mo_nacl', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_mo_nacl', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): the plot title is the same string as in the full myeloid (NaCl) section,
# although this section plots the cell_type_mo cells. Confirm this is intended.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (NaCl)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel monocytes (CpG)

In [None]:
cell_type_mo = [
    
    "cMo (1)", 
    "cMo (2)",
    "PreRPM",
    "RPM"
    
]    

In [None]:
# CpG-treated cells from the stored full object, then the cell types in cell_type_mo.
adata = adata_tmp[adata_tmp.obs['treatment']=='CpG'].copy()
adata = adata[adata.obs['cell_type_fine'].isin(cell_type_mo)].copy()

In [None]:
# compute=False: load the kernels stored under the '_mo_cpg' suffix (no Palantir kernel here).
vk = vk_pp_workflow(adata=adata, suffix='_mo_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_mo_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_mo_cpg', compute=False)

In [None]:
# Copy the CytoTRACE pseudotime onto the velocity kernel's AnnData, which is used for plotting.
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
# NOTE(review): the plot title is the same string as in the full myeloid (CpG) section,
# although this section plots the cell_type_mo cells. Confirm this is intended.
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['cell_type_fine'], legend_loc='none', title='Velocity stream Myeloid (CpG)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)