# Compute Kernel for CellRank 

**RNA velocity** [Bergen et al., 2021](https://www.embopress.org/doi/full/10.15252/msb.202110282)  
Beyond the scope of computational modeling, the statistical power of the methods depends on the curvature in the phase portrait since a lack of curvature challenges current models to distinguish whether an up- or down-regulation is occurring. The overall curvature of deviation from the steady-state line in the phase portrait is mostly impacted by the ratios of splicing to degradation rates (Box 1), **indicating that statistical inference is limited to genes where splicing is faster or comparable to degradation, while a small ratio would yield straight lines rather than an interpretable curvature.**

In [None]:
import scvelo as scv
import cellrank as cr
import palantir as pt
import scanpy as sc
from scipy import sparse
import numpy as np
import pandas as pd

import os

In [None]:
# rpy2 
os.environ['R_HOME'] = '/nobackup/peer/fdeckert/miniconda3/envs/r.4.1.0/lib/R'

In [None]:
sc.settings.vector_friendly = False

sc.set_figure_params(figsize=(2, 3), dpi_save=1200, fontsize=8, frameon=False)
sc.settings.figdir = 'result/figures/'

scv.set_figure_params(figsize=(2, 3), dpi_save=1200, fontsize=8, frameon=False)
scv.settings.figdir = 'result/figures/'

In [None]:
sc.settings.verbosity = 0
scv.settings.verbosity = 0

In [None]:
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# Plotting 
import rpy2.robjects as robjects
color_load = robjects.r.source('plotting_global.R')
color = dict()
for i in range(len(color_load[0])):
    color[color_load[0].names[i]] = {key : color_load[0][i].rx2(key)[0] for key in color_load[0][i].names}

# Import data 

In [None]:
# Adata 
adata_pp = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/pp.h5ad').raw.to_adata()
adata_v = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/velocyto.h5ad')

obs = adata_pp.obs
umap = pd.DataFrame(adata_pp.obsm['X_umap'], index=obs.index)
latent = pd.DataFrame(adata_pp.obsm['latent'], index=obs.index)

# Genes intersect
genes = adata_pp.var_names.intersection(adata_v.var_names)

In [None]:
def set_adata(adata, obs=obs, genes=genes, umap=umap, latent=latent): 
    
    # Add meta data obs
    adata = adata[adata.obs.index.isin(obs.index.tolist())]
    obs = obs.reindex(adata.obs_names)
    adata.obs = obs
    
    # Select genes 
    adata = adata[:, adata.var_names.isin(genes)]
    
    # Make copy 
    adata = adata.copy()
    
    # Add umap embeddings 
    umap = umap[umap.index.isin(obs.index.tolist())]
    umap = umap.reindex(adata.obs_names)
    adata.obsm['X_umap'] = umap.to_numpy()
    
    # Add latent embeddings
    latent = latent[latent.index.isin(obs.index.tolist())]
    latent = latent.reindex(adata.obs_names)
    adata.obsm['X_latent'] = latent.to_numpy()
    
    return(adata)

In [None]:
adata_pp = set_adata(adata_pp)
adata_v = set_adata(adata_v)

# Set colors

In [None]:
def set_color(adata, categories): 
    
    categories = [x for x in categories if x in list(adata.obs.columns)]
    for category in categories: 
        
        adata.obs[category] = pd.Series(adata.obs[category], dtype='category')
        
        keys = list(color[category].keys())
        keys = [x for x in keys if x in list(adata.obs[category])]

        adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
        adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)

In [None]:
# Set colors
set_color(adata_pp, list(color.keys()))
set_color(adata_v, list(color.keys()))

# Combine adata

In [None]:
adata = adata_v.copy()
adata_pp = adata_pp[adata.obs_names] # Sort adata_pp by adata obs names so that raw is in same order
adata.raw = adata_pp

# Store copy of adata

In [None]:
adata_tmp = adata.copy()

# Workflow for kernel pre-processing

In [None]:
###########################
### Connectivity kernel ###
###########################
def ck_pp_workflow(adata, suffix='', n_neighbors=30, n_pcs=50, compute=False):
    
    if compute: 
        
        # Set raw data as default 
        adata = adata.raw.to_adata()
        adata.X = adata.X.astype(int)
        adata.raw = adata
        
        # Filter genes
        # sc.pp.filter_genes(adata, min_counts=10)
        
        # Normalize and scale 
        sc.pp.normalize_total(adata)
        sc.pp.log1p(adata)

#         # Dim reduction, SNN, and leiden 
#         sc.tl.pca(adata, n_comps=50)
#         sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
        
#         # CellRank kernel 
#         from cellrank.tl.kernels import ConnectivityKernel
#         ck = ConnectivityKernel(adata)
        
#         ck.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck'+suffix, write_adata=True)
        
#     else: 
        
#         from cellrank.tl.kernels import ConnectivityKernel
#         ck = ConnectivityKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck'+suffix+'.pickle')
        
#     return ck

In [None]:
########################
### CytoTrace kernel ###
########################
def ctk_pp_workflow(adata, suffix='', compute=False):
    
    if compute: 
        
        # Set raw data as default 
        adata = adata.raw.to_adata()
        adata.X = adata.X.astype(int)
        adata.raw = adata
        
        # Filter genes 
        sc.pp.filter_genes(adata, min_counts=10)

        # Imputation 
        adata.layers['spliced'] = adata.X
        adata.layers['unspliced'] = adata.X
        scv.pp.moments(adata, n_neighbors=30, n_pcs=50, use_rep='X_pca')
        
        # CellRank kernel 
        from cellrank.tl.kernels import CytoTRACEKernel
        ctk = CytoTRACEKernel(adata)
        
        ctk.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ctk'+suffix, write_adata=True)
        
    else: 
        
        from cellrank.tl.kernels import CytoTRACEKernel
        ctk = CytoTRACEKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ctk'+suffix+'.pickle')
        
    return ctk

In [None]:
#######################
### Palantir kernel ###
#######################
def pk_workflow(adata, suffix, s_genes, t_genes, knn=30, compute=False): 
    
    if compute: 
        
        # Set raw data as default 
        adata = adata.raw.to_adata()
        adata.raw = adata

        # Normaization and PCA
        sc.pp.normalize_per_cell(adata)
        adata.X = adata.X.toarray()
        pt.preprocess.log_transform(adata)
        sc.pp.pca(adata, n_comps=50, use_highly_variable=False) # Recommend to use PCS that explain 85% of the variance 
        sc.pp.neighbors(adata, n_pcs=0, n_neighbors=knn) # That is the implementation as in run_diffusion_maps. Needs to be present in adata for pseudotime kernel 
        
        # Compute start cell
        sc.tl.score_genes(adata, s_genes, score_name='s_score')
        s_id=adata.obs['s_score'].idxmax()
        
        # Compute terminal state cells
        for k, v in t_genes.items():
            sc.tl.score_genes(adata, v, score_name='t_score_'+ k.lower())
            t_cell=adata.obs['t_score_'+ k.lower()].idxmax()
            t_genes[k] = t_cell
        t_id = pd.Series(list(t_genes.keys()), list(t_genes.values()))

        # Compute diffusion components
        pca_projections = pd.DataFrame(adata.obsm['X_pca'], index=adata.obs_names)
        dm_res = pt.utils.run_diffusion_maps(pca_projections, n_components=5, knn=knn, alpha=0)

        # Low dimensional embedding of the data based on the eigen gap
        ms_data = pt.utils.determine_multiscale_space(dm_res, n_eigs=None)

        # Data imputation with MAGIC (only for visualization )
        adata.X=sparse.csr_matrix(adata.X) 
        adata.layers['MAGIC_imputed_data'] = pt.utils.run_magic_imputation(adata, dm_res) 

        # Visualize diffusion components (only for visualization)
        umap = pd.DataFrame(adata.obsm['X_umap'], index=adata.obs_names, columns=['x', 'y'])

        # Running Palantir 
        pr_res = pt.core.run_palantir(ms_data, s_id, num_waypoints=500, terminal_states=t_id.index)
        pr_res.branch_probs.columns = t_id[pr_res.branch_probs.columns]

        # Combine adata with pseudotime 
        adata.obs['ppt_pseudotime'] = pr_res.pseudotime

        # CellRank kernel 
        from cellrank.tl.kernels import PseudotimeKernel
        pk = PseudotimeKernel(adata, time_key='ppt_pseudotime')

        pk.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk'+suffix, write_adata=True)
    
    else: 
        
        from cellrank.tl.kernels import PseudotimeKernel
        pk = PseudotimeKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk'+suffix+'.pickle')
    
    return pk

In [186]:
#######################
### Velocity kernel ###
#######################
def vk_pp_workflow(adata, suffix='', compute=False):
    
    if compute: 
        
        # Filter and normalize counts
        scv.pp.filter_and_normalize(adata, min_shared_counts=20, subset_highly_variable=False, n_top_genes=adata.n_vars)
        
        # Compute neighbors on latent space
        sc.pp.neighbors(adata, n_neighbors=30, use_rep='X_latent')
        
        scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
        scv.tl.recover_dynamics(adata, n_jobs=16)

        scv.tl.velocity(adata, mode='dynamical')
        scv.tl.velocity_graph(adata)

        scv.tl.latent_time(adata, min_likelihood=0.1)
        scv.tl.velocity_confidence(adata) 
        
        # CellRank kernel 
        from cellrank.tl.kernels import VelocityKernel
        vk = VelocityKernel(adata)
        
        vk.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk'+suffix, write_adata=True)
        
#     else: 
        
#         from cellrank.tl.kernels import VelocityKernel
#         vk = VelocityKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk'+suffix+'.pickle')
        
#     return vk

# Compute embedding and pseudotime for progenitors

# Kernel Progenitor (Baseline) 

In [187]:
cell_type_prog = [
    
    'GP', 
    'MegP', 
    'MEP', 
    'ProEB', 
    'EB I',
    'EB II'
    'EB III'
    
]

In [188]:
# Subset anndata 
adata = adata_tmp[adata_tmp.obs['infection']=='Baseline']
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_prog)]
adata = adata[:, (adata.X>=1).sum(axis=0)>=3].copy()

In [189]:
vk = vk_pp_workflow(adata=adata, suffix='_prog_baseline', compute=True)
# ck = ck_pp_workflow(adata=adata.copy(), suffix='_prog_baseline', compute=True)
# ctk = ctk_pp_workflow(adata=adata.copy(), suffix='_prog_baseline', compute=True)
# pk = pk_workflow(adata=adata.copy(), suffix='_prog_baseline', s_genes=['Cd34', 'Kit', 'Gata2'], t_genes=dict(Ery=['Hba-a1', 'Hba-a2'], Meg=['Pf4', 'Itga2b', 'Gp1bb'], MP=['Spi1', 'Prtn3']), compute=True)

  log1p(adata)


  0%|          | 0/1115 [00:00<?, ?gene/s]

  0%|          | 0/4347 [00:00<?, ?cells/s]

ModuleNotFoundError: No module named 'cellrank.tl'

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']
vk.adata.obs['ppt_pseudotime'] = pk.adata.obs['ppt_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main', 'ct_pseudotime', 'ppt_pseudotime'], legend_loc='none', title='Velocity stream Prog (Baseline)', arrow_size=0.75)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime', 'ppt_pseudotime'], frameon=False, wspace=0.5, ncols=7, size=20)

# Kernel Progenitor (CpG) 

In [None]:
cell_type_prog = [
    
    'MP', 
    'MegP', 
    'MEP I', 
    'MEP II', 
    'MEP III',
    'ProEB I',
    'ProEB II',
    'ProEB III',
    'ProEB IV',
    'EB I',
    'EB II',
    'EB III', 
    'EB IV',
    'EB V'
    
]    

In [None]:
adata = adata_tmp[adata_tmp.obs['infection']=='CpG'].copy()
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_prog)].copy()

In [None]:
vk = vk_pp_workflow(adata=adata, suffix='_prog_cpg', compute=True)
ck = ck_pp_workflow(adata=adata, suffix='_prog_cpg', compute=True)
ctk = ctk_pp_workflow(adata=adata, suffix='_prog_cpg', compute=True)
pk = pk_workflow(adata=adata, suffix='_prog_cpg', s_genes=['Cd34', 'Kit', 'Gata2'], t_genes=dict(Ery=['Hba-a1', 'Hba-a2'], Meg=['Pf4', 'Itga2b', 'Gp1bb'], MP=['Spi1', 'Prtn3']), compute=True)

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']
vk.adata.obs['ppt_pseudotime'] = pk.adata.obs['ppt_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main'], legend_loc='none', title='Velocity stream Prog (CpG)', arrow_size=0.75)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime', 'ppt_pseudotime'], frameon=False, wspace=0.5, ncols=6, size=20)

# Kernel myeloid (Baseline)

In [None]:
cell_type_m = [
    
    "cDC1",  
    "cDC2 I", 
    "cDC2 II",
    "ncMo I", 
    "ncMo II", 
    "ncMo III", 
    "cMo",
    "PreRPM", 
    "RPM", 
    "Granulocyte"
    
]

In [None]:
adata = adata_tmp[adata_tmp.obs['infection']=='Baseline'].copy()
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_m)].copy()

In [None]:
vk = vk_pp_workflow(adata=adata, suffix='_m_baseline', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_baseline', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_baseline', compute=False)

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main'], legend_loc='none', title='Velocity stream Myeloid (Baseline)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid (CpG)

In [None]:
cell_type_m = [
    
    "cDC1",  
    "cDC2 I", 
    "cDC2 II",
    "ncMo I", 
    "ncMo II", 
    "ncMo III", 
    "cMo",
    "PreRPM", 
    "RPM", 
    "Granulocyte"
    
]    

In [None]:
adata = adata_tmp[adata_tmp.obs['infection']=='CpG'].copy()
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_m)].copy()

In [None]:
vk = vk_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_m_cpg', compute=False)

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main'], legend_loc='none', title='Velocity stream Myeloid (CpG)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid subset (Baseline)

In [None]:
cell_type_mo_subset = [
    
    "cMo",
    "PreRPM", 
    "RPM"
    
]     

In [None]:
adata = adata_tmp[adata_tmp.obs['infection']=='Baseline'].copy()
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_mo_subset)].copy()

In [None]:
vk = vk_pp_workflow(adata=adata, suffix='_rpm_diff_baseline', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_rpm_diff_baseline', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_rpm_diff_baseline', compute=False)

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main'], legend_loc='none', title='Velocity stream Myeloid (Baseline)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)

# Kernel myeloid subset (CpG)

In [None]:
cell_type_mo_subset = [
    
    "cMo",
    "PreRPM", 
    "RPM"
    
]     

In [None]:
adata = adata_tmp[adata_tmp.obs['infection']=='CpG'].copy()
adata = adata[adata.obs['leiden_cell_type_main'].isin(cell_type_mo_subset)].copy()

In [None]:
vk = vk_pp_workflow(adata=adata, suffix='_rpm_diff_cpg', compute=False)
ck = ck_pp_workflow(adata=adata, suffix='_rpm_diff_cpg', compute=False)
ctk = ctk_pp_workflow(adata=adata, suffix='_rpm_diff_cpg', compute=False)

In [None]:
vk.adata.obs['ct_pseudotime'] = ctk.adata.obs['ct_pseudotime']

In [None]:
scv.pl.velocity_embedding_stream(vk.adata, basis='X_umap', color=['leiden_cell_type_main'], legend_loc='none', title='Velocity stream Myeloid (CpG)', arrow_size=0.75, size=20)
sc.pl.umap(vk.adata, color=['root_cells', 'end_points', 'velocity_pseudotime', 'latent_time', 'ct_pseudotime'], frameon=False, wspace=0.5, ncols=5, size=20)