# Compute kernels for CellRank

**RNA velocity** [Bergen et al., 2021](https://www.embopress.org/doi/full/10.15252/msb.202110282)  
Beyond the scope of computational modeling, the statistical power of the methods depends on the curvature in the phase portrait since a lack of curvature challenges current models to distinguish whether an up- or down-regulation is occurring. The overall curvature of deviation from the steady-state line in the phase portrait is mostly impacted by the ratios of splicing to degradation rates (Box 1), **indicating that statistical inference is limited to genes where splicing is faster or comparable to degradation, while a small ratio would yield straight lines rather than an interpretable curvature.**  

**MURK genes** [GitHub](https://github.com/mebarile/Gata1_Erythroid_kinetics/blob/main/Figure_6/search_and_rank_murk_human.ipynb)

In [None]:
import scvelo as scv
import scanpy as sc
import scanpy.external as sce

import scanorama

import scipy
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os

## Set up rpy2

In [None]:
# R_HOME is set ahead of the rpy2 imports that follow; presumably so that rpy2
# binds to this conda R 4.1.0 installation — TODO confirm.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.environ['R_HOME'] = '/nobackup/peer/fdeckert/miniconda3/envs/r.4.1.0/lib/R'

In [None]:
import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
%load_ext rpy2.ipython

In [None]:
import warnings
# Install a blanket "ignore" filter: warnings of every category are suppressed
# from here on.
warnings.simplefilter("ignore")

In [None]:
# Same plotting defaults for scanpy and scvelo: 5 x 5 figures, dpi_save=1200,
# font size 10, frameon=False and the 'magma' colour map.
sc.set_figure_params(figsize=(5, 5), dpi_save=1200, fontsize=10, frameon=False, color_map='magma')
scv.set_figure_params(figsize=(5, 5), dpi_save=1200, fontsize=10, frameon=False, color_map='magma')

In [None]:
# Work from the project root so that the relative paths used further down
# ('plotting_global.R', 'data/...') resolve.
# NOTE(review): machine-specific absolute path.
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# Plotting: load the colour definitions from the R script into a nested dict
# of the form color[<palette name>][<key>] -> first element of the R entry.
import rpy2.robjects as robjects

color_load = robjects.r.source('plotting_global.R')

# Element 0 of the object returned by source() is the list that is unpacked here.
r_palettes = color_load[0]

color = {}
for position in range(len(r_palettes)):
    palette = r_palettes[position]
    palette_name = r_palettes.names[position]
    color[palette_name] = {key: palette.rx2(key)[0] for key in palette.names}

In [None]:
def set_color(adata, categories=None):
    """Order categorical obs columns and attach their colours from ``color``.

    For every requested column that exists in ``adata.obs`` the column is
    converted to a categorical, its categories are reordered to follow the key
    order of ``color[column]`` (restricted to the values present in the data),
    and the matching colours are stored in ``adata.uns['<column>_colors']``.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (a DataFrame) and ``uns`` (a mapping). It is
        modified in place; nothing is returned.
    categories
        Iterable of column names to process. ``None`` (the default) means
        every key of the module-level ``color`` dictionary, looked up at call
        time so that a reloaded ``color`` is picked up.

    Raises
    ------
    KeyError
        If a requested column exists in ``adata.obs`` but has no entry in
        ``color``.
    ValueError
        Raised by pandas ``reorder_categories`` when a column contains a value
        that has no entry in ``color[column]``.
    """
    if categories is None:
        categories = color.keys()

    obs_columns = set(adata.obs.columns)
    categories = [name for name in categories if name in obs_columns]

    for category in categories:

        adata.obs[category] = pd.Series(adata.obs[category], dtype='category')

        # Build the membership set once per column instead of once per colour key.
        present = set(adata.obs[category])
        keys = [key for key in color[category] if key in present]

        adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
        adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)

# Import data 

In [None]:
# Load the two AnnData files: `adata` from pp.h5ad and `adata_v` from
# velocyto.h5ad (presumably the preprocessed object and the velocyto
# spliced/unspliced output, going by the file names — TODO confirm).
adata = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/pp.h5ad')
adata_v = sc.read_h5ad('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/velocyto.h5ad')

# Transfer velocyto matrices 

In [None]:
# Keep every cell of `adata`, and only the genes found in both objects.
obs_names = adata.obs_names
var_names = adata.var_names.intersection(adata_v.var_names)

In [None]:
# Index both objects with the same cell and gene names so that their rows and
# columns line up.
# NOTE(review): assumes every cell of `adata` is also present in `adata_v`.
adata = adata[obs_names, var_names]
adata_v = adata_v[obs_names, var_names]

In [None]:
# Overwrite X and the layers of `adata` with those of the aligned `adata_v`
# (vk_workflow later reads the 'spliced' and 'unspliced' layers).
adata.X = adata_v.X
adata.layers = adata_v.layers

In [None]:
# Order the categorical obs columns and store their colours in adata.uns,
# using the default set of columns (the keys of `color`).
set_color(adata)

In [None]:
# UMAP coloured by cell-type annotation and a panel of selected genes, drawn
# from a total-count-normalised copy of the raw matrix.
sc.pl.umap(
    sc.pp.normalize_total(adata.raw.to_adata(), copy=True),
    color=[
        'leiden_cell_type_main',
        'Cd34', 'Ly6a', 'Procr', 'Kit', 'Spi1', 'Klf1', 'Gata1', 'Gata2',
        'Nfe2', 'Fli1', 'Pf4', 'Mpl', 'Gfi1', 'Cebpa', 'Irf8', 'Elane',
        'Mcpt8', 'Prss34', 'Cd74', 'Csf1r', 'Car2', 'Dntt', 'Flt3', 'Lyz2',
    ],
    legend_loc='on data',
    ncols=5,
    size=50,
    use_raw=False,
)

# Helper functions

In [None]:
###########
### PCA ###
###########
def pca_workflow(adata, n_comps):
    """Return a PCA embedding computed from the raw matrix of ``adata``.

    The raw matrix is pulled into a separate object, highly variable genes are
    flagged on it (``seurat_v3``, 2000 genes, batched by ``sample_group``),
    the matrix is then total-count normalised and log1p-transformed, and PCA
    is run on the flagged genes.

    Parameters
    ----------
    adata
        Object whose ``.raw`` attribute holds the matrix to embed.
    n_comps
        Number of principal components to compute.

    Returns
    -------
    The ``obsm['X_pca']`` entry produced by ``sc.pp.pca``.
    """
    raw_adata = adata.raw.to_adata()

    # HVG flagging runs on the matrix as stored in .raw, i.e. before the
    # normalize_total / log1p calls below; subset=False keeps all genes.
    sc.pp.highly_variable_genes(
        raw_adata,
        flavor='seurat_v3',
        n_top_genes=2000,
        batch_key='sample_group',
        subset=False,
    )

    sc.pp.normalize_total(raw_adata)
    sc.pp.log1p(raw_adata)

    sc.pp.pca(raw_adata, n_comps=n_comps, use_highly_variable=True)

    return raw_adata.obsm['X_pca']

In [None]:
#######################
### Velocity kernel ###
#######################
def vk_workflow(adata, suffix, compute=False):
    """Build, or load from disk, the CellRank ``VelocityKernel``.

    Parameters
    ----------
    adata
        Annotated data with ``spliced``/``unspliced`` layers, a ``.raw``
        matrix, a boolean ``var['m_genes']`` column and an
        ``obs['leiden_cell_type_main']`` column. Only used when ``compute`` is
        true.
    suffix
        String inserted into the pickle file name (``vk<suffix>.pickle``).
    compute
        When true, run the scVelo dynamical-model pipeline, create the kernel
        and write it (with its AnnData) to disk. When false, read the
        previously written kernel instead.

    Returns
    -------
    The ``VelocityKernel`` instance.
    """
    from cellrank.kernels import VelocityKernel

    kernel_file = 'data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk'+suffix+'.pickle'

    if not compute:
        return VelocityKernel.read(fname=kernel_file)

    # PCA is computed on a copy via pca_workflow; the neighbour graph is then
    # built on that representation.
    adata.obsm['X_pca'] = pca_workflow(adata.copy(), n_comps=30)
    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30, use_rep='X_pca')

    # Normalise with the spliced/unspliced layers included, then log1p
    sc.pp.normalize_total(adata, layers=['spliced', 'unspliced'], target_sum=1e6)
    sc.pp.log1p(adata)

    # Restrict to the genes flagged in var['m_genes']
    adata = adata[:, adata.var['m_genes']].copy()

    # Moments, then dynamical-model fit and gene-wise velocities.
    # n_pcs / n_neighbors are None — presumably so the neighbour graph built
    # above is reused; TODO confirm.
    scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
    scv.tl.recover_dynamics(adata)
    scv.tl.velocity(adata, mode='dynamical')

    # Velocity genes whose fit likelihood reaches 0.1 drive the velocity graph
    velocity_var = adata.var[adata.var.velocity_genes]
    well_fitted = velocity_var[velocity_var.fit_likelihood>=0.1]
    v_genes = well_fitted.index

    scv.tl.velocity_graph(adata, gene_subset=v_genes)

    # Pseudotime and latent time
    scv.tl.velocity_pseudotime(adata)
    scv.tl.recover_latent_time(adata)

    # Differential kinetics test on all velocity genes, grouped by cell type
    scv.tl.differential_kinetic_test(
        adata,
        var_names=adata.var_names[adata.var.velocity_genes],
        groupby='leiden_cell_type_main',
    )

    # CellRank kernel, persisted together with its AnnData
    vk = VelocityKernel(adata)
    vk.write(kernel_file, write_adata=True)

    return vk

In [None]:
#######################
### Palantir kernel ###
#######################
def _pk_top_scoring_cells(adata, gene_sets):
    """Score every gene set and return the best-scoring cell of each.

    For each ``label -> genes`` entry a score is written to
    ``adata.obs['t_score_<label.lower()>']``. The returned Series has the
    selected cell names as index and the labels as values. ``gene_sets``
    itself is not modified.
    """
    best_cell = {}
    for label, genes in gene_sets.items():
        score_key = 't_score_' + label.lower()
        sc.tl.score_genes(adata, genes, score_name=score_key)
        best_cell[label] = adata.obs[score_key].idxmax()
    return pd.Series(list(best_cell.keys()), list(best_cell.values()))


def pk_workflow(adata, suffix, s_genes, t_genes, compute=True):
    """Build, or load from disk, the CellRank ``PseudotimeKernel``.

    Parameters
    ----------
    adata
        Annotated data with a ``.raw`` matrix. Only used when ``compute`` is
        true.
    suffix
        String inserted into the pickle file name (``pk<suffix>.pickle``).
    s_genes
        Mapping ``label -> list of genes`` used to pick start cells. Only the
        cell chosen for the first entry is passed to Palantir as
        ``early_cell``.
    t_genes
        Mapping ``label -> list of genes`` used to pick terminal-state cells.
    compute
        When true, run Palantir, create the kernel from the resulting
        pseudotime and write it (with its AnnData) to disk. When false, read
        the previously written kernel instead.

    Returns
    -------
    The ``PseudotimeKernel`` instance.

    Notes
    -----
    ``s_genes`` and ``t_genes`` are left untouched; the selected cells are
    kept in separate objects.
    """
    from cellrank.kernels import PseudotimeKernel

    kernel_file = 'data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk'+suffix+'.pickle'

    if not compute:
        return PseudotimeKernel.read(fname=kernel_file)

    # Use the full matrix instead of velocity matrix
    adata = adata.raw.to_adata()
    adata.raw = adata

    # PCA (computed on a copy) and neighbour graph on that representation
    adata.obsm['X_pca'] = pca_workflow(adata.copy(), n_comps=300)
    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=300, use_rep='X_pca')

    # Palantir diffusion maps
    sce.tl.palantir(adata, n_components=10, knn=30)

    # Start and terminal cells: the highest-scoring cell for each gene set
    s_id = _pk_top_scoring_cells(adata, s_genes)
    t_id = _pk_top_scoring_cells(adata, t_genes)

    # Label the selected cells in obs['pt_cells']; all other cells get NaN
    marked = pd.concat([s_id, t_id])
    adata.obs = adata.obs.join(pd.DataFrame({'pt_cells': marked.tolist()}, index=marked.index), how='left')
    adata.obs['pt_cells'] = adata.obs['pt_cells'].astype('category')

    # Palantir results
    pr_res = sce.tl.palantir_results(
        adata,
        early_cell=s_id.index[0],
        terminal_states=t_id,
        ms_data='X_palantir_multiscale',
        num_waypoints=500,
        use_early_cell_as_start=False
    )

    # Get pseudotime
    adata.obs['ppt_pseudotime'] = pr_res.pseudotime

    # Rebuild the object from .raw before handing it to CellRank
    adata = adata.raw.to_adata()
    adata.raw = adata

    # CellRank kernel, persisted together with its AnnData
    pk = PseudotimeKernel(adata, time_key='ppt_pseudotime')
    pk.write(kernel_file, write_adata=True)

    return pk

In [None]:
###########################
### Connectivity kernel ###
###########################
def ck_workflow(adata, suffix, compute=True):
    """Build, or load from disk, the CellRank ``ConnectivityKernel``.

    Parameters
    ----------
    adata
        Annotated data with an ``obsm['latent']`` representation. Only used
        when ``compute`` is true.
    suffix
        String inserted into the pickle file name (``ck<suffix>.pickle``).
    compute
        When true, build the neighbour graph on ``'latent'``, create the
        kernel and write it (with its AnnData) to disk. When false, read the
        previously written kernel instead.

    Returns
    -------
    The ``ConnectivityKernel`` instance.
    """
    from cellrank.kernels import ConnectivityKernel

    kernel_file = 'data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck'+suffix+'.pickle'

    if not compute:
        return ConnectivityKernel.read(fname=kernel_file)

    # Neighbour graph on the latent representation
    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=None, use_rep='latent')

    # CellRank kernel, persisted together with its AnnData
    ck = ConnectivityKernel(adata)
    ck.write(kernel_file, write_adata=True)

    return ck

# Kernel Progenitor

In [None]:
# Cell-type labels (values of obs['leiden_cell_type_main']) kept for the
# progenitor kernels.
cell_type_prog = ['Meg', 'MEP', 'ProEB', 'EB I', 'EB II', 'EB III']

In [None]:
# Keep only the cells whose leiden_cell_type_main label is listed in
# cell_type_prog; .copy() stores the subset as its own object.
adata = adata[(adata.obs.leiden_cell_type_main.isin(cell_type_prog)).values].copy()

In [None]:
# Flag the genes to include for moment computation: those still present after
# scVelo's filter (min_shared_counts=20) applied to a copy of adata.
filtered = scv.pp.filter_and_normalize(
    adata,
    min_shared_counts=20,
    subset_highly_variable=False,
    n_top_genes=None,
    log=False,
    copy=True,
)
adata.var['m_genes'] = adata.var.index.isin(filtered.var_names)

## Kernel Progenitor (Baseline) 

In [None]:
# Baseline cells only. The cell-type mask repeats the progenitor filter that
# was already applied to `adata` above.
adata_i = adata.copy()
adata_i = adata_i[(adata_i.obs.leiden_cell_type_main.isin(cell_type_prog)).values & (adata_i.obs.infection=='Baseline').values].copy()

In [None]:
# compute=False: the three kernels are read from the pickles written earlier
# with suffix '_prog_baseline'; the adata / s_genes / t_genes arguments are
# only used by the workflows when compute=True.
vk_1 = vk_workflow(adata=adata_i.copy(), suffix='_prog_baseline', compute=False)
pk_1 = pk_workflow(adata=adata_i.copy(), suffix='_prog_baseline', compute=False, s_genes=dict(Start=['Cd34', 'Kit', 'Gata2']), t_genes=dict(Ery=['Hbb-bt'], Meg=['Pf4']))
ck_1 = ck_workflow(adata=adata_i.copy(), suffix='_prog_baseline', compute=False)

In [None]:
# Re-apply the warning filter and switch to narrower 2.5 x 5 panels for the
# multi-panel plots that follow.
warnings.simplefilter("ignore")
sc.set_figure_params(figsize=(2.5, 5), dpi_save=1200, fontsize=10, color_map='magma', frameon=False)
scv.set_figure_params(figsize=(2.5, 5), dpi_save=1200, fontsize=10, color_map='magma', frameon=False)

In [None]:
# Velocity stream plot on the UMAP for the Baseline velocity kernel's AnnData,
# one panel per entry of `color`.
scv.pl.velocity_embedding_stream(vk_1.adata, basis='X_umap', color=['leiden_cell_type_main', 'velocity_pseudotime', 'latent_time', 'root_cells', 'end_points', 'msS_RNA', 'msG2M_RNA'], legend_loc='on data', ncols=7, size=100, density=1.5, recompute=True)

In [None]:
# UMAP of the Baseline pseudotime kernel's AnnData: Palantir pseudotime, the
# selected start/terminal cells (pt_cells) and the gene-set scores written by
# pk_workflow.
scv.pl.umap(pk_1.adata, color=['leiden_cell_type_main', 'ppt_pseudotime', 'pt_cells', 't_score_start', 't_score_meg', 't_score_ery'], legend_loc='on data', ncols=6, size=100)

## Kernel Progenitor (CpG) 

In [None]:
# CpG cells only. The cell-type mask repeats the progenitor filter that was
# already applied to `adata` above.
adata_i = adata.copy()
adata_i = adata_i[(adata_i.obs.leiden_cell_type_main.isin(cell_type_prog)).values & (adata_i.obs.infection=='CpG').values].copy()

In [None]:
# compute=False: the three kernels are read from the pickles written earlier
# with suffix '_prog_cpg'; the adata / s_genes / t_genes arguments are only
# used by the workflows when compute=True.
vk_2 = vk_workflow(adata=adata_i.copy(), suffix='_prog_cpg', compute=False)
pk_2 = pk_workflow(adata=adata_i.copy(), suffix='_prog_cpg', compute=False, s_genes=dict(Start=['Cd34', 'Kit', 'Gata2']), t_genes=dict(Ery=['Hbb-bt'], Meg=['Pf4']))
ck_2 = ck_workflow(adata=adata_i.copy(), suffix='_prog_cpg', compute=False)

In [None]:
# Re-apply the warning filter and the narrower 2.5 x 5 panel size for the
# multi-panel plots that follow.
warnings.simplefilter("ignore")
sc.set_figure_params(figsize=(2.5, 5), dpi_save=1200, fontsize=10, color_map='magma', frameon=False)
scv.set_figure_params(figsize=(2.5, 5), dpi_save=1200, fontsize=10, color_map='magma', frameon=False)

In [None]:
# Velocity stream plot on the UMAP for the CpG velocity kernel's AnnData, one
# panel per entry of `color`.
scv.pl.velocity_embedding_stream(vk_2.adata, basis='X_umap', color=['leiden_cell_type_main', 'velocity_pseudotime', 'latent_time', 'root_cells', 'end_points', 'msS_RNA', 'msG2M_RNA'], legend_loc='on data', ncols=7, size=100, density=1.5, recompute=True)

In [None]:
# UMAP of the CpG pseudotime kernel's AnnData: Palantir pseudotime, the
# selected start/terminal cells (pt_cells) and the gene-set scores written by
# pk_workflow.
# NOTE(review): ncols=10 here, whereas the Baseline plot uses ncols=6 for the
# same six panels — confirm whether the difference is intended.
scv.pl.umap(pk_2.adata, color=['leiden_cell_type_main', 'ppt_pseudotime', 'pt_cells', 't_score_start', 't_score_meg', 't_score_ery'], legend_loc='on data', ncols=10, size=100)