# Compute Kernel for CellRank 

**RNA velocity** [Bergen et al., 2021](https://www.embopress.org/doi/full/10.15252/msb.202110282)  
Beyond the scope of computational modeling, the statistical power of the methods depends on the curvature in the phase portrait since a lack of curvature challenges current models to distinguish whether an up- or down-regulation is occurring. The overall curvature of deviation from the steady-state line in the phase portrait is mostly impacted by the ratios of splicing to degradation rates (Box 1), **indicating that statistical inference is limited to genes where splicing is faster or comparable to degradation, while a small ratio would yield straight lines rather than an interpretable curvature.**  

**MURK genes** [GitHub](https://github.com/mebarile/Gata1_Erythroid_kinetics/blob/main/Figure_6/search_and_rank_murk_human.ipynb)

In [None]:
import scvelo as scv
import scanpy as sc
import scanpy.external as sce

import scipy
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os

In [None]:
# rpy2: point the R_HOME environment variable at the R installation of the
# r.4.1.0 conda environment. Set here, ahead of the rpy2 import further down.
os.environ.update({'R_HOME': '/nobackup/peer/fdeckert/miniconda3/envs/r.4.1.0/lib/R'})

In [None]:
import warnings

# Silence all warnings (every subclass of Warning) for the rest of the session.
warnings.simplefilter(action="ignore", category=Warning)

In [None]:
# Identical figure defaults for scanpy (first) and scVelo (second).
for plotting_lib in (sc, scv):
    plotting_lib.set_figure_params(
        figsize=(5, 5),
        dpi_save=1200,
        fontsize=10,
        frameon=False,
        color_map='magma',
    )

In [None]:
# Work from the project root: the relative paths used in this notebook
# ('plotting_global.R', 'data/...', 'result/...') are resolved against it.
os.chdir(path='/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# Plotting: read the color palettes from the project's R script.
import rpy2.robjects as robjects

# Result of sourcing the script; element 0 is used below as a named list of
# palettes, each of which is itself a named list.
color_load = robjects.r.source('plotting_global.R')
palettes = color_load[0]

# color[<palette name>][<entry name>] -> first element of that palette entry
color = dict()
for i in range(len(palettes)):
    palette = palettes[i]
    color[palettes.names[i]] = {key: palette.rx2(key)[0] for key in palette.names}

In [None]:
def set_color(adata, categories=color.keys()):
    """Apply the project color palettes to categorical ``adata.obs`` columns.

    For every name in ``categories`` that is also a column of ``adata.obs``:
    the column is converted to a pandas categorical, its categories are
    reordered to follow the order of the matching palette in the module-level
    ``color`` dict, and the palette colors are stored (in that same order) in
    ``adata.uns['<column>_colors']``. ``adata`` is modified in place and
    nothing is returned.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (a DataFrame) and ``uns`` (a mapping).
    categories
        Iterable of column names to process. The default is the key view of
        ``color`` bound when this function is defined.

    Raises
    ------
    ValueError
        Propagated from pandas ``reorder_categories`` when the palette keys
        found in a column are not exactly that column's categories (for
        example, a value in the column that has no palette entry).
    """
    # Build the column lookup once instead of once per candidate category.
    obs_columns = set(adata.obs.columns)
    selected = [name for name in categories if name in obs_columns]

    for category in selected:

        adata.obs[category] = pd.Series(adata.obs[category], dtype='category')

        # Values actually present in the column; built once per column rather
        # than once per palette key.
        observed = set(adata.obs[category])
        keys = [key for key in color[category].keys() if key in observed]

        adata.obs[category] = adata.obs[category].cat.reorder_categories(keys)
        adata.uns[category+'_colors'] = np.array([color[category].get(key) for key in keys], dtype=object)

# Import data 

In [None]:
# Adata: `adata` is read from pp.h5ad and `adata_v` from velocyto.h5ad
adata, adata_v = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        'data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/pp.h5ad',
        'data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/velocyto.h5ad',
    )
)

# Transfer velocyto matrices 

In [None]:
# Genes: only those present in both objects. Cells: all cells of `adata`.
var_names = adata.var_names.intersection(adata_v.var_names)
obs_names = adata.obs_names

In [None]:
# Bring both objects to the same genes (and `adata_v` to the cells of `adata`).
# The right-hand sides are independent, so they can be assigned together.
adata, adata_v = adata[:, var_names], adata_v[obs_names, var_names]

In [None]:
# Replace the expression matrix and all layers of `adata` with those of
# `adata_v`. Both objects were indexed with the same obs_names/var_names in
# the previous cell, which is what makes the matrices line up.
# NOTE(review): `adata` is the result of a slicing operation at this point
# (presumably an AnnData view) - confirm that assigning to it is intended.
adata.X = adata_v.X
adata.layers = adata_v.layers

In [None]:
# Set color: order the categorical obs columns and attach their palettes
# (all palettes known to `color`, i.e. the default `categories`).
set_color(adata=adata)

In [None]:
# UMAP of the cell-type annotation and marker genes, plotted from a
# total-count-normalized copy of the raw data (use_raw=False reads its .X).
sc.pl.umap(
    sc.pp.normalize_total(adata.raw.to_adata(), copy=True),
    color=[
        'leiden_cell_type_main',
        'Cd34', 'Ly6a', 'Procr', 'Kit',
        'Spi1', 'Klf1', 'Gata1', 'Gata2', 'Nfe2',
        'Fli1', 'Pf4', 'Mpl',
        'Gfi1', 'Cebpa', 'Irf8', 'Elane',
        'Mcpt8', 'Prss34', 'Cd74', 'Csf1r',
        'Car2', 'Dntt', 'Flt3',
    ],
    legend_loc='on data',
    ncols=5,
    size=50,
    use_raw=False,
)

# Helper functions

In [None]:
#######################
### Velocity kernel ###
#######################
def vk_workflow(adata, suffix, gene_subset=None, compute=False):
    """Compute and save, or load, a CellRank ``VelocityKernel``.

    With ``compute=False`` the kernel is read from
    ``data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk<suffix>.pickle``
    and ``adata`` / ``gene_subset`` are not used.

    With ``compute=True`` the work is always done on a copy of ``adata``, so
    the object passed in is left untouched:

    * ``gene_subset is None``: genes with at least one cell whose ``X`` value
      is >= 1 are kept, the genes passing
      ``scv.pp.filter_and_normalize(min_shared_counts=20)`` are selected,
      ``X``/``spliced``/``unspliced`` are normalized with
      ``sc.pp.normalize_total``, and moments, dynamics and dynamical-mode
      velocities are computed.
    * ``gene_subset`` given: no velocities are estimated here; ``adata`` is
      presumably the AnnData of a previous run that already carries them
      (the velocity graph is computed directly) - verify against the caller.

    In both cases the velocity graph (restricted to ``gene_subset`` when
    given), latent time and velocity confidence are computed, the kernel is
    built and written to the pickle path above together with its AnnData.

    Parameters
    ----------
    adata
        AnnData with ``spliced``/``unspliced`` layers (full run) or with
        existing velocities (``gene_subset`` run).
    suffix
        String appended to ``vk`` in the pickle file name.
    gene_subset
        Optional list of gene names passed to ``scv.tl.velocity_graph``.
    compute
        ``True`` to recompute and overwrite the pickle, ``False`` to load it.

    Returns
    -------
    The ``VelocityKernel`` that was created or loaded.
    """
    from cellrank.kernels import VelocityKernel

    if not compute:
        return VelocityKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk'+suffix+'.pickle')

    if gene_subset is None:
        # Subset genes: at least one cell with a value >= 1
        adata = adata[:, (adata.X>=1).sum(axis=0)>=1].copy()

        # Gene names passing the scVelo filter (computed on a throw-away copy)
        var_names = scv.pp.filter_and_normalize(adata, min_shared_counts=20, subset_highly_variable=False, n_top_genes=None, log=False, copy=True).var_names

        # Normalize X, spliced, and unspliced
        sc.pp.normalize_total(adata, layers=['spliced', 'unspliced'])

        # Subset by var names filter
        adata = adata[:, var_names].copy()

        # First-/second-order moments are computed for each cell across its nearest neighbors
        scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
        scv.tl.recover_dynamics(adata, n_jobs=48)

        # Estimates velocities in a gene-specific manner
        scv.tl.velocity(adata, mode='dynamical')

    else:
        # Work on a copy: the calls below overwrite the velocity graph, latent
        # time and confidence, and the caller may still need the original
        # (e.g. when the AnnData of another kernel is passed in).
        adata = adata.copy()

    # Velocity graph (gene_subset=None is the scVelo default, i.e. all genes)
    scv.tl.velocity_graph(adata, n_jobs=48, gene_subset=gene_subset)

    # latent time
    scv.tl.latent_time(adata, min_likelihood=0.1)
    scv.tl.velocity_confidence(adata)

    # CellRank kernel
    vk = VelocityKernel(adata)

    vk.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/vk'+suffix+'.pickle', write_adata=True)

    return vk

In [None]:
##################
### MURK genes ###
##################
def murk_workflow(adata):
    """Return the genes flagged as MURK genes along the erythroid populations.

    For each population in ``Meg, MEP, ProEB, EB I, EB II, EB III`` (taken
    from ``adata.obs['leiden_cell_type_main']``) and each gene, the unspliced
    moments (``layers['Mu']``) are regressed on the spliced moments
    (``layers['Ms']``) by ordinary least squares. The slope, its standard
    error, the mean spliced and mean unspliced value are recorded per
    population. A one-sided 95% Student-t quantile turns the standard error
    into a lower/upper bound on the slope.

    A gene is returned when its mean unspliced value in ``EB III`` exceeds
    that of ``EB II``, ``EB I`` and ``ProEB`` and either

    * its mean spliced value in ``EB III`` exceeds that of ``EB II``,
      ``EB I``, ``ProEB`` and ``MEP``, and the lower slope bound in
      ``EB III`` is positive and above the upper slope bound in ``EB II``; or
    * its mean spliced value in ``EB III`` exceeds that of ``ProEB`` and
      ``MEP`` and the lower slope bound in ``EB III`` is negative.

    Parameters
    ----------
    adata
        Object exposing ``layers['Ms']``, ``layers['Mu']`` (cells x genes),
        ``obs_names``, ``var_names`` and ``obs['leiden_cell_type_main']``.

    Returns
    -------
    The subset of ``adata.var_names`` that satisfies the rule above.
    """
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule is loaded.
    from scipy import stats

    populations = ('Meg', 'MEP', 'ProEB', 'EB I', 'EB II', 'EB III')
    labels = adata.obs['leiden_cell_type_main']

    # spliced (mat_s) and unspliced (mat_u) moments, cells x genes
    mat_s = pd.DataFrame(adata.layers['Ms'].copy(), index=adata.obs_names, columns=adata.var_names)
    mat_u = pd.DataFrame(adata.layers['Mu'].copy(), index=adata.obs_names, columns=adata.var_names)

    # per-gene results, one column per population
    df_sl = pd.DataFrame(index=adata.var_names)    # regression slope
    df_db = pd.DataFrame(index=adata.var_names)    # standard error of the slope
    df_exp = pd.DataFrame(index=adata.var_names)   # mean spliced value
    df_unsp = pd.DataFrame(index=adata.var_names)  # mean unspliced value
    df_quan = pd.DataFrame(index=adata.var_names)  # Student-t 95% quantile

    for pop in populations:

        vec = labels == pop
        n = np.sum(vec)

        x = mat_s.loc[vec]
        y = mat_u.loc[vec]

        # Per-gene means. axis=0 is spelled out on purpose: np.mean(DataFrame)
        # without an axis reduces over both axes on pandas >= 2.0 and would
        # center every gene on one global mean.
        mx = x.mean(axis=0)
        my = y.mean(axis=0)

        x_mx = x - mx
        y_my = y - my
        X2 = (x_mx ** 2).sum(axis=0)
        sl = (x_mx * y_my).sum(axis=0) / X2
        inter = my - sl * mx

        # residual sum of squares of the fitted line, per gene
        SSE = ((sl * x + inter - y) ** 2).sum(axis=0)

        df_sl[pop] = sl
        df_db[pop] = np.sqrt(SSE / (n - 2) / X2)
        df_exp[pop] = mx
        df_unsp[pop] = my
        # NOTE(review): the quantile uses n - 1 degrees of freedom while the
        # slope error above divides by n - 2 - kept as in the original code.
        df_quan[pop] = stats.t.ppf(0.95, n - 1)

    # lower / upper bound of the slope
    df_conf = df_db * df_quan
    df_min = df_sl - df_conf
    df_max = df_sl + df_conf

    # EB III more highly expressed (spliced) than EB II, EB I, ProEB and MEP
    vec_mean = (
        (df_exp['EB III'] > df_exp['EB II']) &
        (df_exp['EB III'] > df_exp['EB I']) &
        (df_exp['EB III'] > df_exp['ProEB']) &
        (df_exp['EB III'] > df_exp['MEP'])
    ).to_numpy()

    # EB III with more unspliced signal than EB II, EB I and ProEB
    vec_uns = (
        (df_unsp['EB III'] > df_unsp['EB II']) &
        (df_unsp['EB III'] > df_unsp['EB I']) &
        (df_unsp['EB III'] > df_unsp['ProEB'])
    ).to_numpy()

    # EB III more highly expressed (spliced) than ProEB and MEP only
    vec_early = (
        (df_exp['EB III'] > df_exp['ProEB']) &
        (df_exp['EB III'] > df_exp['MEP'])
    ).to_numpy()

    # lower slope bound in EB III is positive / negative
    vec_pos = (df_min['EB III'] > 0).to_numpy()
    vec_neg = (df_min['EB III'] < 0).to_numpy()

    # lower slope bound in EB III lies above the upper slope bound in EB II
    vec_test = (df_min['EB III'] > df_max['EB II']).to_numpy()

    # a change in slope is considered significant when either rule holds
    vec_tot = (vec_mean & vec_pos & vec_test & vec_uns) | (vec_early & vec_neg & vec_uns)

    # Get MURK genes
    murk_genes = adata.var_names[vec_tot]

    return murk_genes

In [None]:
###########################
### Connectivity kernel ###
###########################
def ck_workflow(adata, suffix, compute=True):
    """Create and save, or load, a CellRank ``ConnectivityKernel``.

    With ``compute=True`` (the default) a kernel is built from ``adata`` and
    written, together with its AnnData, to
    ``data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck<suffix>.pickle``.
    With ``compute=False`` the kernel is read from that file and ``adata`` is
    not used.

    Returns the kernel that was created or loaded.
    """
    from cellrank.kernels import ConnectivityKernel

    # Load path: nothing is computed
    if not compute:
        return ConnectivityKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck'+suffix+'.pickle')

    # Compute path: build the kernel and persist it
    kernel = ConnectivityKernel(adata)
    kernel.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/ck'+suffix+'.pickle', write_adata=True)

    return kernel

In [None]:
#######################
### Palantir kernel ###
#######################
def _pk_top_scoring_cells(adata, gene_sets):
    """Score every gene set on ``adata`` and pick its highest-scoring cell.

    Each score is stored in ``adata.obs['t_score_<label lower-cased>']``.
    Returns a Series whose index holds the selected cell names and whose
    values hold the gene-set labels. ``gene_sets`` itself is not modified.
    """
    top_cells = {}
    for label, genes in gene_sets.items():
        score_key = 't_score_' + label.lower()
        sc.tl.score_genes(adata, genes, score_name=score_key)
        top_cells[label] = adata.obs[score_key].idxmax()
    return pd.Series(list(top_cells.keys()), index=list(top_cells.values()))


def pk_workflow(adata, suffix, s_genes, t_genes, compute=True):
    """Compute and save, or load, a CellRank ``PseudotimeKernel`` from Palantir.

    With ``compute=False`` the kernel is read from
    ``data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk<suffix>.pickle``
    and the other arguments are not used.

    With ``compute=True`` (the default):

    1. the data in ``adata.raw`` is restored, genes with at least one cell
       whose value is >= 1 are kept, counts are normalized, a 300-component
       PCA and the Palantir diffusion maps (10 components, knn=30) are run;
    2. for each entry of ``s_genes`` and ``t_genes`` the gene list is scored
       and the highest-scoring cell becomes the start / terminal cell; the
       labels are stored in ``obs['pt_cells']``;
    3. Palantir is run from the first start cell with the terminal cells and
       its pseudotime is stored in ``obs['ppt_pseudotime']``;
    4. the raw data is restored once more, the kernel is built on that
       pseudotime and written to the pickle path above with its AnnData.

    Parameters
    ----------
    adata
        AnnData with a populated ``.raw``.
    suffix
        String appended to ``pk`` in the pickle file name.
    s_genes, t_genes
        Mappings ``label -> list of gene names`` for start and terminal
        states. They are only read; the caller's dicts are left unchanged.
    compute
        ``True`` to recompute and overwrite the pickle, ``False`` to load it.

    Returns
    -------
    The ``PseudotimeKernel`` that was created or loaded.
    """
    from cellrank.kernels import PseudotimeKernel

    if not compute:
        return PseudotimeKernel.read(fname='data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk'+suffix+'.pickle')

    # Use original data
    adata = adata.raw.to_adata()
    adata.raw = adata

    # Subset genes
    adata = adata[:, (adata.X>=1).sum(axis=0)>=1].copy()

    # Normalize
    sc.pp.normalize_total(adata)

    # PCA
    sc.pp.pca(adata, n_comps=300)

    # Palantir diffusion maps based on PCA
    sce.tl.palantir(adata, n_components=10, knn=30)

    # Start and terminal cells (index: cell name, value: label)
    s_id = _pk_top_scoring_cells(adata, s_genes)
    t_id = _pk_top_scoring_cells(adata, t_genes)

    # Annotate the selected cells with their label
    landmarks = pd.concat([s_id, t_id])
    adata.obs = adata.obs.join(pd.DataFrame({'pt_cells': landmarks.tolist()}, index=landmarks.index), how='left')

    # Palantir results
    pr_res = sce.tl.palantir_results(
        adata,
        early_cell=s_id.index[0],
        terminal_states=t_id,
        ms_data='X_palantir_multiscale',
        num_waypoints=500,
        use_early_cell_as_start=False
    )

    # Get pseudotime
    adata.obs['ppt_pseudotime'] = pr_res.pseudotime

    # Use original data again for the kernel.
    # NOTE(review): relies on raw.to_adata() carrying over the obs columns
    # added above ('ppt_pseudotime' is read by the kernel) - confirm.
    adata = adata.raw.to_adata()
    adata.raw = adata

    # CellRank kernel
    pk = PseudotimeKernel(adata, time_key='ppt_pseudotime')

    pk.write('data/BSA_0355_SM01_10x_SPLENO/ANALYSIS/cellrank/kernel/pk'+suffix+'.pickle', write_adata=True)

    return pk

# Kernel Progenitor

In [None]:
# Narrower panels (2.5 x 5) for the kernel plots; scanpy first, then scVelo.
for plotting_lib in (sc, scv):
    plotting_lib.set_figure_params(
        figsize=(2.5, 5),
        dpi_save=1200,
        fontsize=10,
        color_map='magma',
        frameon=False,
    )

In [None]:
# Values of obs['leiden_cell_type_main'] kept in the progenitor subsets below
cell_type_prog = ['Meg', 'MEP', 'ProEB', 'EB I', 'EB II', 'EB III']

In [None]:
# Genes removed from the velocity genes when the *_2 velocity kernels are built
mep_genes = pd.read_csv(
    'result/dea/scRNAseq/wilcox/velocity_mep.csv',
    index_col=0,
)['genes']

## Kernel Progenitor (Baseline) 

In [None]:
# Subset anndata: progenitor cell types of the 'Baseline' condition.
# The mask is applied to `adata` directly and only the selected cells are
# copied, so the full object is not duplicated first.
adata_i = adata[(adata.obs.leiden_cell_type_main.isin(cell_type_prog)).values & (adata.obs.infection=='Baseline').values].copy()

In [None]:
# Neighbor graph of the subset, built on the 'latent' representation
sc.pp.neighbors(
    adata_i,
    use_rep='latent',
    n_neighbors=30,
)

In [None]:
# Baseline kernels; every call uses compute=False, i.e. loads the saved pickle.

# Velocity kernel on all velocity genes
vk_1 = vk_workflow(
    adata=adata_i,
    suffix='_prog_baseline_1',
    compute=False,
)

# Velocity kernel on the velocity genes of vk_1 minus mep_genes
vk_2 = vk_workflow(
    adata=vk_1.adata,
    suffix='_prog_baseline_2',
    compute=False,
    gene_subset=list(set(vk_1.adata.var_names[vk_1.adata.var.velocity_genes])-set(mep_genes)),
)

# Connectivity kernel
ck = ck_workflow(
    adata=adata_i,
    suffix='_prog_baseline',
    compute=False,
)

# Pseudotime (Palantir) kernel with one start and two terminal gene sets
pk = pk_workflow(
    adata=adata_i,
    suffix='_prog_baseline',
    compute=False,
    s_genes=dict(Start=['Cd34', 'Kit', 'Gata2']),
    t_genes=dict(Ery=['Hbb-bt'], Meg=['Pf4']),
)

In [None]:
# Velocity stream plots for the AnnData stored with vk_1
scv.pl.velocity_embedding_stream(
    vk_1.adata,
    basis='X_umap',
    color=[
        'leiden_cell_type_main',
        'latent_time',
        'velocity_pseudotime',
        'velocity_confidence',
        'root_cells',
        'end_points',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)

In [None]:
# Velocity stream plots for the AnnData stored with vk_2
scv.pl.velocity_embedding_stream(
    vk_2.adata,
    basis='X_umap',
    color=[
        'leiden_cell_type_main',
        'latent_time',
        'velocity_pseudotime',
        'velocity_confidence',
        'root_cells',
        'end_points',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)

In [None]:
# UMAP of the Palantir results stored with pk: pseudotime, the selected
# start/terminal cells and the gene-set scores they were picked from
sc.pl.umap(
    pk.adata,
    color=[
        'leiden_cell_type_main',
        'ppt_pseudotime',
        'pt_cells',
        't_score_start',
        't_score_ery',
        't_score_meg',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)

## Kernel Progenitor (CpG) 

In [None]:
# Subset anndata: progenitor cell types of the 'CpG' condition.
# The mask is applied to `adata` directly and only the selected cells are
# copied, so the full object is not duplicated first.
adata_i = adata[(adata.obs.leiden_cell_type_main.isin(cell_type_prog)).values & (adata.obs.infection=='CpG').values].copy()

In [None]:
# Neighbor graph of the subset, built on the 'latent' representation
sc.pp.neighbors(
    adata_i,
    use_rep='latent',
    n_neighbors=30,
)

In [None]:
# CpG kernels

# Velocity kernel on all velocity genes (loaded from its pickle)
vk_1 = vk_workflow(
    adata=adata_i,
    suffix='_prog_cpg_1',
    compute=False,
)

# Velocity kernel on the velocity genes of vk_1 minus mep_genes.
# NOTE(review): this is the only workflow call in the notebook with
# compute=True, so it recomputes and overwrites the pickle on every run -
# confirm that this is intended.
vk_2 = vk_workflow(
    adata=vk_1.adata,
    suffix='_prog_cpg_2',
    compute=True,
    gene_subset=list(set(vk_1.adata.var_names[vk_1.adata.var.velocity_genes])-set(mep_genes)),
)

# Connectivity kernel (loaded from its pickle)
ck = ck_workflow(
    adata=adata_i,
    suffix='_prog_cpg',
    compute=False,
)

# Pseudotime (Palantir) kernel (loaded from its pickle)
pk = pk_workflow(
    adata=adata_i,
    suffix='_prog_cpg',
    compute=False,
    s_genes=dict(Start=['Cd34', 'Kit', 'Gata2']),
    t_genes=dict(Ery=['Hbb-bt'], Meg=['Pf4']),
)

In [None]:
# Velocity stream plots for the AnnData stored with vk_1
scv.pl.velocity_embedding_stream(
    vk_1.adata,
    basis='X_umap',
    color=[
        'leiden_cell_type_main',
        'latent_time',
        'velocity_pseudotime',
        'velocity_confidence',
        'root_cells',
        'end_points',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)

In [None]:
# Velocity stream plots for the AnnData stored with vk_2
scv.pl.velocity_embedding_stream(
    vk_2.adata,
    basis='X_umap',
    color=[
        'leiden_cell_type_main',
        'latent_time',
        'velocity_pseudotime',
        'velocity_confidence',
        'root_cells',
        'end_points',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)

In [None]:
# UMAP of the Palantir results stored with pk: pseudotime, the selected
# start/terminal cells and the gene-set scores they were picked from
sc.pl.umap(
    pk.adata,
    color=[
        'leiden_cell_type_main',
        'ppt_pseudotime',
        'pt_cells',
        't_score_start',
        't_score_ery',
        't_score_meg',
    ],
    legend_loc='on data',
    ncols=6,
    size=100,
    use_raw=False,
)