# RNA-velocity MURK gene identification 

[Barile et al. 2021](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02414-y) showed that transcription kinetics are not constant along erythroid development, which violates the constant-rate assumption of scVelo. We likewise expect multiple rate kinetics (MURK) genes between treatment conditions and adapt their [procedure](https://github.com/mebarile/Gata1_Erythroid_kinetics) to identify them.

In [188]:
import scvelo as scv
import numpy as np
import pandas as pd
import scipy
import scanpy as sc

import os

In [189]:
# Shared plotting configuration for scanpy and scvelo figures.
sc.settings.vector_friendly = False

figure_params = dict(
    dpi=300,
    dpi_save=300,
    frameon=False,
    figsize=(7, 4),
    format='png',
    fontsize=25,
)
scv.set_figure_params(**figure_params)

In [190]:
# Work from the project directory; the data and result paths used below are
# given relative to it.
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

# Import data 

In [191]:
# Inputs: the velocyto AnnData object, plus the per-cell metadata and the
# UMAP coordinates stored as CSV (first column = cell index).
meta_csv = 'data/object/int/meta/meta.csv'
umap_csv = 'data/object/int/reductions/X_umap/reduction.csv'

adata = sc.read_h5ad('data/object/velocyto.h5ad')
obs, obsm = (pd.read_csv(csv_path, index_col=0) for csv_path in (meta_csv, umap_csv))

# Filter velocity matrix by cell types from meta 

In [192]:
# Keep only the cells annotated as one of the six 'Ery' populations.
population_names = ['Ery (1)', 'Ery (2)', 'Ery (3)', 'Ery (4)', 'Ery (5)', 'Ery (6)']
is_ery = obs['leiden_annotation'].isin(population_names)
obs = obs.loc[is_ery]

# Restrict the embedding table to the cells that survived the filter.
in_filtered_obs = obsm.index.isin(obs.index)
obsm = obsm.loc[in_filtered_obs]

In [193]:
# Keep only the cells that are present in the filtered metadata.
# .copy() turns the slice into an independent AnnData object instead of a
# view, so the later assignments to .obs / .obsm do not trigger an implicit
# copy of the view.
adata = adata[adata.obs.index.isin(obs.index)].copy()

In [194]:
# Bring both tables into the cell order of the velocity object, then attach
# them as its annotation and UMAP embedding.
cell_order = adata.obs.index
obs = obs.reindex(cell_order)
obsm = obsm.reindex(cell_order)

adata.obs = obs
adata.obsm['X_umap'] = obsm

In [None]:
# Snapshot of the filtered, still unprocessed object; later steps restart
# from a fresh copy of it (adata_temp.copy()).
adata_temp = adata.copy()

# Highly variable genes

In [None]:
def hvg_select(subset, adata):
    """Return the highly variable genes of one treatment group.

    Two gene sets are computed on the cells whose ``obs['treatment']`` equals
    ``subset`` and are merged as a union:

    1. genes flagged by ``sc.pp.highly_variable_genes`` (``min_mean=0.05``,
       ``min_disp=0.1``) after total-count normalisation to 1e4 and
       ``log1p``;
    2. genes retained by ``scv.pp.filter_and_normalize`` with
       ``min_shared_counts=20`` and ``n_top_genes=2000``.

    Parameters
    ----------
    subset : str
        Value of ``adata.obs['treatment']`` selecting the cells to use.
    adata : AnnData
        Input object, presumably holding raw counts since it is normalised
        here. It is not modified: each selection runs on its own copy.

    Returns
    -------
    list
        Union of both gene sets, in arbitrary (set) order.
    """
    in_subset = adata.obs['treatment'] == subset

    # scanpy selection. Work on an explicit copy so the in-place
    # normalisation never touches a view of the caller's object.
    adata_sc = adata[in_subset].copy()
    sc.pp.normalize_total(adata_sc, target_sum=1e4)
    sc.pp.log1p(adata_sc)
    sc.pp.highly_variable_genes(adata_sc, min_mean=0.05, min_disp=0.1)
    hvg_1 = adata_sc.var_names[adata_sc.var.highly_variable].tolist()

    # scvelo selection. Start again from the unmodified argument rather than
    # from a module-level snapshot, so the function only depends on its
    # inputs. filter_and_normalize subsets the genes in place.
    adata_scv = adata[in_subset].copy()
    scv.pp.filter_and_normalize(adata_scv, min_shared_counts=20, n_top_genes=2000)
    hvg_2 = adata_scv.var_names.tolist()

    return list(set(hvg_1) | set(hvg_2))

In [None]:
# Highly variable genes, selected separately for each treatment group.
hvg_nacl, hvg_cpg = (hvg_select(treatment, adata) for treatment in ("NaCl", "CpG"))

In [None]:
# Genes that are highly variable in both treatment groups.
# Sorted because plain set iteration order changes between interpreter runs,
# which would make the gene order (and the exported files) irreproducible.
hvg = sorted(set(hvg_nacl) & set(hvg_cpg))

# Moments imputation 

In [None]:
# Restart from the unprocessed snapshot, restricted to the shared HVGs.
# Subsetting before copying avoids duplicating the full matrix and yields an
# independent object (not a view) for the in-place preprocessing that follows.
adata = adata_temp[:, hvg].copy()

In [None]:
# scvelo preprocessing with default settings: filter/normalise the counts,
# then compute the moments; the 'Ms' and 'Mu' layers used below are read
# from adata.layers after this step.
scv.pp.filter_and_normalize(adata)
scv.pp.moments(adata)

In [None]:
# Cell x gene tables of the 'Ms' (-> mat_s) and 'Mu' (-> mat_u) layers,
# labelled with cell and gene names.
mat_s, mat_u = (
    pd.DataFrame(
        adata.layers[layer_key].copy(),
        index=adata.obs_names,
        columns=adata.var_names,
    )
    for layer_key in ('Ms', 'Mu')
)

In [None]:
# Empty gene-indexed tables: slopes (df_sl) and slope errors (df_ds);
# one column per population is added by the fitting loop.
df_sl, df_ds = (pd.DataFrame(index=adata.var_names) for _ in range(2))

In [None]:
# Per population and per gene: least-squares fit of mat_u (y) against
# mat_s (x); the slope goes to df_sl and its standard error to df_ds.
for pop in population_names:

    vec = adata.obs['leiden_annotation'] == pop
    n = np.sum(vec)

    x = mat_s.loc[vec]
    y = mat_u.loc[vec]

    # Per-gene means. The axis is spelled out on purpose: np.mean(DataFrame)
    # forwards axis=None, which pandas >= 2.0 reduces over BOTH axes to a
    # single scalar, silently centring every gene on the global mean.
    mx = x.mean(axis=0)
    my = y.mean(axis=0)
    x_mx = x - mx
    y_my = y - my

    xy = (x_mx * y_my).sum(axis=0)
    X2 = (x_mx ** 2).sum(axis=0)
    sl = xy / X2
    inter = my - sl * mx

    df_sl[pop] = sl

    # Residual sum of squares of the fitted line, per gene.
    pred = sl * x + inter
    sse = (pred - y) ** 2
    SSE = sse.sum(axis=0)

    # Standard error of the slope: sqrt(SSE / (n - 2) / sum((x - mean(x))^2)).
    ds = np.sqrt(SSE / (n - 2) / X2)
    df_ds[pop] = ds

In [None]:
# Empty gene-indexed tables: mean expression (df_exp), its error (df_de)
# and the t-distribution quantile (df_quan); one column per population.
df_exp, df_de, df_quan = (pd.DataFrame(index=adata.var_names) for _ in range(3))

In [None]:
# Per population: mean of mat_s for every gene (df_exp), the error of that
# mean (df_de) and the 0.95 quantile of Student's t with n - 1 degrees of
# freedom (df_quan, same value for every gene).
for pop in population_names:
    vec = adata.obs['leiden_annotation'] == pop
    n = np.sum(vec)
    spliced_pop = mat_s.loc[vec]

    expr = spliced_pop.mean(axis=0)
    # ddof=0: population standard deviation, as np.std computes it.
    dex = spliced_pop.std(axis=0, ddof=0) / np.sqrt(n)

    df_exp[pop] = expr
    df_de[pop] = dex
    df_quan[pop] = scipy.stats.t.ppf(0.95, n - 1)

In [None]:
# Lower and upper estimates of every slope: slope -/+ (standard error x t quantile).
df_conf = df_ds * df_quan
df_min, df_max = df_sl - df_conf, df_sl + df_conf

In [None]:
# Population labels; same list as the one defined when the metadata was filtered.
population_names = ['Ery (1)', 'Ery (2)', 'Ery (3)', 'Ery (4)', 'Ery (5)', 'Ery (6)']

In [None]:
# A gene is flagged when its mean expression is highest in 'Ery (6)' and either
#   (a) its slope in 'Ery (6)' is positive, larger than in 'Ery (5)', and the
#       lower estimate for 'Ery (6)' exceeds the upper estimate for 'Ery (5)', or
#   (b) its slope in 'Ery (6)' is negative.
last_pop = 'Ery (6)'

vec_mean = df_exp[last_pop] > df_exp['Ery (5)']
for earlier_pop in ['Ery (4)', 'Ery (3)', 'Ery (2)', 'Ery (1)']:
    vec_mean = vec_mean & (df_exp[last_pop] > df_exp[earlier_pop])

vec_sl = df_sl[last_pop] > df_sl['Ery (5)']
vec_pos = df_sl[last_pop] > 0
vec_test = df_min[last_pop] > df_max['Ery (5)']

slope_increases = np.array(vec_sl) & np.array(vec_mean) & np.array(vec_pos) & np.array(vec_test)
slope_negative = vec_mean & np.array(df_sl[last_pop] < 0)
vec_tot = slope_increases | slope_negative

In [None]:
# Number of genes flagged as MURK
vec_tot.sum()

In [None]:
# Count the flagged genes straight from the mask: `murk_genes` is only
# assigned in a later cell, so referencing it here raises NameError when the
# notebook is run top to bottom.
len(adata.var_names[vec_tot])

In [None]:
# MURK genes for GO analysis Figure 3D
murk_genes = adata.var_names[vec_tot]
# One gene per line, without header or row index. `index` and `header` are
# documented as booleans, so pass False instead of relying on None being falsy.
# NOTE(review): the file name 'resultmurk_genes_mouse.csv' looks like a
# missing path separator - confirm the intended location.
pd.DataFrame(murk_genes).to_csv('result/scvelo/resultmurk_genes_mouse.csv', index=False, header=False)

In [None]:
# Recompute the (absolute) slopes on data rescaled by each gene's maximum
# within the population, in order to rank the genes by "MURKiness".

df_sl2 = pd.DataFrame(index=adata.var_names)

for pop in population_names:
    print(pop)
    vec = adata.obs['leiden_annotation'] == pop

    spliced_pop = mat_s.loc[vec]
    unspliced_pop = mat_u.loc[vec]

    # Per-gene maxima and means. The axis is spelled out on purpose:
    # np.max / np.mean on a DataFrame forward axis=None, which pandas >= 2.0
    # reduces over BOTH axes to one scalar instead of one value per gene.
    x = spliced_pop / spliced_pop.max(axis=0)
    y = unspliced_pop / unspliced_pop.max(axis=0)

    mx = x.mean(axis=0)
    my = y.mean(axis=0)
    x_mx = x - mx
    y_my = y - my

    # Absolute covariance term: only the magnitude of the slope is kept.
    xy = np.abs((x_mx * y_my).sum(axis=0))
    X2 = (x_mx ** 2).sum(axis=0)
    sl = xy / X2

    df_sl2[pop] = sl

In [None]:
# Gene list to compare against: the first CSV column becomes the index and
# the first row is consumed as a header.
# NOTE(review): read relative to the working directory, not from the file
# written by the export step - confirm this is the intended reference list.
test = pd.read_csv('murk_genes_mouse.csv', header=0, index_col=0).index

In [None]:
# Number of genes in the list read from 'murk_genes_mouse.csv'
len(test)

In [None]:
# Number of genes flagged in this analysis
len(murk_genes)

In [None]:
# Flagged genes that also occur in the list read from 'murk_genes_mouse.csv'
murk_genes[murk_genes.isin(test)]

In [None]:
# Display the full set of flagged genes
murk_genes