In [None]:
# Mount Google Drive inside the Colab runtime so later cells can read files from it.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Feature detection with SCRaPL, Spearman and Pearson.

In [None]:
from IPython import display
import pandas as pd
import numpy as np
import numpy.ma as ma
import scipy
import scipy.stats
from scipy.stats import gaussian_kde
from scipy.stats import t

from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from matplotlib import cm
from matplotlib.colors import Normalize 
from matplotlib.offsetbox import AnchoredText

from sklearn.neighbors import KernelDensity

import tensorflow_probability as tfp
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

import pickle
from timeit import default_timer as timer
# Root folder of the project on the mounted Drive; the data and result pickle
# paths used by the loading cell are built by appending to this string.
Folder = '/content/drive/MyDrive/SCRaPL/'
# Dataset selector read by the loading and detection cells: 'NMT' or '10X'.
Techn = '10X'


Required Functions

In [None]:
#Numerical integral estimation
def integral(y, x,axis=0):
    """Trapezoidal-rule integral of 2-D samples ``y`` over a 2-D grid ``x``.

    The step is taken as ``(last - first) / (n - 1)`` along the integration
    axis, i.e. the grid is treated as uniformly spaced.

    Parameters
    ----------
    y : 2-D array of integrand values.
    x : 2-D array of grid coordinates, same layout as ``y``.
    axis : 0 integrates down the rows; any other value integrates along the
        columns (axis 1).

    Returns
    -------
    1-D array with one integral per column (axis 0) or per row (otherwise).
    Fewer than two sample points span no interval, so the result is 0 in that
    case (instead of NaN from a 0/0 step or an IndexError on an empty grid).
    """
    along = 0 if axis == 0 else 1
    n_pts = int(x.shape[along])

    if n_pts < 2:
        # Nothing to integrate over: avoid the 0/0 step and empty indexing.
        return np.zeros(y.shape[1 - along], dtype=float)

    if along == 0:
        span = x[-1,:] - x[0,:]
        end_points = (y[0,:] + y[-1,:])/2
        interior = np.sum(y[1:-1,:],axis=0)
    else:
        span = x[:,-1] - x[:,0]
        end_points = (y[:,0] + y[:,-1])/2
        interior = np.sum(y[:,1:-1],axis=1)

    dx = span / (n_pts - 1)
    return np.multiply(end_points + interior, dx)

#SCRaPL Feature Detection
def ft_detect(alpha,prob_bay):
    """Select features whose complementary probability exceeds ``alpha``.

    A feature is selected when ``1 - prob_bay > alpha``.

    Returns
    -------
    num_fts : number of selected features (scalar tensor).
    EFDR : mean of ``prob_bay`` over the selected features (NaN when nothing
        is selected, since the mean of an empty tensor is NaN).
    fts_ind : int16 tensor, 1 where the feature is selected and 0 elsewhere.
    """
    complement = 1 - prob_bay
    selected = tf.greater(complement, alpha)

    # Indices of the selected features, reused for both the count and the mean.
    selected_idx = tf.where(selected)

    num_fts = tf.shape(selected_idx)[0]
    EFDR = tf.reduce_mean(tf.gather(prob_bay, selected_idx))
    fts_ind = tf.cast(selected, dtype=tf.int16)

    return num_fts, EFDR, fts_ind

#Multiple hypothesis testing correction.  (Used for Spearman and Pearson)
#Taken from https://github.com/CoBiG2/cobig_misc_scripts/blob/master/FDR.py
#Author: Francisco Pina Martins <f.pinamartins@gmail.com>
#Taken from https://stackoverflow.com/a/21739593/3091595
def multiple_testing_correction(pvalues, correction_type="FDR"):
    """Adjust a 1-D collection of p-values for multiple testing.

    Parameters
    ----------
    pvalues : sequence or 1-D array of p-values.
    correction_type : one of
        ``"Bonferroni"``      - ``n * p``, capped at 1;
        ``"Bonferroni-Holm"`` - step-down Holm adjustment, made monotone in
                                the p-value order and capped at 1;
        ``"FDR"``             - Benjamini-Hochberg adjusted p-values.

    Returns
    -------
    1-D float array of adjusted p-values in the input order.

    Raises
    ------
    ValueError
        If ``correction_type`` is not one of the names above (previously an
        uninitialised buffer was returned silently).
    """
    pvalues = np.asarray(pvalues, dtype=float)
    sample_size = pvalues.shape[0]

    if correction_type == "Bonferroni":
        return np.minimum(sample_size * pvalues, 1.0)

    if correction_type not in ("Bonferroni-Holm", "FDR"):
        raise ValueError(
            "Unknown correction_type {!r}; expected 'Bonferroni', "
            "'Bonferroni-Holm' or 'FDR'".format(correction_type)
        )

    # Ascending order of the p-values; ranks are 1-based positions in it.
    order = np.argsort(pvalues, kind="stable")
    ranks = np.arange(1, sample_size + 1)
    sorted_p = pvalues[order]
    qvalues = np.empty(sample_size)

    if correction_type == "Bonferroni-Holm":
        scaled = (sample_size - ranks + 1) * sorted_p
        # Step-down: an adjusted value may never be smaller than the one of a
        # smaller p-value. fmax ignores NaN so one NaN does not spread.
        qvalues[order] = np.minimum(np.fmax.accumulate(scaled), 1.0)
    else:
        # Benjamini-Hochberg, AKA - FDR test
        scaled = (sample_size / ranks) * sorted_p
        # Step-up: walk from the largest p-value down, keeping the running
        # minimum. fmin ignores NaN so one NaN does not spread.
        qvalues[order] = np.fmin.accumulate(scaled[::-1])[::-1]

    return qvalues

#Estimate null hypothesis distribution for Pearson/Spearman. (Supplementary Materials S9.1)
def null_dist(df,null_thrs):
    """Density of the correlation estimate under the interval null |rho| <= null_thrs.

    A density ``f(r | rho)`` is evaluated on a grid of 499 ``r`` values in
    [-0.999, 0.999] and 1000 ``rho`` values in
    [-null_thrs - 0.001, null_thrs + 0.001]. It is then integrated over the
    ``rho`` values with ``|rho| <= null_thrs`` and rescaled so that it
    integrates to 1 over ``r``.

    Parameters
    ----------
    df : sample-size parameter of the density formula (scalar or size-1
        array). Callers in this notebook pass an observation count minus 2.
    null_thrs : half-width of the null interval for ``rho``.

    Returns
    -------
    p : 1-D array (length 499) with the normalised null density on ``r``.
    r : 1-D array (length 499) with the ``r`` grid.
    """
    r = np.linspace(-0.999,0.999,num=499,endpoint=True)
    rho = np.linspace(-null_thrs-0.001,null_thrs+0.001,num=1000,endpoint=True)
    r,rho = np.meshgrid(r,rho)

    # Log normalising constant. gammaln / np.pi replace the np.math alias,
    # which no longer exists in NumPy 2.0, and also accept array-valued df.
    z_nrm = (np.log(df-2) + scipy.special.gammaln(df-1)
             - 0.5*np.log(2*np.pi) - scipy.special.gammaln(df-0.5))
    # Log of the rho- and r-dependent factors, kept in log space before exponentiating.
    z = (0.5*(df-1)*np.log(1-np.square(rho))
         + 0.5*(df-4)*np.log(1-np.square(r))
         - (df-1.5)*np.log(1-np.multiply(rho,r)))
    f = np.exp(z+z_nrm)
    ff = scipy.special.hyp2f1(0.5,0.5,0.5*(2*df-1),0.5*(1+np.multiply(r,rho)))
    p = np.multiply(f,ff)

    # Integrate out rho over the null interval only (rows of the mesh).
    in_null = np.abs(rho[:,0])<=null_thrs
    p = integral(p[in_null,:],rho[in_null,:],0)
    # Rescale so the density integrates to one over the r grid.
    p = p/integral(p[:,np.newaxis],r[0,:][:,np.newaxis],0)
    return p,r[0,:]

#Pearson correlation ignoring readings with zeros methylation coverage
def nancorrcoef(x,y,num_obs):
    """Row-wise Pearson correlation between ``x`` and ``y`` that skips NaNs.

    Each row is standardised with its NaN-aware mean and sample standard
    deviation (``ddof=1``); the NaN-skipping sum of the products is then
    divided by ``num_obs - 1``.

    Parameters
    ----------
    x, y : 2-D arrays of the same shape, one row per feature.
    num_obs : number of observations per row, broadcastable against a
        column vector.

    Returns
    -------
    Column vector (one row per feature) of correlation coefficients.
    """
    def _standardise(values):
        # Centre and scale every row, ignoring NaN entries.
        row_mean = np.nanmean(values, axis=1)[:, np.newaxis]
        row_std = np.nanstd(values, axis=1, ddof=1)[:, np.newaxis]
        return (values - row_mean) / row_std

    cross_products = _standardise(x) * _standardise(y)
    summed = np.nansum(cross_products, axis=1)[:, np.newaxis]
    return summed / (num_obs - 1)

Load Data

In [None]:
def _load_pickle(relative_path):
    """Unpickle the file at ``Folder + 'Demo/' + relative_path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code stored in the
    file; only point ``Folder`` at data you trust.
    """
    with open(Folder+'Demo/'+relative_path,'rb') as handle:
        return pickle.load(handle)


if Techn == 'NMT':

    yy_met = _load_pickle('Data/yy_met_300_100_'+Techn+'.pickle')
    yy_exp = _load_pickle('Data/yy_exp_300_100_'+Techn+'.pickle')
    CpG = _load_pickle('Data/yy_cpg_300_100_'+Techn+'.pickle')
    nrm = _load_pickle('Data/Norm_300_100_'+Techn+'.pickle')

    n_genes = yy_exp.shape[0]
    yy_met_nrm = tf.divide(yy_met,CpG).numpy()
    yy_exp_nrm = tf.divide(yy_exp,nrm).numpy()
    CpG_np = CpG.numpy()

    # Entries with zero CpG coverage are marked missing in both normalised
    # matrices, so the NaN-aware correlations skip the same positions.
    CpG_np[CpG_np ==0] = np.nan
    yy_met_nrm[np.isnan(CpG_np)] = np.nan
    yy_exp_nrm[np.isnan(CpG_np)] = np.nan
    # Number of non-missing entries per row, kept as a column vector.
    num_obs = np.sum(~np.isnan(CpG_np),axis=1)[:,np.newaxis]

elif Techn == '10X':

    yy_acc = _load_pickle('Data/yy_acc_300_100_'+Techn+'.pickle')
    yy_exp = _load_pickle('Data/yy_exp_300_100_'+Techn+'.pickle')
    nrm_acc = _load_pickle('Data/Norm_acc_300_100_'+Techn+'.pickle')
    nrm_exp = _load_pickle('Data/Norm_exp_300_100_'+Techn+'.pickle')

    n_genes = yy_exp.shape[0]
    yy_acc_nrm = tf.divide(yy_acc,nrm_acc).numpy()
    yy_exp_nrm = tf.divide(yy_exp,nrm_exp).numpy()

else:

    # Stop here: carrying on would only fail later on undefined variables.
    raise ValueError("Unknown Techn {!r}; please choose 'NMT' or '10X'".format(Techn))

# Stored correlation results for the selected technology, as a NumPy array.
cor = _load_pickle('Results/nuts_cor_300_100_'+Techn+'.pickle').numpy()


SCRaPL Feature Detection

In [None]:
#Kernel Density Estimation for posterior correlation samples
x_support = np.linspace(-3, 3, 400)
# One Gaussian KDE per column of `cor`, fitted on that column's samples.
# (The estimator used to be fitted on `x_support` itself, which made every
# feature's density, and hence every tail probability, identical.)
# NOTE(review): assumes `cor` is (n_samples, n_genes) and on the same scale as
# the 2*atanh(gam) threshold used below - confirm against the sampler output.
kde_obj= np.apply_along_axis(lambda x: KernelDensity(bandwidth=0.1, kernel='gaussian').fit(x[:,None]),0, cor)

#Estimate tail probability
p_bay  = np.zeros(n_genes)
gam = .115
# Grid points inside the null band |x| < 2*atanh(gam); identical for every
# gene, so computed once. np.arctanh replaces np.math.atanh (gone in NumPy 2.0).
in_null_band = np.abs(x_support)<2*np.arctanh(gam)
for ii in range(n_genes):
    density =  np.exp(kde_obj[ii].score_samples(x_support[:,None]))
    # integral() returns a length-1 array here; take the scalar explicitly.
    p_bay[ii] =  integral(density[None,in_null_band], x_support[None,in_null_band],axis=1)[0]

#Grid search for optimal alpha value estimation and feature detection.
alpha = np.linspace(0.0, 1.0, 201)
for ii in range(len(alpha)):
    num_features,EFDR,ft_ind = ft_detect(alpha[ii],p_bay)
    # Stop at the first (smallest) alpha whose expected FDR is below 10%.
    if EFDR<0.1:
          summary = tf.stack([num_features.numpy(),EFDR.numpy(),alpha[ii]]).numpy()
          print(summary)
          break
else:
    # Reached only when no alpha on the grid met the EFDR target, so the
    # message is printed once rather than once per rejected alpha.
    print("No significant feature detected")

[3.00e+02 7.25e-02 0.00e+00]


Frequentist Feature Detection

In [None]:
# Raw p-values default to 1 (no evidence) until computed below.
t_prs = np.ones(n_genes)
t_spr = np.ones(n_genes)


def _two_sided_tail_pvalue(null_density, r_grid, corr_value):
    """Two-sided p-value of ``corr_value`` under ``null_density`` on ``r_grid``.

    Integrates the density over ``r <= -|corr_value|`` and doubles it.
    Returns 1.0 for a NaN correlation, and 0.0 when fewer than two grid points
    lie in the tail (|corr_value| beyond the grid), where ``integral`` used to
    fail.
    """
    magnitude = np.abs(np.ravel(corr_value)[0])
    if np.isnan(magnitude):
        return 1.0
    tail = r_grid <= -magnitude
    if np.count_nonzero(tail) < 2:
        return 0.0
    area = integral(null_density[tail][:,np.newaxis], r_grid[tail][:,np.newaxis], 0)
    return 2*float(area[0])


if Techn =='10X':

      cor_prs = np.diag(np.corrcoef(yy_acc_nrm,yy_exp_nrm)[:n_genes,n_genes:] )
      cor_spr, _ = scipy.stats.spearmanr(yy_acc_nrm,yy_exp_nrm,axis=1)
      cor_spr =  np.diag(cor_spr[:n_genes,n_genes:])

      # Each correlation above is taken across the columns of a row, so the
      # null distribution is built from the column count. (It used to be built
      # from n_genes, unlike the other branch, which uses per-row counts.)
      n_cells = yy_acc_nrm.shape[1]
      p_a,r=null_dist(n_cells-2,gam)
      for ii in range(n_genes):
            t_spr[ii] = _two_sided_tail_pvalue(p_a, r, cor_spr[ii])
            t_prs[ii] = _two_sided_tail_pvalue(p_a, r, cor_prs[ii])

else:

      cor_prs = nancorrcoef(yy_met_nrm,yy_exp_nrm,num_obs)
      cor_spr = np.zeros(n_genes)
      for ii in range(n_genes):
          cor_spr[ii] , _ = scipy.stats.spearmanr(yy_met_nrm[ii,:],yy_exp_nrm[ii,:],nan_policy='omit')

      # The null distribution depends only on the observation count, so it is
      # computed once per distinct count instead of once per gene.
      null_by_count = {}
      for ii in range(n_genes):
          n_ii = int(num_obs[ii,0])
          if n_ii not in null_by_count:
              null_by_count[n_ii] = null_dist(n_ii-2,gam)
          p_a, r = null_by_count[n_ii]

          t_spr[ii] = _two_sided_tail_pvalue(p_a, r, cor_spr[ii])
          t_prs[ii] = _two_sided_tail_pvalue(p_a, r, cor_prs[ii])

# Benjamini-Hochberg adjustment and detection count at the 10% level
# (identical for both technologies).
p_adj_prs = multiple_testing_correction(t_prs, correction_type = "FDR")
p_adj_spr = multiple_testing_correction(t_spr, correction_type = "FDR")

print("Prs_det_ft:{},Spr_det_ft:{}".format(np.sum(p_adj_prs<0.1),np.sum(p_adj_spr<0.1)))

Prs_det_ft:23,Spr_det_ft:24
