In [None]:
# Mount the user's Google Drive at /content/drive so the notebook can read its
# inputs from, and write its results to, paths below that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


This script is used to estimate the number of features with strong regulatory action using Pearson correlation in brain data.

In [None]:
from IPython import display
import pandas as pd
import numpy as np
import numpy as np
import numpy.ma as ma
import scipy
import scipy.stats
from scipy.stats import gaussian_kde
from scipy.stats import t

from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from matplotlib import cm
from matplotlib.colors import Normalize 
from matplotlib.offsetbox import AnchoredText

from tensorflow import keras
from sklearn.neighbors import KernelDensity

from tensorflow.keras import layers
import tensorflow_probability as tfp
import tensorflow.compat.v2 as tf
# `tf` here is the `tensorflow.compat.v2` alias imported on the previous line;
# this call asks that alias to turn on TensorFlow 2.x behaviour.
tf.enable_v2_behavior()

import pickle
from timeit import default_timer as timer
# Base directory on the mounted Drive. Every input and output path used below
# is built by string-concatenating a relative path onto this prefix, so the
# trailing slash is required.
Folder = '/content/drive/MyDrive/'

Load the data, estimate the Pearson correlations, and construct the null-hypothesis distribution.

In [None]:
# Read the three input tables from the mounted Drive.
# The delimiter is passed as `sep=`: pandas >= 2.0 makes every read_csv
# argument after the path keyword-only, so the positional ',' used previously
# raises TypeError there.
# For the first two files, row 0 is the header and column 0 is the row index.
# (presumably accessibility / expression matrices with one row per feature and
# one column per cell -- TODO confirm against the data files)
yy_acc_pd = pd.read_csv(Folder + 'SCRaPL/Real/Data/Acc_atac.csv', sep=',', header=[0], index_col=[0])
yy_exp_pd = pd.read_csv(Folder + 'SCRaPL/Real/Data/Rna_atac.csv', sep=',', header=[0], index_col=[0])
# The normalisation table is indexed by its *second* column; its first column
# is then dropped from the frame.
Norm = pd.read_csv(Folder + 'SCRaPL/Real/Data/Nrm_atac.csv', sep=',', index_col=[1])
Norm = Norm.drop(columns=Norm.columns[0])

In [None]:
# Divide each table by its normalisation factors (DataFrame.div with a Series
# aligns the Series index against the frame's column labels) and drop to NumPy.
tt1 = yy_acc_pd.div(Norm['nrm_acc']).to_numpy()
tt2 = yy_exp_pd.div(Norm['nrm_rna']).to_numpy()
# np.corrcoef(x, y) treats every row of x and of y as one variable and returns
# the correlation matrix of the stacked rows.
corr = np.corrcoef(tt1, tt2)
# The top-right block holds corr(tt1 row i, tt2 row j); its diagonal is the
# per-feature Pearson correlation between matching rows. The block boundary is
# taken from the data instead of a hard-coded 4249, so a table with a different
# number of rows cannot silently pair the wrong features.
crr_prs = np.diag(corr[:tt1.shape[0], tt1.shape[0]:])
# Number of columns of the normalised matrix; passed to null_dist() later.
num_obs = tt1.shape[1]

In [None]:
#Taken from https://github.com/CoBiG2/cobig_misc_scripts/blob/master/FDR.py
#Author: Francisco Pina Martins <f.pinamartins@gmail.com>
#Taken from https://stackoverflow.com/a/21739593/3091595

def multiple_testing_correction(pvalues, correction_type="FDR"):
    """Adjust a 1-D collection of p-values for multiple testing.

    Parameters
    ----------
    pvalues : array-like of float, 1-D
        Raw p-values.
    correction_type : {"FDR", "Bonferroni", "Bonferroni-Holm"}
        "FDR" is the Benjamini-Hochberg step-up procedure, "Bonferroni" the
        plain Bonferroni bound and "Bonferroni-Holm" Holm's step-down
        procedure.

    Returns
    -------
    numpy.ndarray
        Adjusted p-values, in the same order as ``pvalues``, capped at 1.

    Raises
    ------
    ValueError
        If ``correction_type`` is not one of the supported names. (Previously
        an unknown name silently returned an uninitialised array.)
    """
    import numpy as np

    pvalues = np.asarray(pvalues, dtype=float)
    sample_size = pvalues.shape[0]

    if correction_type == "Bonferroni":
        # An adjusted p-value is still a probability, hence the cap at 1.
        return np.minimum(sample_size * pvalues, 1.0)

    if correction_type == "Bonferroni-Holm":
        # Ascending order: the k-th smallest p-value (k = 0, 1, ...) is
        # multiplied by (n - k).
        order = np.argsort(pvalues, kind="stable")
        scaled = (sample_size - np.arange(sample_size)) * pvalues[order]
        # A running maximum keeps the adjusted values non-decreasing in the
        # raw p-value; without it a larger p-value could receive a smaller
        # adjusted value than a smaller one.
        adjusted = np.minimum(np.maximum.accumulate(scaled), 1.0)
        qvalues = np.empty(sample_size)
        qvalues[order] = adjusted
        return qvalues

    if correction_type == "FDR":
        # Descending order: the largest p-value has rank n, the smallest rank 1.
        order = np.argsort(pvalues, kind="stable")[::-1]
        ranks = sample_size - np.arange(sample_size)
        scaled = (sample_size / ranks) * pvalues[order]
        # Running minimum from the largest p-value downwards enforces
        # monotonicity of the adjusted values (step-up procedure).
        adjusted = np.minimum(np.minimum.accumulate(scaled), 1.0)
        qvalues = np.empty(sample_size)
        qvalues[order] = adjusted
        return qvalues

    raise ValueError(
        "unknown correction_type %r; expected 'FDR', 'Bonferroni' or "
        "'Bonferroni-Holm'" % (correction_type,)
    )


In [None]:
def integral(y, x, axis=0):
    """Trapezoid-rule integral of 2-D samples ``y`` over the 2-D grid ``x``.

    The step is taken as (last grid value - first grid value) / (points - 1)
    along the integration axis, i.e. the grid is treated as evenly spaced.

    Parameters
    ----------
    y, x : 2-D arrays
        Function samples and the matching grid values.
    axis : int
        0 integrates down the rows (one result per column); any other value
        integrates along the columns (one result per row).

    Returns
    -------
    1-D array with one integral per column (axis 0) or per row (otherwise).
    """
    if axis != 0:
        n_points = int(x.shape[1])
        span = x[:, -1] - x[:, 0]
        half_ends = (y[:, 0] + y[:, -1]) / 2
        interior = np.sum(y[:, 1:-1], axis=1)
    else:
        n_points = int(x.shape[0])
        span = x[-1, :] - x[0, :]
        half_ends = (y[0, :] + y[-1, :]) / 2
        interior = np.sum(y[1:-1, :], axis=0)

    step = span / (n_points - 1)
    # End points carry weight 1/2, interior points weight 1.
    return np.multiply(half_ends + interior, step)

# Estimate the null-hypothesis distribution for the Pearson correlation.
def null_dist(df, null_thrs):
    """Density of the sample correlation r under an interval null |rho| <= null_thrs.

    A density is evaluated on a (rho, r) grid, integrated over rho with the
    trapezoid helper ``integral`` (equal weight for every rho in the interval),
    and then renormalised so that it integrates to one over the r grid.

    NOTE(review): the integrand looks like the exact sampling density of
    Pearson's r for bivariate-normal data (the Gauss-hypergeometric form), with
    ``df`` acting as the number of observations -- confirm against the
    reference used for the analysis.

    Parameters
    ----------
    df : number
        Sample-size parameter of the density (the caller passes the number of
        columns of the data matrix).
    null_thrs : float
        Half-width of the null interval for the true correlation rho.

    Returns
    -------
    p : 1-D array, length 1999
        Null density evaluated on ``r``.
    r : 1-D array, length 1999
        Grid of sample-correlation values from -0.999 to 0.999.
    """
    # `np.math` was only an alias of the stdlib module and no longer exists in
    # NumPy 2.0, so lgamma / pi are taken from `math` directly.
    import math

    r_axis = np.linspace(-0.999, 0.999, num=1999, endpoint=True)
    # The rho grid is padded by 0.001 on both sides; rows outside the interval
    # are masked out again before integrating.
    rho_axis = np.linspace(-null_thrs - 0.001, null_thrs + 0.001, num=1000, endpoint=True)
    # Default 'xy' indexing: rows follow rho, columns follow r.
    r, rho = np.meshgrid(r_axis, rho_axis)

    # Everything is assembled in log space and exponentiated once.
    z_nrm = (
        np.log(df - 2)
        + math.lgamma(df - 1)
        - 0.5 * np.log(2 * math.pi)
        - math.lgamma(df - 0.5)
    )
    z = (
        0.5 * (df - 1) * np.log(1 - np.square(rho))
        + 0.5 * (df - 4) * np.log(1 - np.square(r))
        - (df - 1.5) * np.log(1 - np.multiply(rho, r))
    )
    f = np.exp(z + z_nrm)
    ff = scipy.special.hyp2f1(0.5, 0.5, 0.5 * (2 * df - 1), 0.5 * (1 + np.multiply(r, rho)))
    p = np.multiply(f, ff)

    # Integrate over the rho rows that lie inside the null interval.
    in_null = np.abs(rho_axis) <= null_thrs
    p = integral(p[in_null, :], rho[in_null, :], 0)
    # Renormalise to unit area over the r grid.
    p = p / integral(p[:, np.newaxis], r_axis[:, np.newaxis], 0)
    return p, r_axis

In [None]:
# Two-sided p-value of every observed correlation under the interval null.
# NOTE: the name `t` rebinds the `t` imported from scipy.stats at the top of
# the notebook; it is kept because later cells read the p-values from `t`.
null_thrs = 0.145

# The null density depends only on (num_obs, null_thrs), so it is built once
# here instead of being recomputed on every loop iteration.
p_a, r = null_dist(num_obs, null_thrs)

# One slot per observed correlation (sized from the data, not hard-coded).
t = np.ones(crr_prs.shape[0])

for ii in range(t.shape[0]):
    # Lower-tail cut-off at -|r_obs|; the tail mass is doubled for a two-sided test.
    thrs = -abs(crr_prs[ii])
    sp = r[r <= thrs]
    # integral() returns a length-1 array; take its single element explicitly
    # (assigning a 1-element array to a scalar slot is deprecated in NumPy >= 1.25).
    t[ii] = 2 * integral(p_a[r <= thrs][:, np.newaxis], sp[:, np.newaxis], 0)[0]


In [None]:
# Adjust the raw p-values in `t` for multiple testing using the "FDR"
# (Benjamini-Hochberg) branch of multiple_testing_correction.
p_adj = multiple_testing_correction(t, correction_type = "FDR")

In [None]:
# Persist the results under the Drive folder: raw p-values (`t`) first, then
# the multiple-testing-adjusted p-values (`p_adj`), each as a text/CSV file.
np.savetxt(Folder+'SCRaPL/Real/Paper_Analysis/Pearson_meta/brain_p_value.csv', t, delimiter=",")
np.savetxt(Folder+'SCRaPL/Real/Paper_Analysis/Pearson_meta/brain_p_value_adj.csv', p_adj, delimiter=",")