In [1]:
import math
import numpy as np
import os
import pandas as pd
import subprocess

"""
Correcting for multiple testing of ANOVA p-values.
We restrict to eSTRs exclusively because we only care
to compare eSTRs with nearby eSNPs.

"""
# Root directory of the per-tissue analysis; each tissue name below is used
# as a sub-directory of this path when the result files are read.
DATADIR = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/'

# Full tissue name -> abbreviated label.
SHORTEN = dict([
    ("Artery-Aorta", "Artery.A"),
    ("Artery-Tibial", "Artery.T"),
    ("Adipose-Subcutaneous", "Adipose.S"),
    ("Adipose-Visceral", "Adipose.V"),
    ("Brain-Caudate(basalganglia)", "Caudate"),
    ("Brain-Cerebellum", "Cerebellum"),
    ("Cells-Transformedfibroblasts", "Fibroblast"),
    ("Esophagus-Mucosa", "Mucosa"),
    ("Esophagus-Muscularis", "Muscularis"),
    ("Heart-LeftVentricle", "Ventricule"),
    ("Lung", "Lung"),
    ("Muscle-Skeletal", "Muscle"),
    ("Nerve-Tibial", "Nerve.T"),
    ("Skin-NotSunExposed(Suprapubic)", "SkinUnexposed"),
    ("Skin-SunExposed(Lowerleg)", "SkinLeg"),
    ("Thyroid", "Thyroid"),
    ("WholeBlood", "Blood"),
])

# Alphabetically ordered list of the full tissue names (the dict keys).
tissues = sorted(SHORTEN)

In [14]:
# For every tissue: restrict the ANOVA results to eSTRs (regression
# qvalue < 0.1), FDR-correct their ANOVA p-values through the external
# ./fdr_correct.r script, report how many pass the delta-AIC / delta-BIC
# checks, and write the ANOVA table with the adjusted q-values added.
for T in tissues:

    print(T)

    # Regression results for this tissue; 'id' (gene + STR start) is the key
    # used to match rows between the regression and ANOVA tables.
    linreg = pd.read_csv(os.path.join(DATADIR, T, "PQValues"), sep="\t")
    linreg['id'] = linreg['gene'] + "_" + linreg['str.start'].astype(str)
    estrs = linreg.loc[linreg['qvalue'] < 0.1]

    anova = pd.read_csv(os.path.join(DATADIR, T, 'HH/anova_wg.csv'))
    anova['id'] = anova['gene'] + "_" + anova['str.start'].astype(str)

    # The inner merge keeps only ANOVA rows whose id is present in linreg and
    # attaches the regression qvalue, so no separate isin() filter is needed.
    to_correct = anova.merge(linreg[['id', 'qvalue']], on='id')
    print(to_correct.shape)

    # Keep eSTRs only, then drop rows whose ANOVA p-value is NaN or infinite.
    to_correct = to_correct.loc[to_correct['qvalue'] < 0.1].copy()
    to_correct = to_correct[np.isfinite(to_correct['anova_pval'])].copy()
    print(to_correct.shape, '*- after NA values out... ',linreg.shape,estrs.shape, anova.shape, to_correct.loc[to_correct['anova_pval']<0.05].shape[0]*100/estrs.shape[0])

    #FDR correction
    print("Number of eSTRs tested against added SNP models", to_correct.shape[0])
    # NOTE(review): whether a header line is written depends on the pandas
    # version's Series.to_csv default -- confirm what fdr_correct.r expects.
    to_correct['anova_pval'].astype(float).to_csv('pvalues.txt', sep='\n', index=False)
    # check_call raises CalledProcessError when the script exits non-zero, so
    # a failed run cannot silently fall through to a stale qvalues.txt left
    # behind by a previous tissue. The script is expected to turn pvalues.txt
    # into qvalues.txt (it lives in the working directory).
    subprocess.check_call("./fdr_correct.r")

    #Add qvalues to dataframe
    # Values are assigned positionally: qvalues.txt rows are assumed to be in
    # the same order as the p-values written above -- TODO confirm in the script.
    Qval = pd.read_csv('qvalues.txt', sep=' ')
    to_correct['anova_qval'] = list(Qval['V1.1'])
    to_correct['significant'] = list(Qval['significant'])
    print(sum(to_correct['significant']), 'significant qvalues at 5% FDR', to_correct.loc[to_correct['anova_qval']<=0.05].shape)

    #Verify Delta AIC and delta BIC
    is_significant = to_correct['significant'] == 1
    aic_pass = to_correct.loc[is_significant & (to_correct['delta_aic'] > 0)]
    bic_pass = to_correct.loc[is_significant & (to_correct['delta_bic'] > 0)]
    PASS = to_correct.loc[is_significant & (to_correct['delta_aic'] > 0) & (to_correct['delta_bic'] > 0)]
    print("Out of ",aic_pass.shape[0],'AIC and ', bic_pass.shape[0],'BIC,', PASS.shape[0],' passed ANOVA successfully (', PASS.shape[0]*100/estrs.shape[0],')')

    # Outer merge keeps every ANOVA row; rows that were not corrected get NaN
    # in 'anova_qval'. The helper 'id' column is dropped before saving.
    tosave = anova.merge(to_correct[['id', 'anova_qval']], on='id', how='outer').drop(['id'], axis=1)
    print(tosave.loc[tosave['anova_qval']<0.05].shape)
    print('\n-------------------------------')

    # Remove the file with the misspelled name ("Adusted") if it exists.
    # Presumably a leftover from an earlier run that wrote under that name --
    # confirm. The original used os.command(), which does not exist and
    # raised AttributeError before the output below was ever written.
    stale_output = os.path.join(DATADIR, T, 'HH/Adusted_anova_wg.csv')
    if os.path.exists(stale_output):
        os.remove(stale_output)

    # to_csv overwrites any existing file, so the real output needs no cleanup.
    tosave.to_csv(os.path.join(DATADIR, T, 'HH/Adjusted_anova_wg.csv'), sep='\t', index=False)

Adipose-Subcutaneous
(4424, 15)
(1697, 15) *- after NA values out...  (87891, 13) (1789, 13) (4424, 14) 32.92342090553382
Number of eSTRs tested against added SNP models 1697
514 significant qvalues at 5% FDR (514, 17)
Out of  514 AIC and  457 BIC, 457  passed ANOVA successfully ( 25.544997205142536 )
(514, 14)

-------------------------------
Adipose-Visceral
(1936, 15)
(821, 15) *- after NA values out...  (87912, 13) (888, 13) (1936, 14) 30.18018018018018
Number of eSTRs tested against added SNP models 821
193 significant qvalues at 5% FDR (193, 17)
Out of  193 AIC and  193 BIC, 193  passed ANOVA successfully ( 21.734234234234233 )
(193, 14)

-------------------------------
Artery-Aorta
(2517, 15)
(1055, 15) *- after NA values out...  (86353, 13) (1139, 13) (2517, 14) 29.41176470588235
Number of eSTRs tested against added SNP models 1055
261 significant qvalues at 5% FDR (261, 17)
Out of  261 AIC and  259 BIC, 259  passed ANOVA successfully ( 22.739244951712028 )
(261, 14)

---------