In [2]:
import math
import numpy as np
import os
import pandas as pd
import subprocess

"""
Correct the ANOVA p-values for multiple testing.
We restrict the analysis to eSTRs because we only want
to compare eSTRs with nearby eSNPs.

"""
# Root directory holding one sub-directory per tissue.
DATADIR = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue'

# Tissue directory name -> short display label.
SHORTEN = {
    "Artery-Aorta": "Artery.A",
    "Artery-Tibial": "Artery.T",
    "Adipose-Subcutaneous": "Adipose.S",
    "Adipose-Visceral": "Adipose.V",
    "Brain-Caudate": "Caudate",
    "Brain-Cerebellum": "Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "Mucosa",
    "Esophagus-Muscularis": "Muscularis",
    "Heart-LeftVentricle": "Ventricule",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial": "Nerve.T",
    "Skin-NotSunExposed": "SkinUnexposed",
    "Skin-SunExposed": "SkinLeg",
    "Thyroid": "Thyroid",
    "WholeBlood": "Blood",
}

# Tissue names in alphabetical order; the cells below iterate over this list.
tissues = sorted(SHORTEN)

# Columns kept from every per-chromosome ANOVA table.
header = [
    "chrom", "gene", "str.start", "numsnps", "numsamples",
    "r2_str", "r2_snp", "r2_snpstr", "anova_pval",
    "estr_fdr", "esnp_fdr", "delta_bic", "delta_aic",
]

In [None]:
# Merge the per-chromosome ANOVA tables of every tissue into a single
# whole-genome file (HH/anova_wg.csv), then move the per-chromosome inputs
# into HH/intermediate_files_anova/.
chrom = list(range(1, 23)) + ['X']

# A tissue is merged only when at least this many chromosome tables could be
# loaded (the original condition was `len(frames) > 18`).
MIN_CHROM_FILES = 19

for T in tissues:
    hh_dir = os.path.join(DATADIR, T, 'HH')
    frames = []
    skipped = []  # chromosomes whose table could not be loaded
    for C in chrom:
        chrom_file = os.path.join(hh_dir, 'anova_ch%s' % C)
        try:
            frames.append(pd.read_csv(chrom_file)[header])
        except (OSError, KeyError, ValueError):
            # Missing/unreadable file (OSError), missing expected column
            # (KeyError) or empty/malformed table (pandas parser errors are
            # ValueError subclasses): skip this chromosome but keep track of it
            # instead of silently swallowing every possible exception.
            skipped.append(C)

    if len(frames) < MIN_CHROM_FILES:
        print(T, 'skipped; chromosomes not loaded:', skipped)
        continue

    # Every frame was already restricted to `header` when it was read.
    data = pd.concat(frames)
    data.to_csv(os.path.join(hh_dir, 'anova_wg.csv'), index=False)

    # Move the per-chromosome files out of the way without going through a
    # shell. makedirs(exist_ok=True) keeps the cell re-runnable; os.replace
    # overwrites an existing destination file, as `mv` did.
    dest_dir = os.path.join(hh_dir, 'intermediate_files_anova')
    os.makedirs(dest_dir, exist_ok=True)
    for name in os.listdir(hh_dir):
        if name.startswith('anova_ch'):
            os.replace(os.path.join(hh_dir, name), os.path.join(dest_dir, name))

    if skipped:
        print(T, 'chromosomes not loaded:', skipped)
    print(T, data.shape)
    

In [3]:
# FDR correction of the ANOVA p-values, one tissue at a time.
# Only rows of HH/anova_wg.csv whose gene/STR id also appears in PQValues
# with qvalue < 0.1 are corrected. The result is written to
# HH/ANOVA_pq_wg.csv (tab separated).
for T in tissues:

    print(T)
    anova = pd.read_csv(os.path.join(DATADIR, T, 'HH/anova_wg.csv'))
    anova['id'] = anova['gene'] + '_' + anova['str.start'].astype(int).astype(str)
    linreg = pd.read_csv(os.path.join(DATADIR, T, 'PQValues'), sep='\t')
    linreg = linreg.loc[linreg['qvalue'] < 0.1].copy()
    linreg['id'] = linreg['gene'] + '_' + linreg['str.start'].astype(int).astype(str)
    print(linreg.shape, anova.shape)

    # Keep only the ANOVA rows that match a retained PQValues row, then drop
    # rows whose p-value is NaN or infinite.
    to_correct = anova.merge(linreg[['id', 'beta']], on='id', how='inner')
    to_correct = to_correct[np.isfinite(to_correct['anova_pval'])].copy()
    print(to_correct.shape, '*- after NA values out... ', to_correct.loc[to_correct['anova_pval']<0.05].shape[0])

    # FDR correction is delegated to ./fdr_correct.r (run from the current
    # working directory). Presumably it reads pvalues.txt and writes
    # qvalues.txt -- TODO confirm against the script.
    to_correct['anova_pval'].astype(float).to_csv('pvalues.txt', sep='\n', index=False)
    # check_call raises CalledProcessError if the script fails, so a stale
    # qvalues.txt left by a previous tissue is never picked up silently.
    subprocess.check_call(["./fdr_correct.r"])

    # Add q-values to the dataframe. The assignment is positional, so the
    # script output must have exactly one row per submitted p-value.
    Qval = pd.read_csv('qvalues.txt', sep=' ')
    if len(Qval) != len(to_correct):
        raise ValueError('qvalues.txt has %d rows but %d p-values were submitted for %s'
                         % (len(Qval), len(to_correct), T))
    to_correct['anova_qval'] = Qval['qvalue'].values
    to_correct['significant'] = Qval['significant'].values
    print(sum(to_correct['significant']), 'significant qvalues at 5% FDR', to_correct.loc[to_correct['anova_qval']<0.05].shape)

    # Verify delta AIC and delta BIC among the significant rows.
    is_significant = to_correct['significant'] == 1
    aic_pass = to_correct.loc[is_significant & (to_correct['delta_aic'] > 0)]
    bic_pass = to_correct.loc[is_significant & (to_correct['delta_bic'] > 0)]
    PASS = to_correct.loc[is_significant & (to_correct['delta_aic'] > 0) & (to_correct['delta_bic'] > 0)]
    # The percentage is relative to all rows of anova_wg.csv for this tissue.
    print("Out of ",aic_pass.shape[0],'AIC and ', bic_pass.shape[0],'BIC,', PASS.shape[0],' passed ANOVA successfully (', PASS.shape[0]*100/anova.shape[0],')')

    tosave = to_correct.drop(['id'], axis=1)
    print(tosave.loc[tosave['anova_qval']<0.05].shape)
    print('\n-------------------------------')

    # Remove the output of an earlier version of this analysis, if present.
    # The file name is spelled exactly as in the original `rm` command.
    stale_output = os.path.join(DATADIR, T, 'HH', 'Adusted_anova_wg.csv')
    if os.path.exists(stale_output):
        os.remove(stale_output)
    tosave.to_csv(os.path.join(DATADIR, T, 'HH/ANOVA_pq_wg.csv'), sep='\t', index=False)

# Preview of the last tissue processed.
tosave.head(10)

Adipose-Subcutaneous
(3596, 12) (3454, 14)
(3430, 15) *- after NA values out...  1232
1097 significant qvalues at 5% FDR (1097, 17)
Out of  1097 AIC and  972 BIC, 972  passed ANOVA successfully ( 28.14128546612623 )
(1097, 16)

-------------------------------
Adipose-Visceral
(1842, 12) (1751, 14)
(1729, 15) *- after NA values out...  547
384 significant qvalues at 5% FDR (384, 17)
Out of  384 AIC and  384 BIC, 384  passed ANOVA successfully ( 21.93032552826956 )
(384, 16)

-------------------------------
Artery-Aorta
(2401, 12) (2230, 14)
(2207, 15) *- after NA values out...  722
596 significant qvalues at 5% FDR (596, 17)
Out of  596 AIC and  575 BIC, 575  passed ANOVA successfully ( 25.7847533632287 )
(596, 16)

-------------------------------
Artery-Tibial
(3462, 12) (3298, 14)
(3265, 15) *- after NA values out...  1200
1054 significant qvalues at 5% FDR (1054, 17)
Out of  1054 AIC and  926 BIC, 926  passed ANOVA successfully ( 28.077622801698 )
(1054, 16)

------------------------

Unnamed: 0,chrom,gene,str.start,numsnps,numsamples,r2_str,r2_snp,r2_snpstr,anova_pval,estr_fdr,esnp_fdr,delta_bic,delta_aic,beta,anova_qval,significant
0,chr1,ENSG00000143196.4,168730582,1,328,0.037249,0.037622,0.037777,8.191888e-01,0.056024,1.000000,-5.740197,-1.947184,-0.193000,0.452692,0
1,chr1,ENSG00000160714.5,154499369,1,325,0.041770,0.046213,0.048625,3.669618e-01,0.028920,0.088735,-4.961039,-1.177214,-0.204378,0.280934,0
2,chr1,ENSG00000236624.4,45999774,1,330,0.419333,0.476407,0.476414,9.457413e-01,0.001319,0.000829,-5.794411,-1.995319,-0.647559,0.488096,0
3,chr1,ENSG00000154027.14,77945805,1,316,0.043832,0.107748,0.113690,1.484633e-01,0.055467,0.000829,-3.644393,0.111350,0.206310,0.149532,0
4,chr1,ENSG00000168528.7,31886001,1,321,0.152349,0.158396,0.224034,3.822788e-07,0.001319,0.000829,20.294245,24.065686,0.390319,0.002409,1
5,chr1,ENSG00000158864.8,161165713,1,310,0.068974,0.062081,0.082852,8.794982e-03,0.001512,0.018751,1.205771,4.942344,0.262630,0.018387,1
6,chr1,ENSG00000162627.12,99136427,1,324,0.040755,0.038576,0.041629,3.126128e-01,0.027991,0.009824,-4.750006,-0.969262,0.209661,0.252206,0
7,chr1,ENSG00000060642.6,27040890,1,334,0.072455,0.077606,0.082671,1.773360e-01,0.001319,0.000838,-3.972064,-0.160923,-0.266503,0.170234,0
8,chr1,ENSG00000163485.11,203062735,1,333,0.067745,0.090198,0.093655,2.627153e-01,0.001359,0.000829,-4.540429,-0.732286,0.260278,0.223615,0
9,chr1,ENSG00000154222.10,52795695,1,332,0.104896,0.142659,0.142766,8.395999e-01,0.001319,0.000829,-5.763731,-1.958596,0.330824,0.459238,0


In [None]:
Adipose-Subcutaneous
(4424, 15)
(1697, 15) *- after NA values out...  (87891, 13) (1789, 13) (4424, 14) 32.92342090553382
Number of eSTRs tested against added SNP models 1697
514 significant qvalues at 5% FDR (514, 17)
Out of  514 AIC and  457 BIC, 457  passed ANOVA successfully ( 25.544997205142536 )
(514, 14)

-------------------------------
Adipose-Visceral
(1936, 15)
(821, 15) *- after NA values out...  (87912, 13) (888, 13) (1936, 14) 30.18018018018018
Number of eSTRs tested against added SNP models 821
193 significant qvalues at 5% FDR (193, 17)
Out of  193 AIC and  193 BIC, 193  passed ANOVA successfully ( 21.734234234234233 )
(193, 14)

-------------------------------
Artery-Aorta
(2517, 15)
(1055, 15) *- after NA values out...  (86353, 13) (1139, 13) (2517, 14) 29.41176470588235
Number of eSTRs tested against added SNP models 1055
261 significant qvalues at 5% FDR (261, 17)
Out of  261 AIC and  259 BIC, 259  passed ANOVA successfully ( 22.739244951712028 )
(261, 14)

-------------------------------
Artery-Tibial
(3963, 15)
(1579, 15) *- after NA values out...  (85366, 13) (1689, 13) (3963, 14) 36.17525162818236
Number of eSTRs tested against added SNP models 1579
594 significant qvalues at 5% FDR (594, 17)
Out of  594 AIC and  459 BIC, 459  passed ANOVA successfully ( 27.175843694493782 )
(594, 14)

-------------------------------
Brain-Caudate(basalganglia)
(488, 15)
(236, 15) *- after NA values out...  (87076, 13) (251, 13) (488, 14) 27.888446215139442
Number of eSTRs tested against added SNP models 236
30 significant qvalues at 5% FDR (30, 17)
Out of  30 AIC and  30 BIC, 30  passed ANOVA successfully ( 11.952191235059761 )
(30, 14)

-------------------------------
Brain-Cerebellum
(1340, 15)
(610, 15) *- after NA values out...  (87179, 13) (643, 13) (1340, 14) 24.57231726283048
Number of eSTRs tested against added SNP models 610
96 significant qvalues at 5% FDR (96, 17)
Out of  96 AIC and  96 BIC, 96  passed ANOVA successfully ( 14.930015552099533 )
(96, 14)

-------------------------------
Cells-Transformedfibroblasts
(3566, 15)
(1475, 15) *- after NA values out...  (83075, 13) (1518, 13) (3566, 14) 33.39920948616601
Number of eSTRs tested against added SNP models 1475
446 significant qvalues at 5% FDR (446, 17)
Out of  446 AIC and  377 BIC, 377  passed ANOVA successfully ( 24.835309617918313 )
(446, 14)

-------------------------------
Esophagus-Mucosa
(4338, 15)
(1614, 15) *- after NA values out...  (90840, 13) (1680, 13) (4338, 14) 33.392857142857146
Number of eSTRs tested against added SNP models 1614
491 significant qvalues at 5% FDR (491, 17)
Out of  491 AIC and  425 BIC, 425  passed ANOVA successfully ( 25.297619047619047 )
(491, 14)

-------------------------------
Esophagus-Muscularis
(3312, 15)
(1344, 15) *- after NA values out...  (87608, 13) (1374, 13) (3312, 14) 34.643377001455605
Number of eSTRs tested against added SNP models 1344
439 significant qvalues at 5% FDR (439, 17)
Out of  439 AIC and  382 BIC, 382  passed ANOVA successfully ( 27.802037845705968 )
(439, 14)

-------------------------------
Heart-LeftVentricle
(1840, 15)
(778, 15) *- after NA values out...  (84416, 13) (809, 13) (1840, 14) 30.65512978986403
Number of eSTRs tested against added SNP models 778
221 significant qvalues at 5% FDR (221, 17)
Out of  221 AIC and  209 BIC, 209  passed ANOVA successfully ( 25.834363411619282 )
(221, 14)

-------------------------------
Lung
(3457, 15)
(1373, 15) *- after NA values out...  (89448, 13) (1412, 13) (3457, 14) 31.86968838526912
Number of eSTRs tested against added SNP models 1373
360 significant qvalues at 5% FDR (360, 17)
Out of  360 AIC and  347 BIC, 347  passed ANOVA successfully ( 24.575070821529746 )
(360, 14)

-------------------------------
Muscle-Skeletal
(4156, 15)
(1623, 15) *- after NA values out...  (84241, 13) (1665, 13) (4156, 14) 33.273273273273276
Number of eSTRs tested against added SNP models 1623
510 significant qvalues at 5% FDR (510, 17)
Out of  510 AIC and  415 BIC, 415  passed ANOVA successfully ( 24.924924924924923 )
(510, 14)

-------------------------------
Nerve-Tibial
(5583, 15)
(2139, 15) *- after NA values out...  (88752, 13) (2191, 13) (5583, 14) 35.280693747147424
Number of eSTRs tested against added SNP models 2139
704 significant qvalues at 5% FDR (704, 17)
Out of  704 AIC and  611 BIC, 611  passed ANOVA successfully ( 27.886809675947056 )
(704, 14)

-------------------------------
Skin-NotSunExposed(Suprapubic)
(2473, 15)
(1038, 15) *- after NA values out...  (89011, 13) (1101, 13) (2473, 14) 30.517711171662125
Number of eSTRs tested against added SNP models 1038
271 significant qvalues at 5% FDR (271, 17)
Out of  271 AIC and  262 BIC, 262  passed ANOVA successfully ( 23.79654859218892 )
(271, 14)

-------------------------------
Skin-SunExposed(Lowerleg)
(4578, 15)
(1823, 15) *- after NA values out...  (89048, 13) (1955, 13) (4578, 14) 31.56010230179028
Number of eSTRs tested against added SNP models 1823
468 significant qvalues at 5% FDR (468, 17)
Out of  468 AIC and  443 BIC, 443  passed ANOVA successfully ( 22.65984654731458 )
(468, 14)

-------------------------------
Thyroid
(5111, 15)
(1982, 15) *- after NA values out...  (89139, 13) (2033, 13) (5111, 14) 34.97294638465322
Number of eSTRs tested against added SNP models 1982
650 significant qvalues at 5% FDR (650, 17)
Out of  650 AIC and  559 BIC, 559  passed ANOVA successfully ( 27.49631087063453 )
(650, 14)

-------------------------------
WholeBlood
(3034, 15)
(1210, 15) *- after NA values out...  (81939, 13) (1243, 13) (3034, 14) 34.43282381335479
Number of eSTRs tested against added SNP models 1210
386 significant qvalues at 5% FDR (386, 17)
Out of  386 AIC and  317 BIC, 317  passed ANOVA successfully ( 25.502815768302494 )
(386, 14)

-------------------------------

In [37]:
    '/storage/resources/datasets/gtex/53844/PhenoGenotypeFiles/RootStudyConsentSet_phs000424.GTEx.v6.p1.c1.GRU/'                                   
# NOTE(review): the indented line above is a bare string expression that is
# never assigned or used; it looks like a path kept as a note -- confirm it can
# be turned into a comment.

# NOTE(review): `wb1` is not assigned by any live code visible in this notebook.
# The commented-out lines below look like the way it was built (rows of the
# gtex650.pca table whose column 0 appears in the index of the WholeBlood
# expression table) -- TODO confirm and restore them so this cell runs on a
# fresh kernel.
#alls = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/gtex650.pca', sep=' ', header=None)
#wb = pd.read_csv('/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/WholeBlood/Corr_Expr.csv').index
#wb
#wb1=alls.loc[alls[0].isin(list(wb))].copy()
# Shape of the subset of wb1 whose column 11 equals 'Amerindian'. The trailing
# comment records the row counts the author noted for each value of column 11.
wb1.loc[wb1[11]=='Amerindian'].shape   #European 284 #AfricanAmerican 45  #Asian 3 # Unknown 2 # Amerindian 2


(2, 12)

In [34]:
# Distinct values found in column 11 of wb1.
set(wb1[11])

{'AfricanAmerican', 'Amerindian', 'Asian', 'European', 'Unknown'}