In [6]:
import pandas as pd
import numpy as np
import subprocess
import collections
import os

## FDR correction After gene level adjustment
# Short display label for each GTEx tissue directory handled by this notebook.
# Keys are the per-tissue sub-directory names found under `path`.
SHORTEN = {
    "Artery-Aorta": "Artery A.",
    "Artery-Tibial": "Artery T.",
    "Adipose-Subcutaneous": "Adipose S.",
    "Adipose-Visceral": "Adipose V.",
    "Brain-Caudate": "Caudate",
    "Brain-Cerebellum": "Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "E. Mucosa",
    "Esophagus-Muscularis": "E Muscularis",
    "Heart-LeftVentricle": "Ventricule",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial": "Nerve",
    "Skin-NotSunExposed": "Skin Unexposed",
    "Skin-SunExposed": "Skin Leg",
    "Thyroid": "Thyroid",
    "WholeBlood": "Blood",
}

# Tissue names in alphabetical order; this is the processing order.
tissu = sorted(SHORTEN)

# Root directory holding one sub-directory per tissue.
# Alternative root kept for reference:
#path = "/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"
path = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"

# External R script that performs the FDR correction, and the table it writes.
_FDR_SCRIPT = "/home/szfeupe/projects/GTEX_eSTRs/gtex-estrs/Scripts/PostRegressionFixes/fdr-correct.r"
_FDR_QVALUES = "/home/szfeupe/projects/GTEX_eSTRs/gtex-estrs/Scripts/PostRegressionFixes/qvalues.txt"


def _fdr_correct(pvalues):
    """Run the external FDR script on *pvalues* and return its q-value table.

    The p-values are written one per line to ``pvalues.txt`` in the current
    working directory, the R script is executed, and the resulting
    ``qvalues.txt`` is loaded. Callers read the ``qvalue`` and
    ``significant`` columns of the returned frame.

    Raises:
        subprocess.CalledProcessError: if the R script exits with a non-zero
            status. Failing here avoids silently reading a stale
            ``qvalues.txt`` left over from an earlier run.
    """
    # NOTE(review): the script presumably reads ./pvalues.txt, so the working
    # directory matters -- confirm against fdr-correct.r.
    # NOTE(review): whether Series.to_csv emits a header line depends on the
    # pandas version; confirm which form fdr-correct.r expects.
    pvalues.to_csv('pvalues.txt', sep='\n', index=False)
    subprocess.check_call(_FDR_SCRIPT)
    return pd.read_csv(_FDR_QVALUES, sep=' ')


def fdrcorrection(tissue):
    """Compute locus-level and gene-level FDR q-values for one tissue.

    Reads ``<path>/<tissue>/Lin_Reg_Out`` (tab separated; the columns gene,
    chrom, str.id, str.start, beta, beta.se and p.wald are used) and writes
    ``<path>/<tissue>/PQValues`` (tab separated). An existing ``PQValues``
    file is first renamed to ``old_PQValues``.

    Steps:
      1. Locus level: FDR-correct every ``p.wald`` -> ``llqvalue``/``llsignif``.
      2. Gene level: keep the smallest ``p.wald`` per gene, multiply it by the
         number of tests for that gene (``NTEST``), cap the product at 1, and
         FDR-correct the adjusted values -> ``qvalue``/``significant``.
      3. Left-merge the gene-level columns back onto the locus table, so only
         the top row of each gene carries gene-level values.

    Args:
        tissue: name of the tissue sub-directory under the module-level
            ``path``.

    Returns:
        An empty tuple (kept for compatibility with the original cell).

    Raises:
        subprocess.CalledProcessError: if the FDR R script fails.
    """
    print(tissue, ' variants ...')
    tissue_dir = path + tissue

    # Keep the previous result as a backup; nothing to do on a first run.
    previous = tissue_dir + '/PQValues'
    if os.path.exists(previous):
        os.replace(previous, tissue_dir + '/old_PQValues')

    # Linear regression output for the STRs of this tissue.
    # `sep` is passed by keyword: newer pandas no longer accepts it positionally.
    LR1 = pd.read_csv(tissue_dir + "/Lin_Reg_Out", sep='\t')

    # Locus level
    Qval = _fdr_correct(LR1['p.wald'])
    LR1['llqvalue'] = Qval['qvalue'].values
    LR1['llsignif'] = Qval['significant'].values

    # Gene level: most significant variant per gene.
    # NOTE(review): groupby(...).first() takes the first non-null value per
    # column, so a row could mix values if the top row has missing entries.
    LR0 = LR1.sort_values("p.wald").groupby("gene", as_index=False).first()
    print(LR1.shape, '  to  ', LR0.shape)

    # Number of tests (rows) per gene, in the row order of LR0.
    tests_per_gene = LR1.groupby("gene").size()
    LR0['NTEST'] = tests_per_gene.loc[LR0['gene']].values

    # Gene level adjustment: min p-value * number of tests, capped at 1.
    # clip() replaces the chained assignment that raised SettingWithCopyWarning.
    LR0['AD.pval'] = (LR0['p.wald'] * LR0['NTEST']).clip(upper=1)

    # FDR-correct the adjusted p-values and attach the result.
    Qval = _fdr_correct(LR0['AD.pval'])
    LR0['qvalue'] = Qval['qvalue'].values
    LR0['significant'] = Qval['significant'].values

    # Merging: the gene-level columns land only on each gene's top variant.
    merging = ['gene', 'chrom', 'str.id', 'str.start', 'beta', 'beta.se',
               'p.wald', 'llqvalue', 'llsignif']
    LRP = pd.merge(LR1, LR0, on=merging, how='left')

    # Header arrangement
    Head = ['gene', 'chrom', 'str.id', 'str.start', 'p.wald', 'llqvalue',
            'llsignif', 'NTEST', 'qvalue', 'significant', 'beta', 'beta.se']
    Out = LRP[Head]
    Out.to_csv(tissue_dir + '/PQValues', sep='\t', index=False)

    S = LR0['AD.pval']
    print(len(S), ' total tests... ', len(S[S >= 1]), ' pvalues were reduced to 1')
    print(len(LRP[LRP['qvalue'] <= 0.1]), '\t gene level qval<=0.1')
    print(len(LRP[LRP['llqvalue'] <= 0.1]), '\t locus level qval<=0.1')
    print(len(LRP[LRP['llqvalue'] <= 0.01]), '\t qval<0.01\n')
    return ()
#
#
# Run the FDR correction for every tissue listed in `tissu`, one at a time.
# Each call rewrites that tissue's PQValues file and prints summary counts.
for T in tissu:                     
    fdrcorrection(T)
    

####   After this Script, plot the qqplot and then FDR correction below    
####   Next: We run the code to prepare for heritability analysis (STR+SNP)

Adipose-Subcutaneous  variants ...
(87891, 14)   to   (14712, 14)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14712  total tests...  2872  pvalues were reduced to 1
1789 	 gene level qval<=0.1
4516 	 locus level qval<=0.1
2027 	 qval<0.01

Adipose-Visceral  variants ...
(87912, 14)   to   (14803, 14)
14803  total tests...  3278  pvalues were reduced to 1
888 	 gene level qval<=0.1
2017 	 locus level qval<=0.1
796 	 qval<0.01

Artery-Aorta  variants ...
(86353, 14)   to   (14609, 14)
14609  total tests...  3076  pvalues were reduced to 1
1139 	 gene level qval<=0.1
2617 	 locus level qval<=0.1
1118 	 qval<0.01

Artery-Tibial  variants ...
(85366, 14)   to   (14372, 14)
14372  total tests...  2722  pvalues were reduced to 1
1689 	 gene level qval<=0.1
4063 	 locus level qval<=0.1
1780 	 qval<0.01

Brain-Caudate  variants ...
(87076, 14)   to   (14939, 14)
14939  total tests...  3680  pvalues were reduced to 1
251 	 gene level qval<=0.1
488 	 locus level qval<=0.1
0 	 qval<0.01

Brain-Cerebellum  variants ...
(87179, 14)   to   (14858, 14)
14858  total tests...  3391  pvalues were reduced to 1
64

WholeBlood  variants ...
(209131, 12)   to   (14845, 12)
14845


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


14845  total tests...  5015  pvalues were reduced to 1

630 	 qval<=0.1
462 	 qval<=0.05
261 	 qval<0.01

Cells-Transformedfibroblasts  variants ...
(216353, 12)   to   (15362, 12)
15362
15362  total tests...  4885  pvalues were reduced to 1

955 	 qval<=0.1
703 	 qval<=0.05
395 	 qval<0.01

Muscle-Skeletal  variants ...
(215860, 12)   to   (15275, 12)
15275
15275  total tests...  5546  pvalues were reduced to 1

304 	 qval<=0.1
205 	 qval<=0.05
0 	 qval<0.01

Lung  variants ...
(225784, 12)   to   (15711, 12)
15711
15711  total tests...  5673  pvalues were reduced to 1

378 	 qval<=0.1
264 	 qval<=0.05
0 	 qval<0.01

Adipose-Subcutaneous  variants ...
(225770, 12)   to   (15918, 12)
15918
15918  total tests...  5751  pvalues were reduced to 1

376 	 qval<=0.1
287 	 qval<=0.05
115 	 qval<0.01

Artery-Tibial  variants ...
(221263, 12)   to   (15584, 12)
15584
15584  total tests...  5360  pvalues were reduced to 1

524 	 qval<=0.1
383 	 qval<=0.05
196 	 qval<0.01

Esophagus-Mucosa  varia