In [1]:
# Libraries
import pandas as pd
import numpy as np
import vcf
#

#    "Thyroid": "green",
SHORTEN = {
    "Artery-Aorta":"Artery.A"     ,
    "Artery-Tibial": "Artery.T",
    "Adipose-Subcutaneous": "Adipose.S",    
    "Adipose-Visceral(Omentum)":"Adipose.V",
    "Brain-Caudate(basalganglia)":"Caudate"   , 
    "Brain-Cerebellum":"Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "Mucosa",
    "Esophagus-Muscularis":"Muscularis",
    "Heart-LeftVentricle":"Heart",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial":"Nerve",
    "Skin-NotSunExposed(Suprapubic)": "SkinUnexposed",
    "Skin-SunExposed(Lowerleg)":"SkinLeg",
    "Thyroid":"Thyroid",
    "WholeBlood": "Blood",
    "permuted":"Permuted",
}

GENE = '/storage/szfeupe/Runs/GTEx_estr/FEATURES/Genes_only_table'
PATH= "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"
TISSUES = sorted([item for item in list(SHORTEN.keys()) if item != "permuted"])
motif='/storage/resources/dbase/human/hg19/hg19.hipstr_reference_withmotif.bed'

In [2]:
#Linear regression QVALUES
#
GENES = pd.read_csv(GENE, sep='\t')
col=['chrom','str.start','str.id','gene']
SUMMARY = pd.read_csv(PATH +'WholeBlood/PQValues', sep = '\t')[col]
estrs = []
SUM = {}
#Merging by tissues 
for T in TISSUES:
    table = pd.read_csv(PATH +T+'/PQValues', sep = '\t')
    table['eSTR.'+SHORTEN[T]+'.ll'] = np.where(table['llqvalue']<=0.1, 1, 0)  #10%fdr at locus level
    table['eSTR.'+SHORTEN[T]] = np.where(table['qvalue']<=0.1, 1, 0)
    table['qval.'+SHORTEN[T]] = table['qvalue'].astype(float)
    table['llqval.'+SHORTEN[T]] = table['llqvalue'].astype(float)
    table['beta.'+SHORTEN[T]] = table['beta'].astype(float)
    table = table[col + ['eSTR.'+SHORTEN[T], 'eSTR.'+SHORTEN[T]+'.ll', 'qval.'+SHORTEN[T],'llqval.'+SHORTEN[T],'beta.'+SHORTEN[T]]]
    table['str.start'].astype(int)
    table['chrom'].astype(str)
    #estrs.append('eSTR.'+SHORTEN[T])
    SUM[T]=table
    table['eSTR.'+SHORTEN[T]] = table['eSTR.'+SHORTEN[T]].astype(int)
    SUMMARY = pd.merge(SUMMARY,table[col], on=col, how='outer').drop_duplicates().reset_index(drop=True)
    #print(table.shape[0],'\t',SUMMARY.shape,'\t',table.loc[table['eSTR.'+SHORTEN[T]]==1].shape[0],'\t', T)

SUMMARY = SUMMARY.sort_values(['chrom','gene', 'str.start']).reset_index(drop=True)
SUMMARY['E.tissues']=[0]*SUMMARY.shape[0]
SUMMARY['ll.E.tissues']=[0]*SUMMARY.shape[0]
for T in TISSUES :
    SUMMARY = pd.merge(SUMMARY.fillna(0), SUM[T], how='left',on=col)
    SUMMARY['E.tissues'] = SUMMARY['E.tissues'].fillna(0) + SUMMARY['eSTR.'+SHORTEN[T]]
    SUMMARY['ll.E.tissues'] = SUMMARY['ll.E.tissues'].fillna(0) + SUMMARY['eSTR.'+SHORTEN[T]+'.ll']


print('Number of egenes... ',SUMMARY.loc[SUMMARY['E.tissues']>=1].shape)
print('Number of egenes by locus... ',SUMMARY.loc[SUMMARY['ll.E.tissues']>=1].shape)
print('\nNbr of eGenes in more than 1 tissues ',SUMMARY.loc[SUMMARY['E.tissues']>=2].shape)
print('Nbr of eGenes by locus in more than 1 tissues ',SUMMARY.loc[SUMMARY['ll.E.tissues']>=2].shape)
print('\negenes in at least 10 tissues ',SUMMARY.loc[SUMMARY['E.tissues']>=10].shape)
print('egenes in at least 10 tissues (locus)',SUMMARY.loc[SUMMARY['ll.E.tissues']>=10].shape)

add_info = pd.read_csv(motif, sep='\t', header=None)
add_info.columns=['chrom','str.start','str.end','period','motif']
print(add_info.shape)

SUMMARY = pd.merge(SUMMARY, add_info, on=['chrom','str.start'], how='inner')

e_homopolymers = SUMMARY.loc[(SUMMARY['E.tissues']>=1) & (SUMMARY['period']==1)]

print(SUMMARY.shape, e_homopolymers.shape)

SUMMARY.to_csv('/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/LR_SummaryTest_Table.tsv', sep='\t', index=False)


Number of egenes...  (9446, 91)
Number of egenes by locus...  (19566, 91)

Nbr of eGenes in more than 1 tissues  (3663, 91)
Nbr of eGenes by locus in more than 1 tissues  (8768, 91)

egenes in at least 10 tissues  (319, 91)
egenes in at least 10 tissues (locus) (1090, 91)
(1620037, 5)
(98671, 94) (3416, 94)


In [5]:
#Number of eSTRs by STR unit size
sumtable = {}
sumtable['Unit.size'] = [x for x in range(1,7,1)]
sumtable['all.tissues'] = [ SUMMARY.loc[(SUMMARY['E.tissues']>=1) & (SUMMARY['period']==i)].shape[0] for i in range(1,7,1)]

for T in TISSUES:
    sumtable[SHORTEN[T]] = [ SUMMARY.loc[(SUMMARY['eSTR.'+SHORTEN[T]]>=1) & (SUMMARY['period']==i)].shape[0] for i in range(1,7,1)]
    
estr_by_unit = pd.DataFrame(sumtable)[['Unit.size', 'all.tissues']+ [SHORTEN[T] for T in TISSUES]]
estr_by_unit.index = list(estr_by_unit['Unit.size'])
estr_by_unit['Unit.size']

estr_by_unit

Unnamed: 0,Unit.size,all.tissues,Adipose.S,Adipose.V,Artery.A,Artery.T,Caudate,Cerebellum,Fibroblast,Mucosa,Muscularis,Heart,Lung,Muscle,Nerve,SkinUnexposed,SkinLeg,Thyroid,Blood
1,1,3416,647,328,437,610,97,217,532,571,501,302,530,601,788,405,712,733,476
2,2,2907,531,275,355,521,73,214,455,506,438,230,425,479,682,343,583,621,344
3,3,594,135,60,71,105,19,36,117,120,98,54,96,107,145,80,140,135,86
4,4,1767,324,157,175,318,42,118,283,329,228,158,242,321,402,184,369,379,220
5,5,508,107,46,67,93,16,47,88,95,72,45,75,106,128,68,104,114,78
6,6,254,45,22,34,42,4,11,43,59,37,20,44,51,46,21,47,51,39


In [14]:
E = SUMMARY.loc[SUMMARY['E.tissues']>=1]
U = SUMMARY.loc[SUMMARY['E.tissues']==1]
S = SUMMARY.loc[SUMMARY['E.tissues']>1]
E1 = E.loc[E['period']==1]
print('Total # eSTRs',E.shape[0],'\nUnique to tissues', U.shape[0],'\nShared eSTRs', S.shape[0])
H = U.loc[U['period']==1]; S1 = S.loc[S['period']==1]
print('Total # of Unique to tissue eSTRs',U.shape[0],'\nNumber of homopolymers ', H.shape[0],'\n Shared homopolymers eSTRs', S1.shape[0])

E1.shape

Total # eSTRs 9446 
Unique to tissues 5783 
Shared eSTRs 3663
Total # of Unique to tissue eSTRs 5783 
Number of homopolymers  2089 
 Shared homopolymers eSTRs 1327


(3416, 94)

In [15]:
100*3416/9446

36.16345543087021

In [None]:
##This code summarises the Anova results and prepare file for anovapvalue qqplot 
##
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as ss

Tissue="WholeBlood"
##delta_bic = snp_bic - snpstr_bic        ##delta_aic = snp_aic - snpstr_aic 
##              the lower tht aic or bic the closer the data is to the model
print('Summary for '+Tissue+' ...\n')

#FDR 
pval = pd.read_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/PQValues.txt", sep='\t')
pval = pval.loc[pval['qvalue']<=0.05]  ##eSTRs at 10% FDR
print(pval.shape[0],' eSTRs at 10% FDR')

#Anova output 
hh = pd.read_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/HH/Anova_wg_old.csv", sep=',')
hh1 = hh.loc[hh['gene'].isin(pval['gene'])] 
print(hh1.shape[0], ' eSTRs and their anova p-values')

hh1['st1']=[0]*len(hh1)
hh1['st2']=[0]*len(hh1)
hh1['st1'][hh1['delta_aic']>0] = 1
hh1['st2'][hh1['delta_bic']>0] = 2
hh1['strprefer']=hh1['st1']+hh1['st2']


hh1.to_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/HH/ESTRs_Anova", sep='\t', index=False)

hh1