In [1]:
# Libraries
import pandas as pd
import numpy as np
import vcf
#
COLORS = {
    "Artery-Aorta":"salmon",
    "Artery-Tibial": "red",
    "Adipose-Subcutaneous": "darkorange",    
    "Adipose-Visceral(Omentum)":"orange",
    "Brain-Caudate(basalganglia)":"lemonchiffon"   , 
    "Brain-Cerebellum":"yellow",
    "Cells-Transformedfibroblasts": "skyblue",
    "Esophagus-Mucosa": "sienna",
    "Esophagus-Muscularis":"burlywood",
    "Heart-LeftVentricle":"darkviolet",
    "Lung": "greenyellow",
    "Muscle-Skeletal": "mediumslateblue",
    "Nerve-Tibial":"gold",
    "Skin-NotSunExposed(Suprapubic)":"blue",
    "Skin-SunExposed(Lowerleg)":"cornflowerblue",
    "Thyroid":"green",
    "WholeBlood": "m",
    "permuted": "gray"
}

#    "Thyroid": "green",
SHORTEN = {
    "Artery-Aorta":"Artery.A"     ,
    "Artery-Tibial": "Artery.T",
    "Adipose-Subcutaneous": "Adipose.S",    
    "Adipose-Visceral(Omentum)":"Adipose.V",
    "Brain-Caudate(basalganglia)":"Caudate"   , 
    "Brain-Cerebellum":"Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "Mucosa",
    "Esophagus-Muscularis":"Muscularis",
    "Heart-LeftVentricle":"Ventricule",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial":"Nerve",
    "Skin-NotSunExposed(Suprapubic)": "SkinUnexposed",
    "Skin-SunExposed(Lowerleg)":"SkinLeg",
    "Thyroid":"Thyroid",
    "WholeBlood": "Blood",
    "permuted":"Permuted",
    "LCL": "LCL"
}

GENE = '/storage/szfeupe/Runs/GTEx_estr/FEATURES/Genes_only_table'
PATH= "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"
TISSUES = sorted([item for item in list(COLORS.keys()) if item != "permuted"])
motif='/storage/resources/dbase/human/hg19/hg19.hipstr_reference_withmotif.bed'

In [4]:
#Linear regression QVALUES
#
GENES = pd.read_csv(GENE, sep='\t')
col=['chrom','str.start','str.id','gene']
SUMMARY = pd.read_csv(PATH +'WholeBlood/PQValues', sep = '\t')[col]
estrs = []
SUM = {}
#Merging by tissues 
for T in TISSUES:
    table = pd.read_csv(PATH +T+'/PQValues', sep = '\t')
    table['eSTR.'+SHORTEN[T]+'.ll'] = np.where(table['llqvalue']<=0.1, 1, 0)  #10%fdr at locus level
    table['eSTR.'+SHORTEN[T]] = np.where(table['qvalue']<=0.1, 1, 0)
    table['qval.'+SHORTEN[T]] = table['qvalue'].astype(float)
    table['llqval.'+SHORTEN[T]] = table['llqvalue'].astype(float)
    table['beta.'+SHORTEN[T]] = table['beta'].astype(float)
    table = table[col + ['eSTR.'+SHORTEN[T], 'eSTR.'+SHORTEN[T]+'.ll', 'qval.'+SHORTEN[T],'llqval.'+SHORTEN[T],'beta.'+SHORTEN[T]]]
    table['str.start'].astype(int)
    table['chrom'].astype(str)
    #estrs.append('eSTR.'+SHORTEN[T])
    SUM[T]=table
    table['eSTR.'+SHORTEN[T]] = table['eSTR.'+SHORTEN[T]].astype(int)
    SUMMARY = pd.merge(SUMMARY,table[col], on=col, how='outer').drop_duplicates().reset_index(drop=True)
    print(table.shape[0],'\t',SUMMARY.shape,'\t',table.loc[table['eSTR.'+SHORTEN[T]]==1].shape[0],'\t', T)

SUMMARY = SUMMARY.sort_values(['chrom','gene', 'str.start']).reset_index(drop=True)
SUMMARY['E.tissues']=[0]*SUMMARY.shape[0]
SUMMARY['ll.E.tissues']=[0]*SUMMARY.shape[0]
for T in TISSUES :
    SUMMARY = pd.merge(SUMMARY.fillna(0), SUM[T], how='left',on=col)
    SUMMARY['E.tissues'] = SUMMARY['E.tissues'].fillna(0) + SUMMARY['eSTR.'+SHORTEN[T]]
    SUMMARY['ll.E.tissues'] = SUMMARY['ll.E.tissues'].fillna(0) + SUMMARY['eSTR.'+SHORTEN[T]+'.ll']


print('Number of egenes... ',SUMMARY.loc[SUMMARY['E.tissues']>=1].shape)
print('Number of egenes by locus... ',SUMMARY.loc[SUMMARY['ll.E.tissues']>=1].shape)
print('\nNbr of eGenes in more than 1 tissues ',SUMMARY.loc[SUMMARY['E.tissues']>=2].shape)
print('Nbr of eGenes by locus in more than 1 tissues ',SUMMARY.loc[SUMMARY['ll.E.tissues']>=2].shape)
print('\negenes in at least 10 tissues ',SUMMARY.loc[SUMMARY['E.tissues']>=10].shape)
print('egenes in at least 10 tissues (locus)',SUMMARY.loc[SUMMARY['ll.E.tissues']>=10].shape)

add_info = pd.read_csv(motif, sep='\t', header=None)
add_info.columns=['chrom','str.start','str.end','period','motif']
print(add_info.shape)

SUMMARY = pd.merge(SUMMARY, add_info, on=['chrom','str.start'], how='inner')

e_homopolymers = SUMMARY.loc[(SUMMARY['E.tissues']>=1) & (SUMMARY['period']==1)]

print(SUMMARY.shape, e_homopolymers.shape)

SUMMARY.to_csv('/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/LR_SummaryTest_Table.tsv', sep='\t', index=False)


87891 	 (89629, 4) 	 1789 	 Adipose-Subcutaneous
87912 	 (90740, 4) 	 888 	 Adipose-Visceral(Omentum)
86353 	 (91326, 4) 	 1139 	 Artery-Aorta
85366 	 (91420, 4) 	 1689 	 Artery-Tibial
87076 	 (93517, 4) 	 251 	 Brain-Caudate(basalganglia)
87179 	 (94316, 4) 	 643 	 Brain-Cerebellum
83075 	 (94509, 4) 	 1518 	 Cells-Transformedfibroblasts
90840 	 (97786, 4) 	 1680 	 Esophagus-Mucosa
87608 	 (97896, 4) 	 1374 	 Esophagus-Muscularis
84416 	 (97969, 4) 	 809 	 Heart-LeftVentricle
89448 	 (98157, 4) 	 1412 	 Lung
84241 	 (98215, 4) 	 1665 	 Muscle-Skeletal
88752 	 (98291, 4) 	 2191 	 Nerve-Tibial
89011 	 (98497, 4) 	 1101 	 Skin-NotSunExposed(Suprapubic)
89048 	 (98509, 4) 	 1955 	 Skin-SunExposed(Lowerleg)
89139 	 (98671, 4) 	 2033 	 Thyroid
81939 	 (98671, 4) 	 1243 	 WholeBlood
Number of egenes...  (9446, 91)
Number of egenes by locus...  (19566, 91)

Nbr of eGenes in more than 1 tissues  (3663, 91)
Nbr of eGenes by locus in more than 1 tissues  (8768, 91)

egenes in at least 10 tissues

In [8]:
#Number of eSTRs by STR unit size
for i in range(1,7,1):
    h = SUMMARY.loc[(SUMMARY['E.tissues']>=1) & (SUMMARY['period']==i)].shape[0]
    for T in TISSUES:
        SUMMARY.loc[(SUMMARY['E.tissues']>=1) & (SUMMARY['period']==i)]
    

1 	 (3416, 94)
2 	 (2907, 94)
3 	 (594, 94)
4 	 (1767, 94)
5 	 (508, 94)
6 	 (254, 94)


In [None]:
SUMMARY.loc[SUMMARY['gene']=='ENSG00000000460.12']

In [None]:
see=0 #Add the Alleles 
see=pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/STR_Locus_Alleles.tsv', sep='\t')#q, header=None)
#see.columns=['chrom','str.start', "allele.REF",'allele.ALT','motif_len','ref_counts']
see['str.id']='str_'+see['str.start'].astype(str)
C=['chrom','str.start','str.id',"allele.REF",'allele.ALT','motif_len','ref_counts']
See=see[C]
del See['str.id']
cols = ['chrom','str.start']
OUTs = pd.merge(OUT, See, how='left',on=cols)
del OUTs['ID']
OUTs.shape

In [None]:
cols=['chrom','str.start','str.id','gene','ID']
#Merge to add beta and pvalues by tissue
OUT1 = pd.merge(OUTs, wb, how='left',on=cols[:4])
OUT2 = pd.merge(OUT1, ads, how='left',on=cols) ; OUT1=0
OUT1 = pd.merge(OUT2, ctf, how='left',on=cols) ; OUT2=0
OUT2 = pd.merge(OUT1, esm, how='left',on=cols) ; OUT1=0
OUT1 = pd.merge(OUT2, ms, how='left',on=cols)  ; OUT2=0
OUT2 = pd.merge(OUT1, lng, how='left',on=cols) ; OUT1=0
OUT1 = pd.merge(OUT2, art, how='left',on=cols)
OUT1.shape

In [None]:
#Merge to add qvalues and eSTRs by tissue
OUT3 = pd.merge(OUT1, wbp, how='left',on=cols)
OUT2 = pd.merge(OUT3, adsp, how='left',on=cols) ; OUT3=0
OUT3 = pd.merge(OUT2, ctfp, how='left',on=cols) ; OUT2=0
OUT2 = pd.merge(OUT3, esmp, how='left',on=cols) ; OUT3=0
OUT3 = pd.merge(OUT2, msp, how='left',on=cols)  ; OUT2=0
OUT2 = pd.merge(OUT3, lngp, how='left',on=cols) ; OUT3=0
OUT3 = pd.merge(OUT2, artp, how='left',on=cols)
OUT3.shape

In [None]:
#Label estrs and non estrs by tissue
OUT3['Blood_eSTR?'][OUT3['Blood_eSTR?']!=1] = 0
OUT3['AdipSub_eSTR?'][OUT3['AdipSub_eSTR?']!=1] = 0
OUT3['ArteryT_eSTR?'][OUT3['ArteryT_eSTR?']!=1] = 0
OUT3['Esophagus_eSTR?'][OUT3['Esophagus_eSTR?']!=1] = 0
OUT3['Fibroblast_eSTR?'][OUT3['Fibroblast_eSTR?']!=1] = 0
OUT3['Lung_eSTR?'][OUT3['Lung_eSTR?']!=1] = 0
OUT3['MuscleS_eSTR?'][OUT3['MuscleS_eSTR?']!=1] = 0
OUT3['eSTR_In_tissues'] = OUT3['Blood_eSTR?']+OUT3['Fibroblast_eSTR?']+OUT3['Lung_eSTR?']+OUT3['MuscleS_eSTR?']+OUT3['AdipSub_eSTR?']+OUT3['ArteryT_eSTR?']+OUT3['Esophagus_eSTR?'] 
OUT3.to_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/SUMMARY/LR_SummaryTest_Table.tsv', sep='\t', index=False)
#& (OUT3['Blood_eSTR?']==1)& (OUT3['MuscleS_eSTR?']==1)]

In [None]:
#OUT3['Estrs']=OUT3['Fibroblast_eSTR?']+OUT3['Blood_eSTR?']+OUT3['MuscleS_eSTR?']+OUT3['Lung_eSTR?']+OUT3['Esophagus_eSTR?']+OUT3['ArteryT_eSTR?']+OUT3['AdipSub_eSTR?']
Gene_table = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/FEATURES/Genes_only_table', sep='\t')
OUT = pd.merge(OUT3, Gene_table[["gene","chrom","gene.name"]], on=["chrom","gene"])
#OUT3=OUT
OUT4=OUT.loc[OUT['eSTR_In_tissues']>=1]
OUT4.to_csv('/storage/szfeupe/Runs/GTEx_estr/eSTRs_Tests_Table.tsv', index=False)

OUT.to_csv('/storage/szfeupe/Runs/GTEx_estr/LR_SummaryTest_Table.tsv', sep='\t', index=False)
print('PP')
print(OUT4.shape)
OUT3.loc[(OUT3['eSTR_In_tissues']>=7)]

In [None]:
OUT.loc[OUT["eSTR_In_tissues"]>=1].shape

In [None]:
#(2) in permuted
idx=list(LR.index)
#LRp1 = LRp.loc[LRp['gene'].isin(list(LR['gene'])) & LRp['str.id'].isin(list(LR['str.id']))]
LRp1=LRp.loc[idx]

#select qqplot axis
X= LR['logpval']
Y= LRp1['logpval']

plt.figure(1)
#plot qqplot
sm.qqplot(X, line='45')
sm.qqplot(Y, line='45')
smg.gofplots.qqplot_2samples(X,Y, line='q', xlabel='Observed (log(pval))', ylabel='Permuted (log(pval))')
pylab.show()
#
plt.scatter(X, Y)
plt.ylabel('Observed')
plt.xlabel('Permuted')
plt.show()


In [None]:
##This code summarises the Anova results and prepare file for anovapvalue qqplot 
##
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as ss

Tissue="WholeBlood"
##delta_bic = snp_bic - snpstr_bic        ##delta_aic = snp_aic - snpstr_aic 
##              the lower tht aic or bic the closer the data is to the model
print('Summary for '+Tissue+' ...\n')

#FDR 
pval = pd.read_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/PQValues.txt", sep='\t')
pval = pval.loc[pval['qvalue']<=0.05]  ##eSTRs at 10% FDR
print(pval.shape[0],' eSTRs at 10% FDR')

#Anova output 
hh = pd.read_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/HH/Anova_wg_old.csv", sep=',')
hh1 = hh.loc[hh['gene'].isin(pval['gene'])] 
print(hh1.shape[0], ' eSTRs and their anova p-values')

hh1['st1']=[0]*len(hh1)
hh1['st2']=[0]*len(hh1)
hh1['st1'][hh1['delta_aic']>0] = 1
hh1['st2'][hh1['delta_bic']>0] = 2
hh1['strprefer']=hh1['st1']+hh1['st2']


hh1.to_csv("/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/"+Tissue+"/HH/ESTRs_Anova", sep='\t', index=False)

hh1

In [None]:
print(OUT3.loc[(OUT3['Blood_eSTR?']==1) & (OUT3['eSTR_In_tissues']==7)].shape)
print(OUT3.loc[(OUT3['AdipSub_eSTR?']==1) & (OUT3['eSTR_In_tissues']==7)].shape)
print(OUT3.loc[(OUT3['ArteryT_eSTR?']==1) & (OUT3['eSTR_In_tissues']>=5)].shape)
print(OUT3.loc[(OUT3['Esophagus_eSTR?']==1) & (OUT3['eSTR_In_tissues']>=5)].shape)
print(OUT3.loc[(OUT3['Fibroblast_eSTR?']==1) & (OUT3['eSTR_In_tissues']>=5)].shape)
print(OUT3.loc[(OUT3['Lung_eSTR?']==1) & (OUT3['eSTR_In_tissues']>=5)].shape)
print(OUT3.loc[(OUT3['MuscleS_eSTR?']==1) & (OUT3['eSTR_In_tissues']>=5)].shape)

In [None]:

#wholeblood
wb = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/Lin_Reg_OutFin.txt', sep='\t')
wb['ID']=wb['gene']+wb['str.id']
wb = wb[col]; wb.columns=['chrom','str.start','str.id','gene','ID','beta_blood','blood_pval']
#adipose subcutaneous
ads= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/Lin_Reg_OutFin.txt', sep='\t')
ads['ID']=ads['gene']+ads['str.id']
ads = ads[col]; ads.columns=['chrom','str.start','str.id','gene','ID','beta_AdipSub','AdipSub_pval']
#cell T Fibroblast
ctf= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Cells-Transformedfibroblasts/Lin_Reg_OutFin.txt', sep='\t')
ctf['ID']=ctf['gene']+ctf['str.id']
ctf = ctf[col]; ctf.columns=['chrom','str.start','str.id','gene','ID','beta_Fibroblast','Fibroblast_pval']
#Esophagus mucosa
esm= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Esophagus-Mucosa/Lin_Reg_OutFin.txt', sep='\t')
esm['ID']=esm['gene']+esm['str.id']
esm = esm[col]; esm.columns=['chrom','str.start','str.id','gene','ID','beta_Esophag','Esophag_pval']
#Muscle Skeletal
ms = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Muscle-Skeletal/Lin_Reg_OutFin.txt', sep='\t')
ms['ID']=ms['gene']+ms['str.id']
ms = ms[col] ; ms.columns=['chrom','str.start','str.id','gene','ID','beta_MuscleS','MuscleS_pval']
#Lung
lng= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Lung/Lin_Reg_OutFin.txt', sep='\t')
lng['ID']=lng['gene']+lng['str.id']
lng = lng[col]; lng.columns=['chrom','str.start','str.id','gene','ID','beta_Lung','Lung_pval']
#Arterial Tibial
art= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Artery-Tibial/Lin_Reg_OutFin.txt', sep='\t')
art['ID']=art['gene']+art['str.id']
art = art[col]; art.columns=['chrom','str.start','str.id','gene','ID','beta_ArteryT','ArteryT_pval']
rows=['chrom','str.start','str.id','gene','ID']
OUT=pd.concat([wb[rows],ads[rows],ctf[rows],esm[rows],ms[rows],lng[rows],art[rows]]).drop_duplicates().reset_index(drop=True)
print(wb.shape[0], ads.shape[0],ctf.shape[0], esm.shape[0],ms.shape[0],lng.shape[0],art.shape[0])
#
#FDR_correction
#wbp['Blood_eSTR?'] = np.where(wbp['qvalue']<=0.1, 1, 0)

wbp = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/PQValues.txt', sep='\t')
wbp['Blood_eSTR?']= np.where(wbp['qvalue']<=0.1, 1, 0); wbp['ID']=wbp['gene']+wbp['str.id'] ;wbp['blood_qval']=wbp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'blood_qval', 'Blood_eSTR?']; wbp=wbp[Col]
print('Whole blood',wbp.shape[0])
adsp= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/PQValues.txt', sep='\t')
adsp['AdipSub_eSTR?']= np.where(adsp['qvalue']<=0.1, 1, 0);  adsp['ID']= adsp['gene']+ adsp['str.id']; adsp['AdipSub_qval']=  adsp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'AdipSub_qval', 'AdipSub_eSTR?']; adsp=adsp[Col]
print('Adipose Subcutanous',adsp.shape[0])
ctfp= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Cells-Transformedfibroblasts/PQValues.txt', sep='\t')
ctfp['Fibroblast_eSTR?']= np.where(ctfp['qvalue']<=0.1, 1, 0);  ctfp['ID']= ctfp['gene']+ ctfp['str.id']; ctfp['Fibroblast_qval']=  ctfp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'Fibroblast_qval', 'Fibroblast_eSTR?']; ctfp= ctfp[Col]
print('Cell Fibroblast',ctfp.shape[0])
esmp= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Esophagus-Mucosa/PQValues.txt', sep='\t')
esmp['Esophagus_eSTR?']= np.where(esmp['qvalue']<=0.1, 1, 0); esmp['ID']= esmp['gene']+ esmp['str.id']; esmp['Esophagus_qval']=  esmp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'Esophagus_qval', 'Esophagus_eSTR?']; esmp = esmp[Col]
print('Esophagous Mucosa',esmp.shape[0])
msp = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Muscle-Skeletal/PQValues.txt', sep='\t')
msp['MuscleS_eSTR?']= np.where(msp['qvalue']<=0.1, 1, 0);  msp['ID']=  msp['gene']+  msp['str.id'];  msp['MuscleS_qval']= msp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'MuscleS_qval', 'MuscleS_eSTR?']; msp = msp[Col]
print('Muscle skeletal',msp.shape[0])
lngp= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Lung/PQValue.tsv', sep='\t')
lngp['Lung_eSTR?']= np.where(lngp['qvalue']<=0.1, 1, 0);   lngp['ID']=  lngp['gene']+  lngp['str.id'];  lngp['Lung_qval']= lngp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'Lung_qval', 'Lung_eSTR?']; lngp = lngp[Col]
print('Lungs', lngp.shape[0])
artp= pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Artery-Tibial/PQValues.txt', sep='\t')
artp['ArteryT_eSTR?']=  np.where(artp['qvalue']<=0.1, 1, 0); artp['ID']= artp['gene']+ artp['str.id']; artp['ArteryT_qval']= artp['qvalue']
Col=['chrom','str.start','str.id','gene','ID', 'ArteryT_qval', 'ArteryT_eSTR?'];  artp= artp[Col]
print('Artery tibial', artp.shape[0])
OUT['str.start'] = OUT['str.start'].astype(int)
OUT.shape