In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import subprocess

def round1adjust(annot, linreg):
    #Set up empty dataframe
    index=['0']
    Out_Lin_reg = pd.DataFrame(index=index, columns=linreg.columns)
    Out_Lin_reg = Out_Lin_reg.fillna(0)
    Out_Lin_reg['PVAL']=0
    Test=[]        #Number of STRs tested for a gene
    Notest=[]      #Gene twith no tested genes
    for index, gene in annot.iterrows():
        geneid = gene['gene.id']
        start = gene['gene.start']
        stop = gene['gene.stop']
#        strs = linreg.loc[linreg['str.start'].isin(range(start,stop))]
        strs = linreg.loc[linreg['gene'].isin([geneid])]
        N = len(strs)
#identifying the STR test with smallest pvalue
        if N>0 :
            ind_low = strs.loc[strs['p.wald'].idxmin()]
            lowpval = strs.loc[[ind_low.name]]
#adjusting by the number of tests
            Adj_pval = lowpval['p.wald']*N
            lowpval['PVAL'] = Adj_pval 
#Append adjusted STR in the output for final FDR adjustment
            Out_Lin_reg.loc[len(Out_Lin_reg)] = lowpval.values.tolist()[0]
            Test.append(N)
    
        else:
            Notest.append(gene['gene.id']) 
    Out_Lin_reg = Out_Lin_reg.drop('0')
    print(len(Notest), ' genes were not tested for eSTRs')
    print(len(Test), ' genes were tested for eSTRs\n') 
    return(Out_Lin_reg, Notest, Test)

In [2]:
Input='/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/Cells-Transformedfibroblasts/'

#open gene annotation
#Looking at these columns gene.chr	gene.start	gene.stop	gene.id
Annot = pd.read_csv('~/projects/GTEX_eSTRs/data/Lin_Reg/Gene_Exp_Annotation.txt', sep=',')
#open Linear regression output
#Looking at these columns gene	chrom	str.start	p.wald
Linreg = pd.read_csv(Input+'/Lin_Reg_Out', sep='\t')
#Set up the test set
chrom = ['chr'+str(i) for i in range(1,22)]
chrom.append('chrX') ; chrom.append('chrY')
##chrom=['chr10']
#Setting up
index=['0']
Adjusted = pd.DataFrame(index=index, columns=Linreg.columns)
Adjusted = Adjusted.fillna(0)
print (Adjusted.shape)
NT = []
#phase1 single out the STR with lowest pval and adjust by chromosome
for ch in chrom:
    A = Annot.loc[Annot['gene.chr'].isin([ch])]
    LR = Linreg.loc[Linreg['chrom'].isin([ch])]
#    print(ch,' ',len(A), len(LR))
    Adjval, GTEST, NTest = round1adjust(A, LR)
#    Adjval.drop('0')
    Adjusted = pd.concat([Adjusted, Adjval])#Adjusted.append(Adjval, ignore_index=True)
    NT =NT + NTest
    print('Done with ', ch, '\t', Adjval.shape,' TO... ',Adjusted.shape)
print('End')
Adjusted.to_csv('LinReg_adj.tsv', sep='\t')


Out_Lin_reg=Adjusted.drop('0')
#SOmetime, the p-values multiplied by the number of STRs is >1 
Out_Lin_reg['NTest']=NT
Out_Lin_reg['PVAL'][Out_Lin_reg['PVAL']>1] = 1
PVAL=Out_Lin_reg['PVAL']
PVAL.to_csv('pvalues.txt', sep='\n', index=False)
print(len(PVAL), ' Total number of tests')
print(len(PVAL[PVAL>=1]),' Total number of test with pval reduced to 1')

(1, 13)
326  genes were not tested for eSTRs
1672  genes were tested for eSTRs

Done with  chr1 	 (1672, 14)  TO...  (1673, 14)
161  genes were not tested for eSTRs
1069  genes were tested for eSTRs

Done with  chr2 	 (1069, 14)  TO...  (2742, 14)
139  genes were not tested for eSTRs
902  genes were tested for eSTRs

Done with  chr3 	 (902, 14)  TO...  (3644, 14)
118  genes were not tested for eSTRs
611  genes were tested for eSTRs

Done with  chr4 	 (611, 14)  TO...  (4255, 14)
123  genes were not tested for eSTRs
737  genes were tested for eSTRs

Done with  chr5 	 (737, 14)  TO...  (4992, 14)
171  genes were not tested for eSTRs
828  genes were tested for eSTRs

Done with  chr6 	 (828, 14)  TO...  (5820, 14)
132  genes were not tested for eSTRs
754  genes were tested for eSTRs

Done with  chr7 	 (754, 14)  TO...  (6574, 14)
109  genes were not tested for eSTRs
556  genes were tested for eSTRs

Done with  chr8 	 (556, 14)  TO...  (7130, 14)
119  genes were not tested for eSTRs
648  ge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
#Now, we use the QVALUE package to adjust the pvalues and obtain the qvalues
Tell = subprocess.call("/home/szfeupe/projects/GTEX_eSTRs/gtex-estrs/Scripts/fdr-correct.r")
Qval=pd.read_csv('/home/szfeupe/projects/GTEX_eSTRs/gtex-estrs/Scripts/qvalues.txt', sep=' ')
print (Qval.shape, '\t', Out_Lin_reg.shape)
#Add output from QVALUE
Out_Lin_reg['pval_test']=list(Qval['pvalue'])
Out_Lin_reg['qvalue']=list(Qval['qvalue'])
Out_Lin_reg['significant']=list(Qval['significant'])
#Organize and save
col=['gene', 'chrom', 'str.id', 'str.start', 'NTest', 'p.wald', 'qvalue', 'significant', 'beta','beta.se', 'PVAL', 'pval_test', 'af.dummy', 'allele1.dummy', 'allele2.dummy','lambda.remel', 'n.miss']
Out_Lin_reg=Out_Lin_reg[col]
Out_Lin_reg.to_csv(Input+'PQValues.txt', sep='\t', index=False)
#Summarize it
print("FDR correction summary: \neSTRs counts\t Treshold")
print(len(Qval[Qval['qvalue'] <=0.1]),'\t qval<=0.1')
print(len(Qval[Qval['qvalue'] <=0.05]),'\t qval<=0.05')
print(len(Qval[Qval['qvalue'] <=0.01]),'\t qval<0.01')

(14429, 6) 	 (14429, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


FDR correction summary: 
eSTRs counts	 Treshold
1312 	 qval<=0.1
987 	 qval<=0.05
628 	 qval<0.01


In [None]:
genes1=['ENSG00000115129.9','ENSG00000018280.12','ENSG00000164692.13','ENSG00000146648.11','ENSG00000111537.4','ENSG00000149948.9','ENSG00000007171.12','ENSG00000100985.7','ENSG00000100292.12']
tes = Out_Lin_reg.loc[Out_Lin_reg['gene'].isin(genes1)]
print (tes[['gene','p.wald','beta', 'qvalue']])

In [None]:
plt.hist(PVAL, label='pvalues')
plt.hist(Out_Lin_reg['qvalue'], label='qvalues')
plt.show()
plt.hist(Out_Lin_reg['qvalue'], label='qvalues')
print(len(PVAL[PVAL>=1]))
print(len(PVAL))
plt.show()

In [None]:
2- FDR correction by chromosome
	1 we are performing a gene level correction
for a given gene we count the number N of tested STRs
we identify the STR test with smallest pvalue
that value is then adjusted by the number of test -> pvalue x N
then we use the QVALUE package to adjust the pvalues and obtain the qvalues
We then pick a threshold eg 0.05 to identify genes with eSTRs
Te we will perform the fine mapping that is where we actually make sure the STR tested in 
question are the real eSTRs and not the ones left off in the selection made above.
    genes.append(gene['gene.id'])
        strid.append(lowpval.loc[lowpval['str.id']])
        Nbr_test.append(N)
        pvalue.append(lowpval.loc[lowpval['p.wald']])
        
        
        
index=['0']
Out_Lin_reg = pd.DataFrame(index=index, columns=linreg.columns)
Out_Lin_reg = Out_Lin_reg.fillna(0)
Out_Lin_reg['PVAL']=0
Test=[]        #Number of STRs tested for a gene
Notest=[]      #Gene twith no tested genes
genes=[]; strid=[]; pvalue=[]; Nbr_test=[]
#For each gene in the set, count the number N of tested STRs
for index, gene in annot.iterrows():
    geneid = gene['gene.id']
    start = gene['gene.start']
    stop = gene['gene.stop']
#    strs = linreg.loc[linreg['str.start'].isin(range(start,stop))]
    strs = linreg.loc[linreg['gene'].isin([geneid])]
    N = len(strs)
    #print (N,' ',gene['gene.id'], '\t', len(list(strs['p.wald'])),'\t', min(strs['p.wald']))
    
#identifying the STR test with smallest pvalue
    if N>0 :
        ind_low = strs.loc[strs['p.wald'].idxmin()]
        lowpval = strs.loc[[ind_low.name]]
        #print (strs['p.wald'], '\n\n', lowpval['p.wald'])
#        print (N,' ',gene['gene.id'], '\t', len(list(strs['p.wald'])),'\t', min(strs['p.wald']))
#adjusting by the number of tests
        Adj_pval = lowpval['p.wald']*N
        lowpval['PVAL'] = Adj_pval
#        print (gene['gene.id'], '\t', lowpval['gene'],'\t',N)
    
#Append adjusted STR in the output for final FDR adjustment
        Out_Lin_reg.loc[len(Out_Lin_reg)] = lowpval.values.tolist()[0]
        Test.append(N)
    
    else:
        Notest.append(gene['gene.id']) 
        #print(gene['gene.id'],'----------------------------------')

Out_Lin_reg = Out_Lin_reg.drop('0')
print(len(Notest), ' genes were not tested for eSTRs')
print(len(Test), ' genes were tested for eSTRs')

In [None]:
p = [1,2,3]
q = [0,9,8]
p+q