In [54]:
import math
import numpy as np
import os
import pandas as pd
import random
import shutil
import sys
import gzip

def PROGRESS(msg, printit=True):
    if printit: # false for some messages when not in debug mode
        sys.stderr.write("%s\n"%msg.strip())
        
def MakeZScoreTable(vals):
    try:
        Zscore=vals['beta']/vals['beta.se']
    except:
        Zscore=None
    return Zscore

def WriteCorrTable(indexed_genotypes):
    """ generate correlation table using normalized genotype
      _1_ ... _n_
    1| 1  ... C1n
    .|    ...
    n|Cn1 ... Cnn=1
    """
    G=indexed_genotypes.transpose()
    variants = list(G.columns)
    CMat=[]
    print ('\t\t\t**', len(variants))
    for V1 in variants:
        COV=[]
        for V2 in variants:
            X=G[V1].replace('None', np.nan).astype(float)
            Y=G[V2].replace('None', np.nan).astype(float)
            #COV.append(X.corr(Y))
            if X.corr(Y) is np.nan:    #### 
                COV.append(0.0) #For missing LD we assume non linear corr (undetermined LD)
            else:
                COV.append(X.corr(Y))
        CMat.append(COV)
    return pd.DataFrame(CMat,columns=variants, index=variants) 

PATH = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/'
EXPRFILE = PATH+'Corr_Expr.csv'
EXPRANNOTFILE = '/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
CHROM = 22
if "chr" not in str(CHROM): CHROM="chr%s"%CHROM
DISTFROMGENE = 100000
STRGTFILE = '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/GenotypesNormalized.table'
SNPGTFILE = '/storage/szfeupe/Runs/650GTEx_estr/SNP_Analysis/chr22.tab'
OUTFILE = 'help.tab'
REG_STRs = PATH+'/Lin_Reg_Out'
REG_SNPs = PATH+'SNP_Analysis/Lin_Reg_Out'
TMPDIR = PATH+'HH/caviar_temps/'
if not os.path.exists(TMPDIR):
    os.mkdir(TMPDIR)
DEBUG =True
ESTRGENESFILE=None

In [None]:
    # Load expression
    DEBUG =True
    PROGRESS("\nLoad expression", printit=DEBUG)
    expr = pd.read_csv(EXPRFILE)
    samples_to_keep = list(expr.index)
    # Load annotation
    PROGRESS("Load annotation", printit=DEBUG)
    expr_annot = pd.read_csv(EXPRANNOTFILE)
    expr_annot.index = expr_annot["probe.id"].values
    expr_annot = expr_annot.reindex(list(expr.columns))
    expr_annot = expr_annot.dropna() 
    expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
    # Load strs Regression
    PROGRESS("\nLoad strs regression", printit=DEBUG)
    strs = pd.read_csv(REG_STRs, sep="\t")
    strs = strs.loc[strs['chrom']==CHROM]
    # Load snps regression
    PROGRESS("\nLoad snps regression", printit=DEBUG)
    snps = pd.read_csv(REG_SNPs, sep="\t")
    snps = snps.loc[snps['chrom']==CHROM]
    del snps['Unnamed: 0']
    #Load SNP genotypes
    PROGRESS("Load SNPs", printit=DEBUG)
    snpgt = pd.read_csv(SNPGTFILE, sep="\t",low_memory=False)
    snpgt = snpgt.loc[snpgt['chrom']==CHROM]
    # Load STR genotypes
    PROGRESS("Load STRs", printit=DEBUG)
    strgt = pd.read_csv(STRGTFILE, sep="\t")
    strgt = strgt.loc[strgt['chrom']==CHROM]
    # Restrict to STR samples
    PROGRESS("Restrict to STRs samples", printit=DEBUG)
    str_samples = samples_to_keep
    expr = expr.reindex(str_samples)
    snpgt = snpgt[["chrom","start"] + str_samples]
    snpgt.index = list(snpgt["start"].apply(lambda x: "SNP_%s"%int(x)))
    strgt = strgt[["chrom","start"] + str_samples]
    strgt.index = list(strgt["start"].apply(lambda x: "STR_%s"%int(x)))
    # Load eSTR results
    PROGRESS("Restrict to eSTR genes only", printit=DEBUG)
    if ESTRGENESFILE is not None:
        estr_genes = pd.read_csv(ESTRGENESFILE, sep="\t")
        Genes = estr_genes.loc[estr_genes['qvalue']<=0.1]['gene']  # estrs at 10%FDR
        expr_annot = expr_annot.loc[expr_annot['gene.id'].isin(list(Genes))]
    print (expr_annot.shape)
    #open output files
    Errorfile = open(TMPDIR+"/Errorfile.out", 'w')
    OUT = open(OUTFILE, "w")
    OUT.write("\t".join(["chrom", "gene", "best.str.start", "best.str.score", "top.variant", "top.variant.score","top.snp.score"])+'\n')
    # For each gene, get all cis-variants and the best STR
    for i in range(expr_annot.shape[0]):
        gene=expr_annot.index.values[i]
        ensgene = expr_annot["gene.id"].values[i]  #'ENSG00000215912.7'
        genedir=TMPDIR+"/%s"%gene
        if not os.path.exists(genedir):
            os.mkdir(genedir)
        PROGRESS("Getting data for %s"%gene, printit=DEBUG)
        start = expr_annot["gene.start"].values[i]
        end = expr_annot["gene.stop"].values[i]
    # Pull out cis SNPs
        PROGRESS("Getting cis SNPs for %s"%gene)
        cis_snps = snps[(snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))]
        #print cis_snps.shape , '###'
        cis_snps = cis_snps.loc[cis_snps["str.start"].isin(list(snpgt["start"]))]  ###
        cis_variants = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_variants=cis_variants.sort_values(by="p.wald").head(n=100)
        cis_variants.index = cis_variants["str.start"].apply(lambda x: "SNP_%s"%int(x))
        L=list(cis_variants.index)
    # Pull out cis STR
        PROGRESS("Getting most significant cis STR for %s"%gene)
        cis_strs = strs[strs["gene"]==ensgene].sort_values("p.wald")
        if cis_strs.shape[0]==0:
            PROGRESS("There are no STRs found for %s... Gene not in LR table"%gene)
            continue
        else: 
            cis_strs.index = cis_strs["str.start"].apply(lambda x: "STR_%s"%int(x)) 
            L0 = list(cis_strs.index)
        #
        cis_variants = pd.concat([cis_snps, cis_strs])
    # Make z file data
        Ztable = MakeZScoreTable(cis_variants[['beta','beta.se']])
        if Ztable is None:
            Errorfile.write(gene+": Z score could not be calculated; beta.se is probably 0 or null\n")
            continue
        else:
            Ztable.to_csv(genedir+'/ZFILE', sep='\t',header=None)
    # Make LD file
        genotypes = snpgt.loc[L]
        genotypes = pd.concat([genotypes, strgt.loc[L0] ])
        del genotypes['chrom']
        del genotypes['start']
        CorrMatrix = WriteCorrTable(genotypes)
        CorrMatrix.to_csv(genedir+'/LDFILE', sep='\t',header=None, index=None)
        PROGRESS("Matrix of corr was sent to file for %s"%gene)        
    #Run caviar
        caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > %s"%(genedir+"/LDFILE", genedir+"/ZFILE", genedir, genedir+"/log")
        os.system(caviar_cmd)
    #Output results
        if not os.path.exists(genedir+'/caviar_post'):
            Errorfile.write(gene+": CAVIAR did not run.\n\tERROR: Segmentation fault (core dumped) in log file\n")
            continue
        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)    

In [55]:
    # For each gene, get all cis-variants and the best STR
    for i in range(expr_annot.shape[0]):
        gene=expr_annot.index.values[i]
        ensgene = expr_annot["gene.id"].values[i]  #'ENSG00000215912.7'
        genedir=TMPDIR+"/%s"%gene
        if not os.path.exists(genedir):
            os.mkdir(genedir)
        PROGRESS("Getting data for %s"%gene, printit=DEBUG)
        start = expr_annot["gene.start"].values[i]
        end = expr_annot["gene.stop"].values[i]
    # Pull out cis SNPs
        PROGRESS("Getting cis SNPs for %s"%gene)
        cis_snps = snps[(snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))]
        #print cis_snps.shape , '###'
        cis_snps = cis_snps.loc[cis_snps["str.start"].isin(list(snpgt["start"]))]  ###
        cis_variants = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_snps=cis_variants.sort_values(by="p.wald").head(n=100)
        cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
        L=list(cis_snps.index)
    # Pull out cis STR
        PROGRESS("Getting most significant cis STR for %s"%gene)
        cis_strs = strs[strs["gene"]==ensgene].sort_values("p.wald")
        if cis_strs.shape[0]==0:
            PROGRESS("There are no STRs found for %s... Gene not in LR table"%gene)
            continue
        else: 
            cis_strs.index = cis_strs["str.start"].apply(lambda x: "STR_%s"%int(x))
            best_str_start = int(cis_strs["str.start"].values[0])
            L0 = list(cis_strs.index)
        #
        cis_variants = pd.concat([cis_snps, cis_strs])
    # Make z file data
        Ztable = MakeZScoreTable(cis_variants[['beta','beta.se']])
        if Ztable is None:
            Errorfile.write(gene+": Z score could not be calculated; beta.se is probably 0 or null\n")
            continue
        else:
            Ztable.to_csv(genedir+'/ZFILE', sep='\t',header=None)
    # Make LD file
        genotypes = snpgt.loc[L]
        genotypes = pd.concat([genotypes, strgt.loc[L0] ])
        del genotypes['chrom']
        del genotypes['start']
        CorrMatrix = WriteCorrTable(genotypes)
        CorrMatrix.to_csv(genedir+'/LDFILE', sep='\t',header=None, index=None)
        PROGRESS("Matrix of corr was sent to file for %s"%gene)        
    #Run caviar
        caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > %s"%(genedir+"/LDFILE", genedir+"/ZFILE", genedir, genedir+"/log")
        os.system(caviar_cmd)
    #Output results
        if not os.path.exists(genedir+'/caviar_post'):
            Errorfile.write(gene+": CAVIAR did not run.\n\tERROR: Segmentation fault (core dumped) in log file\n")
            continue
        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
            #caviarstr  =  post.loc[post[0]=='STR_'+str(best_str_start)][2].tolist()[0]
            #topvariant =  post.sort_values(post.columns[2], ascending=False).values[0][0]
            #topscore  =  post.sort_values(post.columns[2], ascending=False).values[0][2]
            
        
        print(gene, '\tExpression annotation\t', start, end)
        print('top 100 cis_snps\t', [L[0], L[1],'...',L[-1]],'\t ',len(L), cis_snps.shape)
        print('All cis strs\t', cis_strs.shape , '#str#') 
        print('All cis variants\t', cis_variants.shape)
        break


Getting data for ENSG00000177663.9
Getting cis SNPs for ENSG00000177663.9
Getting most significant cis STR for ENSG00000177663.9


			** 112
ENSG00000177663.9 	Expression annotation	 17565844.0 17596583.0
top 100 cis_snps	 ['SNP_17586609', 'SNP_17586715', '...', 'SNP_17569017'] 	  100 (100, 12)
All cis strs	 (12, 12) #str#
All cis variants	 (112, 12)


Matrix of corr was sent to file for ENSG00000177663.9


In [8]:

        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
            print 'post','......', post    #.loc[post[0]=='STR_'+str(best_str_start)][2].tolist()
            #caviarstr  =  post.loc[post[0]=='STR_'+str(best_str_start)][2].tolist()[0]
            #topvariant =  post.sort_values(post.columns[2], ascending=False).values[0][0]
            #topscore  =  post.sort_values(post.columns[2], ascending=False).values[0][2]
            #if 'STR' in topvariant:
            #    snpscore  =  post.sort_values(post.columns[2], ascending=False).values[1][2]
            #else:
            #    snpscore=topscore
            #OUT.write("\t".join([CHROM, gene, str(best_str_start), str(caviarstr), topvariant, str(topscore), str(snpscore)])+'\n')
        O=0
        break
    OUT.close()
    Errorfile.close()


Load expression


Index(['GTEX-1117F', 'GTEX-111FC', 'GTEX-111VG', 'GTEX-111YS', 'GTEX-1122O',
       'GTEX-117YX', 'GTEX-11DXZ', 'GTEX-11DZ1', 'GTEX-11EM3', 'GTEX-11EQ8',
       ...
       'GTEX-ZVZQ', 'GTEX-ZXES', 'GTEX-ZXG5', 'GTEX-ZYFC', 'GTEX-ZYFD',
       'GTEX-ZYVF', 'GTEX-ZYW4', 'GTEX-ZYWO', 'GTEX-ZZ64', 'GTEX-ZZPU'],
      dtype='object', length=270)

In [62]:
post

Unnamed: 0,0,1,2
0,SNP_17586609,7.649090e-01,7.649090e-01
1,SNP_17586715,1.064020e-01,1.064020e-01
2,SNP_17586631,1.048040e-01,1.048040e-01
3,SNP_17586583,2.388540e-02,2.388540e-02
4,SNP_17602164,7.068190e-11,7.068190e-11
5,SNP_17595915,4.266680e-11,4.266680e-11
6,SNP_17595929,4.266680e-11,4.266680e-11
7,SNP_17591089,2.184990e-11,2.184990e-11
8,SNP_17596322,1.692930e-11,1.692930e-11
9,SNP_17601466,5.988680e-12,5.988680e-12


In [64]:
            topvariants =  post.sort_values(post.columns[2], ascending=False).values[:5][0]
            topscores  =  post.sort_values(post.columns[2], ascending=False).values[:5][2]
            topcaviarstr  =  post.loc[post[0]=='STR_'+str(best_str_start)][2].tolist()[0]
            print(topvariants, '\n', topscores, '\n', topcaviarstr)

['SNP_17586609' 0.7649090000000001 0.7649090000000001] 
 ['SNP_17586631' 0.104804 0.104804] 
 4.1156000000000005e-17
