In [1]:
import math
import numpy as np
import os
import pandas as pd
import random
import shutil
import sys
import gzip

def PROGRESS(msg, printit=True):
    if printit: # false for some messages when not in debug mode
        sys.stderr.write("%s\n"%msg.strip())
        
def MakeZScoreTable(vals):
    try:
        Zscore=vals['beta']/vals['beta.se']
    except:
        Zscore=None
    return Zscore

def WriteCorrTable(indexed_genotypes):
    """ generate correlation table using normalized genotype
      _1_ ... _n_
    1| 1  ... C1n
    .|    ...
    n|Cn1 ... Cnn=1
    """
    G=indexed_genotypes.transpose()
    variants = list(G.columns)
    CMat=[]
    print ('\t\t\t**', len(variants))
    for V1 in variants:
        COV=[]
        for V2 in variants:
            X=G[V1].replace('None', np.nan).astype(float)
            Y=G[V2].replace('None', np.nan).astype(float)
            #COV.append(X.corr(Y))
            if X.corr(Y) is np.nan:    #### 
                COV.append(0.0) #For missing LD we assume non linear corr (undetermined LD)
            else:
                COV.append(X.corr(Y))
        CMat.append(COV)
    return pd.DataFrame(CMat,columns=variants, index=variants) 

def lookfor (x,p):
    for i in range(1,len(p.index)):
        if x in p.values[i][0]:
            top = p.values[i][0]
            score = p.values[i][2]
            return i,top, score

PATH = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Esophagus-Mucosa/'
EXPRFILE = PATH+'Corr_Expr.csv'
EXPRANNOTFILE = '/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
CHROM = 21
if "chr" not in str(CHROM): CHROM="chr%s"%CHROM
DISTFROMGENE = 100000
STRGTFILE = '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/GenotypesNormalized.table'
SNPGTFILE = '/storage/szfeupe/Runs/650GTEx_estr/SNP_Analysis/chr21.tab'
OUTFILE = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Esophagus-Mucosa/HH/caviar.out21'
REG_STRs = PATH+'/Lin_Reg_Out'
REG_SNPs = PATH+'SNP_Analysis/Lin_Reg_Out'
TMPDIR = PATH+'HH/caviar_temps/'
if not os.path.exists(TMPDIR):
    os.mkdir(TMPDIR)
DEBUG =True
ESTRGENESFILE=None


# Load expression
DEBUG =True
PROGRESS("\nLoad expression", printit=DEBUG)
expr = pd.read_csv(EXPRFILE)
samples_to_keep = list(expr.index)
# Load annotation
PROGRESS("Load annotation", printit=DEBUG)
expr_annot = pd.read_csv(EXPRANNOTFILE)
expr_annot.index = expr_annot["probe.id"].values
expr_annot = expr_annot.reindex(list(expr.columns))
expr_annot = expr_annot.dropna() 
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
# Load strs Regression
PROGRESS("\nLoad strs regression", printit=DEBUG)
strs = pd.read_csv(REG_STRs, sep="\t")
strs = strs.loc[strs['chrom']==CHROM]
# Load snps regression
PROGRESS("\nLoad snps regression", printit=DEBUG)
snps = pd.read_csv(REG_SNPs, sep="\t")
snps = snps.loc[snps['chrom']==CHROM]
del snps['Unnamed: 0']
#Load SNP genotypes
PROGRESS("Load SNPs", printit=DEBUG)
snpgt = pd.read_csv(SNPGTFILE, sep="\t",low_memory=False)
snpgt = snpgt.loc[snpgt['chrom']==CHROM]
# Load STR genotypes
PROGRESS("Load STRs", printit=DEBUG)
strgt = pd.read_csv(STRGTFILE, sep="\t")
strgt = strgt.loc[strgt['chrom']==CHROM]
# Restrict to STR samples
PROGRESS("Restrict to STRs samples", printit=DEBUG)
str_samples = samples_to_keep
expr = expr.reindex(str_samples)
snpgt = snpgt[["chrom","start"] + str_samples]
snpgt.index = list(snpgt["start"].apply(lambda x: "SNP_%s"%int(x)))
strgt = strgt[["chrom","start"] + str_samples]
strgt.index = list(strgt["start"].apply(lambda x: "STR_%s"%int(x)))
# Load eSTR results
PROGRESS("Restrict to eSTR genes only", printit=DEBUG)
if ESTRGENESFILE is not None:
    estr_genes = pd.read_csv(ESTRGENESFILE, sep="\t")
    Genes = estr_genes.loc[estr_genes['qvalue']<=0.1]['gene']  # estrs at 10%FDR
    expr_annot = expr_annot.loc[expr_annot['gene.id'].isin(list(Genes))]
print (expr_annot.shape)
#open output files
Errorfile = open(TMPDIR+"/Errorfile.out", 'w')
OUT = open(OUTFILE, "w")
OUT.write("\t".join(['CHROM', 'gene', 'num.STRs.in.top5', 'top_snp', 'top_snp_score', 'top_str', 'top.str.score'])+'\n')

Load expression
Load annotation
Load strs regression
Load snps regression
Load SNPs
Load STRs
Restrict to STRs samples


(164, 10)


Restrict to eSTR genes only


72

In [None]:
    OUT.close()
    O=0
    OUT = open(OUTFILE, "w")
    OUT.write("\t".join(['CHROM', 'gene', 'num.STRs.in.top5', 'top_snp', 'top_snp_score', 'top_str', 'top.str.score'])+'\n')
    # For each gene, get all cis-variants and the best STR
    for i in range(0, expr_annot.shape[0]):
        gene=expr_annot.index.values[i]
        ensgene = expr_annot["gene.id"].values[i]  #'ENSG00000215912.7'
        genedir=TMPDIR+"/%s"%gene
        if not os.path.exists(genedir):
            os.mkdir(genedir)
        clear_cmd = "rm "+genedir+'/*'
        os.system(clear_cmd)
        #PROGRESS("Getting data for %s"%gene, printit=DEBUG)
        start = expr_annot["gene.start"].values[i]
        end = expr_annot["gene.stop"].values[i]
    # Pull out cis SNPs
        #PROGRESS("Getting cis SNPs for %s"%gene)
        cis_snps = snps[(snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))]
        cis_snps = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
        print (gene, cis_snps.shape , '##SNPs#')
        cis_variants = cis_snps.loc[cis_snps["str.start"].isin(list(snpgt["start"]))]  ###        
        cis_snps=cis_variants.sort_values(by="p.wald").head(n=100)
        cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
        L=list(cis_snps.index); print(len(L))

    # Pull out cis STR
        #PROGRESS("Getting most significant cis STR for %s"%gene)
        cis_strs = strs[strs["gene"]==ensgene].sort_values("p.wald")
        if cis_strs.shape[0]==0:
            PROGRESS("There are no STRs found for %s... Gene not in LR table"%gene)
            continue
        else: 
            cis_strs.index = cis_strs["str.start"].apply(lambda x: "STR_%s"%int(x))
            best_str_start = int(cis_strs["str.start"].values[0])
            L0 = list(cis_strs.index)
        #
        cis_variants = pd.concat([cis_snps, cis_strs])
        print(len(L), len(L0), len(set(L0)),  cis_variants.shape)
    # Make z file data
        Ztable = MakeZScoreTable(cis_variants[['beta','beta.se']])
        if Ztable is None:
            Errorfile.write(gene+": Z score could not be calculated; beta.se is probably 0 or null\n")
            continue
        else:
            Ztable.to_csv(genedir+'/ZFILE', sep='\t',header=None)
    # Make LD file
        genotypes = snpgt.loc[L]
        genotypes = pd.concat([genotypes, strgt.loc[L0] ])
        del genotypes['chrom']
        del genotypes['start']
        CorrMatrix = WriteCorrTable(genotypes)
        CorrMatrix.to_csv(genedir+'/LDFILE', sep='\t',header=None, index=None)
        PROGRESS("Matrix of corr was sent to file for %s"%gene)        
    #Run caviar
        caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > %s"%(genedir+"/LDFILE", genedir+"/ZFILE", genedir, genedir+"/log")
        os.system(caviar_cmd)
    #Output results
        if not os.path.exists(genedir+'/caviar_post'):
            Errorfile.write(gene+": CAVIAR did not run.\n\tERROR: Segmentation fault (core dumped) in log file\n")
            continue
        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
            post = post.sort_values(post.columns[2], ascending=False)
            p = post.head(5)
            print (p[0].shape)
            
            num_str = len([x for x in list(p[0].values) if "STR_" in x])
            if 'STR_' in p[0][0]:
                topstr = p[0][0]
                topstrscore = p.values[0][2]
                I,topsnp, topsnpscore =lookfor('SNP_', post)
            else:
                topsnp = p[0][0]
                topsnpscore = p.values[0][2]
                I, topstr , topstrscore =lookfor('STR_',post)
            OUT.write("\t".join([CHROM, gene, str(num_str),topsnp,str(topsnpscore),str(topstr), str(topstrscore),str(I+1)])+'\n')
            #print("\t".join([CHROM, gene, str(num_str),topsnp,str(topsnpscore),str(topstr), str(topstrscore),str(I+1)])+'\n')
            strsscores = post.loc[post[0].isin(cis_strs.index)][[0,2]]
            strsscores['chrom']=[CHROM]*strsscores.shape[0]
            strsscores['gene']= [gene]*strsscores.shape[0]
            strsscores.columns = ['str','score', 'chrom', 'gene']
            with open('strs_core'+CHROM, 'a') as k:
                (strsscores[['chrom', 'gene','str','score']]).to_csv(k, header=False, index=False,sep='\t')  
        O=0+1                  
        if O==6:
            break

In [22]:
cis_strs

Unnamed: 0_level_0,gene,chrom,str.id,str.start,n.miss,allele1.dummy,allele2.dummy,af.dummy,beta,beta.se,lambda.remel,p.wald
str.start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
STR_15482659,ENSG00000188992.7,chr21,STR_15482659,15482659,14,A,G,0,-0.157865,0.063874,-1,0.014152
STR_15626893,ENSG00000188992.7,chr21,STR_15626893,15626893,5,A,G,0,0.150323,0.062779,-1,0.017387
STR_15569348,ENSG00000188992.7,chr21,STR_15569348,15569348,30,A,G,0,0.157689,0.066127,-1,0.017934
STR_15664856,ENSG00000188992.7,chr21,STR_15664856,15664856,7,A,G,0,0.093743,0.063477,-1,0.141007
STR_15497316,ENSG00000188992.7,chr21,STR_15497316,15497316,12,A,G,0,-0.093863,0.064131,-1,0.144606
STR_15497316,ENSG00000188992.7,chr21,STR_15497316,15497316,12,A,G,0,-0.093863,0.064131,-1,0.144606
STR_15631144,ENSG00000188992.7,chr21,STR_15631144,15631144,4,A,G,0,0.088816,0.063122,-1,0.160657
STR_15615546,ENSG00000188992.7,chr21,STR_15615546,15615546,46,A,G,0,0.0777,0.069295,-1,0.263463
STR_15488331,ENSG00000188992.7,chr21,STR_15488331,15488331,3,A,G,0,-0.06831,0.063098,-1,0.280024
STR_15479065,ENSG00000188992.7,chr21,STR_15479065,15479065,15,A,G,0,-0.064823,0.064684,-1,0.31729


In [None]:
    # For each gene, get all cis-variants and the best STR
    for i in range(expr_annot.shape[0]):
        gene=expr_annot.index.values[i]
        ensgene = expr_annot["gene.id"].values[i]  #'ENSG00000215912.7'
        genedir=TMPDIR+"/%s"%gene
        if not os.path.exists(genedir):
            os.mkdir(genedir)
        clean_cmd='rm '+
        os.system(clean_cmd)
        PROGRESS("Getting data for %s"%gene, printit=DEBUG)
        start = expr_annot["gene.start"].values[i]
        end = expr_annot["gene.stop"].values[i]
    # Pull out cis SNPs
        PROGRESS("Getting cis SNPs for %s"%gene)
        cis_snps = snps[(snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))]
        #print cis_snps.shape , '###'
        cis_snps = cis_snps.loc[cis_snps["str.start"].isin(list(snpgt["start"]))]  ###
        cis_variants = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_variants=cis_variants.sort_values(by="p.wald").head(n=100)
        cis_variants.index = cis_variants["str.start"].apply(lambda x: "SNP_%s"%int(x))
        L=list(cis_variants.index)
    # Pull out cis STR
        PROGRESS("Getting most significant cis STR for %s"%gene)
        cis_strs = strs[strs["gene"]==ensgene].sort_values("p.wald")
        if cis_strs.shape[0]==0:
            PROGRESS("There are no STRs found for %s... Gene not in LR table"%gene)
            continue
        else: 
            cis_strs.index = cis_strs["str.start"].apply(lambda x: "STR_%s"%int(x)) 
            L0 = list(cis_strs.index)
        #
        cis_variants = pd.concat([cis_snps, cis_strs])
    # Make z file data
        Ztable = MakeZScoreTable(cis_variants[['beta','beta.se']])
        if Ztable is None:
            Errorfile.write(gene+": Z score could not be calculated; beta.se is probably 0 or null\n")
            continue
        else:
            Ztable.to_csv(genedir+'/ZFILE', sep='\t',header=None)
    # Make LD file
        genotypes = snpgt.loc[L]
        genotypes = pd.concat([genotypes, strgt.loc[L0] ])
        del genotypes['chrom']
        del genotypes['start']
        CorrMatrix = WriteCorrTable(genotypes)
        CorrMatrix.to_csv(genedir+'/LDFILE', sep='\t',header=None, index=None)
        PROGRESS("Matrix of corr was sent to file for %s"%gene)        
    #Run caviar
        caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > %s"%(genedir+"/LDFILE", genedir+"/ZFILE", genedir, genedir+"/log")
        os.system(caviar_cmd)
    #Output results
    #Output results
        if not os.path.exists(genedir+'/caviar_post'):
            Errorfile.write(gene+": CAVIAR did not run.\n\tERROR: Segmentation fault (core dumped) in log file\n")
            continue
        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
            p = post.head(5)
            num_str = len([x for x in list(p[0].values) if "STR_" in x])
            if 'STR_' in p[0][0]:
                topstr = p[0][0]
                topstrscore = p.values[0][2]
                I,topsnp, topsnpscore =lookfor('SNP_', post)
            else:
                topsnp = p[0][0]
                topsnpscore = p.values[0][2]
                I, topstr , topstrscore =lookfor('STR_',post)
            OUT.write("\t".join([CHROM, gene, str(num_str), topsnp, str(topsnpscore), str(topstr), str(topstrscore)])+'\n')
            print("**\t".join([CHROM, gene, str(num_str), topsnp, str(topsnpscore), str(topstr), str(topstrscore), str(I)])+'\n')

    OUT.close()

In [28]:
p.reindex()

Unnamed: 0,0,1,2
100,STR_126162,0.030105,0.012175
0,SNP_82865,0.025945,0.010493
1,SNP_126890,0.021431,0.008667
2,SNP_126888,0.019044,0.007702
3,SNP_143559,0.016886,0.006829


In [22]:
#I, topstr , topstrscore =lookfor('STR_',post)
print(lookfor('STR_',post))
#1 SNP_82865 0.0104928

None


In [None]:
#OLD LOOP
    for i in range(expr_annot.shape[0]):
        gene=expr_annot.index.values[i]
        ensgene = expr_annot["gene.id"].values[i]  #'ENSG00000215912.7'
        genedir=TMPDIR+"/%s"%gene
        if not os.path.exists(genedir):
            os.mkdir(genedir)        
        clear_cmd = "rm "+genedir+'/*'
        os.system(clear_cmd)
        PROGRESS("Getting data for %s"%gene, printit=DEBUG)
        start = expr_annot["gene.start"].values[i]
        end = expr_annot["gene.stop"].values[i]
    # Pull out cis SNPs
        PROGRESS("Getting cis SNPs for %s"%gene)
        cis_snps = snps[(snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))]
        cis_snps = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
        print cis_snps.shape , '##SNPs#'
        cis_variants = cis_snps.loc[cis_snps["str.start"].isin(list(snpgt["start"]))]  ###        
        #cis_variants = cis_snps.loc[cis_snps['gene']==ensgene]
        cis_snps=cis_variants.sort_values(by="p.wald").head(n=100)
        #cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
        L=list(cis_snps.index)
    # Pull out cis STR
        PROGRESS("Getting all cis STR for %s"%gene)
        cis_strs = strs[strs["gene"]==ensgene].sort_values("p.wald")
        if cis_strs.shape[0]==0 :
            PROGRESS("There are no STRs found for %s... Gene not in LR table"%gene)
            continue
        elif cis_snps.shape[0]<=1:
            PROGRESS("There are no or not enough SNPs found for %s... Gene not in LR table"%gene)
            continue
        else: 
            cis_strs.index = cis_strs["str.start"].apply(lambda x: "STR_%s"%int(x)) 
            L0 = list(cis_strs.index)
        #
        cis_variants = pd.concat([cis_snps, cis_strs])
        print len(L), len(L0), cis_variants.shape
    # Make z file data
        Ztable = MakeZScoreTable(cis_variants[['beta','beta.se']])
        #print cis_variants[['beta','beta.se']]
        if Ztable is None:
            Errorfile.write(gene+": Z score could not be calculated; beta.se is probably 0 or null\n")
            continue
        else:
            Ztable.to_csv(genedir+'/ZFILE', sep='\t',header=None)
    # Make LD file
        
        genotypes = snpgt.loc[L]
        genotypes = pd.concat([genotypes, strgt.loc[L0] ])
        del genotypes['chrom']
        del genotypes['start']
        CorrMatrix = WriteCorrTable(genotypes)
        CorrMatrix.to_csv(genedir+'/LDFILE', sep='\t',header=None, index=None)
        PROGRESS("Matrix of corr for %s LDFILE %s ZFILE %s "%(gene, str(CorrMatrix.shape), str(Ztable.shape)))        
    #Run caviar
        caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > %s"%(genedir+"/LDFILE", genedir+"/ZFILE", genedir, genedir+"/log")
        os.system(caviar_cmd)
    #Output results
        PROGRESS(genedir)
        if not os.path.exists(genedir+'/caviar_post'):
            Errorfile.write(gene+": CAVIAR did not run.\n\tERROR: Segmentation fault (core dumped) in log file\n")
            continue
        else:
            post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
            post = post.sort_values(post.columns[2], ascending=False)
            p = post.head(5)
            print 'top 5 ranked variants', p.shape
            num_str = len([x for x in list(p[0].values) if "STR_" in x])
            if 'STR_' in p[0][0]:
                topstr = p[0][0]
                topstrscore = p.values[0][2]
                I,topsnp, topsnpscore =lookfor('SNP_', post)
            else:
                topsnp = p[0][0]
                topsnpscore = p.values[0][2]
                I, topstr , topstrscore =lookfor('STR_',post)
            OUT.write("\t".join([CHROM, gene, str(num_str),topsnp,str(topsnpscore),str(topstr), str(topstrscore),str(I+1)])+'\n')
            strsscores = post.loc[post[0].isin(cis_strs.index)][[0,2]]
            strsscores['chrom']=[CHROM]*strsscores.shape[0]
            strsscores['gene']= [gene]*strsscores.shape[0]
            strsscores.columns = ['str','score', 'chrom', 'gene']
            with open('strs_core'+CHROM, 'a') as k:
                (strsscores[['chrom', 'gene','str','score']]).to_csv(k, header=False, index=False,sep='\t')  
            O=0+1
        #break

