In [1]:
import math
import numpy as np
import os
import pandas as pd
import random
import sys
from scipy.stats.stats import pearsonr

"""
Quantify the probability that a variant is causal while allowing for an arbitrary number of causal variants
"""

def PROGRESS(msg, printit=True):
    """Report a status message on stderr.

    msg: text to report; leading/trailing whitespace is stripped and a single
        newline is appended.
    printit: when false the message is silently dropped (some callers pass the
        debug flag here).
    """
    if not printit:
        return
    line = "%s\n" % msg.strip()
    sys.stderr.write(line)
        
def z(vals):
    """Return the z-score (beta / beta.se) for every row of `vals`.

    vals: table with a 'beta' column (effect size) and a 'beta.se' column
        (its standard error).

    Returns the element-wise ratio; when the result is a pandas Series it is
    named 'z.score' and keeps the index of `vals`.

    The input is not modified. Writing a new column into `vals` would be a
    set-on-copy whenever the caller passes a column slice such as
    df[['beta', 'beta.se']], which pandas reports with a SettingWithCopyWarning.
    """
    zscore = vals['beta'] / vals['beta.se']
    if isinstance(zscore, pd.Series):
        zscore = zscore.rename('z.score')
    return zscore

def WriteCorrTable(indexed_genotypes):
    """Generate the variant-by-variant correlation table from genotypes.

    indexed_genotypes: DataFrame with one row per variant (row labels are the
        variant identifiers) and one column per sample. Missing genotypes may
        be encoded as the string 'None'.

    Returns a square DataFrame whose rows and columns are both labelled with
    the variant identifiers:
          _1_ ... _n_
        1| 1  ... C1n
        .|    ...
        n|Cn1 ... Cnn=1
    Each cell is the Pearson correlation of the two variants, computed over
    the samples where both have a value.
    """
    # Samples as rows, variants as columns. The 'None' -> NaN / float
    # conversion is done once for the whole table rather than once per pair.
    G = indexed_genotypes.transpose().replace('None', np.nan).astype(float)
    variants = list(G.columns)
    # DataFrame.corr computes all pairwise Pearson correlations, excluding
    # missing values pair by pair.
    corr = G.corr(method='pearson')
    return pd.DataFrame(corr.values, columns=variants, index=variants)
        


In [2]:
# --- Run options -----------------------------------------------------------
DEBUG = True
DISTFROMGENE = 10000  # padding added on each side of a gene when selecting cis SNPs
CHROM = 1
# Chromosome labels are compared as "chrN" strings when filtering the tables.
CHROM = CHROM if "chr" in str(CHROM) else "chr%s"%CHROM

# --- Expression inputs -----------------------------------------------------
EXPRFILE = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/Corr_Expr.csv'
EXPRANNOTFILE = '~/projects/GTEX_eSTRs/data/Lin_Reg/Gene_Exp_Annotation.txt'
ESTRGENESFILE = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/PQValue.tsv'

# --- Genotype inputs -------------------------------------------------------
STRGTFILE = '/storage/szfeupe/Runs/GTEx_estr/Normalized_Genotypes/STR_Norm_Gen.chr1'
SNPGTFILE = '/storage/szfeupe/Runs/GTEx_estr/SNP_Analysis/SNP_raw_gt_chr1'

# --- Regression results (STRs and SNPs) -------------------------------------
REG_STRs = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/Lin_Reg_OutFin.txt'
REG_SNPs = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/SNP_Analysis/Lin_Reg_OutFin.txt'

# --- Outputs ---------------------------------------------------------------
OUTFILE = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/OUT_caviar'
TMPDIR = '/storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp'

In [3]:
# Load expression
PROGRESS("\nLoad expression", printit=DEBUG)
expr = pd.read_csv(EXPRFILE)

# Load annotation, indexed by probe id
PROGRESS("Load annotation", printit=DEBUG)
expr_annot = pd.read_csv(EXPRANNOTFILE)
expr_annot.index = expr_annot["probe.id"].values
# Keep the annotated probes that appear as expression columns, in expression
# column order. The labels are filtered first because .loc with labels that
# are absent from the index raises KeyError in current pandas (older releases
# returned all-NaN rows, which the dropna() below then removed).
annotated_probes = [probe for probe in expr.columns if probe in expr_annot.index]
expr_annot = expr_annot.loc[annotated_probes].dropna()
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]

# Load strs Regression
PROGRESS("\nLoad strs regression", printit=DEBUG)
strs = pd.read_csv(REG_STRs, sep="\t")
strs = strs.loc[strs['chrom']==CHROM]

# Load snps regression
PROGRESS("\nLoad snps regression", printit=DEBUG)
snps = pd.read_csv(REG_SNPs, sep="\t")
snps = snps.loc[snps['chrom']==CHROM]

# Load SNP genotypes
PROGRESS("Load SNPs", printit=DEBUG)
snpgt = pd.read_csv(SNPGTFILE, sep="\t")
snpgt = snpgt.loc[snpgt['chrom']==CHROM]

# Load STR genotypes
PROGRESS("Load STRs", printit=DEBUG)
strgt = pd.read_csv(STRGTFILE, sep="\t")
strgt = strgt.loc[strgt['chrom']==CHROM]

# Restrict to samples present in both genotype tables. The first two columns
# of each table are "chrom" and "start"; the rest are treated as sample ids.
# Sorting makes the sample order reproducible (set iteration order is not).
PROGRESS("Restrict to STRs samples", printit=DEBUG)
str_samples = sorted(set(strgt.columns[2:].values).intersection(set(snpgt.columns[2:].values)))
expr = expr.loc[str_samples,:]
snpgt = snpgt[["chrom","start"] + str_samples]
snpgt.index = list(snpgt["start"].apply(lambda x: "SNP_%s"%int(x)))
strgt = strgt[["chrom","start"] + str_samples]
strgt.index = list(strgt["start"].apply(lambda x: "STR_%s"%int(x)))
samples_to_keep = str_samples

# Load eSTR results and keep only genes passing the q-value threshold
PROGRESS("Restrict to eSTR genes only", printit=DEBUG)
if ESTRGENESFILE is not None:
    estr_genes = pd.read_csv(ESTRGENESFILE, sep="\t")
    Genes = estr_genes.loc[estr_genes['qvalue']<=0.1, 'gene']  # estrs at 10%FDR
    expr_annot = expr_annot.loc[expr_annot['gene.id'].isin(list(Genes))]
print(expr_annot.shape, len(samples_to_keep), expr.shape, snps.shape, strs.shape, strgt.shape, snpgt.shape)

Load expression
Load annotation
Load strs regression
Load snps regression
Load SNPs
  interactivity=interactivity, compiler=compiler, result=result)
Load STRs
Restrict to STRs samples


(70, 10) 148 (148, 22454) (900916, 12) (22468, 13) (63612, 150) (513574, 150)


Restrict to eSTR genes only


In [4]:
# For each gene: collect its cis SNPs plus its most significant STR, write the
# CAVIAR inputs (z-scores and correlation matrix), run CAVIAR and report the
# posterior of the STR next to the top-ranked variant.
for i in range(expr_annot.shape[0]):
    gene = expr_annot.index.values[i]              # probe id (index of expr_annot)
    ensgene = expr_annot["gene.id"].values[i]      # id used in the regression tables
    genedir = TMPDIR+"/%s"%gene
    if not os.path.exists(genedir):
        os.makedirs(genedir)                       # also creates TMPDIR if it is missing
    PROGRESS("Getting data for %s"%gene, printit=DEBUG)
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]

    # Pull out cis SNPs: regression rows for this gene within DISTFROMGENE of it.
    PROGRESS("Getting cis SNPs for %s"%gene)
    in_window = (snps["str.start"] >= (start-DISTFROMGENE)) & (snps["str.start"] <= (end+DISTFROMGENE))
    # Explicit copy: a row is appended below, which must not target a view of `snps`.
    cis_snps = snps[in_window & (snps['gene']==ensgene)].copy()
    cis_snps.index = cis_snps["str.start"].apply(lambda x: "SNP_%s"%int(x))
    L = list(cis_snps.index)

    # Pull out most significant STR (smallest p.wald) for this gene
    PROGRESS("Getting most significant cis STR for %s"%gene)
    gene_strs = strs[strs["gene"]==ensgene]
    if gene_strs.empty:
        PROGRESS("No STR regression results for %s, skipping"%gene, printit=DEBUG)
        continue
    best_str_start = gene_strs.sort_values("p.wald")["str.start"].values[0]
    best_str_id = 'STR_'+str(best_str_start)
    cis_strs = gene_strs.copy()
    cis_strs.index = list(cis_strs['str.id'])
    # The 'ID' column is dropped when present so the STR row can be appended
    # positionally to the SNP table below.
    cis_strs = cis_strs.drop('ID', axis=1, errors='ignore')
    # NOTE(review): values are assigned by position, so this assumes the STR and
    # SNP regression tables share the same column order once 'ID' is removed.
    cis_snps.loc[best_str_id] = list(cis_strs.loc[best_str_id])

    # Make z file data (one line per variant: id <tab> z-score)
    Z = z(cis_snps[['beta','beta.se']])
    Z.to_csv(genedir+'/ZFILE', sep='\t', header=False)

    # Make LD file: correlation matrix over the same variants, in the same order
    genotypes = snpgt.loc[L].copy()
    genotypes.loc[best_str_id] = list(strgt.loc[best_str_id])
    genotypes = genotypes.drop(['chrom', 'start'], axis=1)
    Matrix = WriteCorrTable(genotypes)
    Matrix.to_csv(genedir+'/LDFILE', sep='\t', header=False, index=False)
    PROGRESS("Matrix of corr was sent to file for %s"%gene)

    # Run caviar. Its stdout goes to ./log in the working directory, which is
    # overwritten for every gene.
    caviar_cmd = "CAVIAR -l %s -z %s -o %s/caviar -c 1 -f 1 > log"%(genedir +"/LDFILE", genedir+"/ZFILE", genedir)
    print(caviar_cmd)
    os.system(caviar_cmd)

    # Prep and record output. A missing caviar_post file is taken to mean that
    # CAVIAR failed for this gene.
    if not os.path.exists(genedir+'/caviar_post'):
        print("......................CAVIAR did not run for ", gene)
        continue
    post = pd.read_csv(genedir+'/caviar_post', sep="\t", header=None)
    # Column 0 holds the variant id, column 2 the score used for ranking.
    caviarstr = post.loc[post[0]==best_str_id][2].tolist()[0]
    ranked = post.sort_values(2, ascending=False)
    topvariant = ranked.values[0][0]
    topscore = ranked.values[0][2]
    print("\t".join([CHROM,gene,str(best_str_start),str(caviarstr),topvariant,str(topscore)]))
    # NOTE(review): stops after the first gene CAVIAR succeeds on; looks like a
    # debugging stop. Remove to process every gene.
    break
    

Getting data for ENSG00000116198.8
Getting cis SNPs for ENSG00000116198.8
Getting most significant cis STR for ENSG00000116198.8
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Matrix of corr was sent to file for ENSG00000116198.8


CAVIAR -l /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000116198.8/LDFILE -z /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000116198.8/ZFILE -o /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000116198.8/caviar -c 1 -f 1 > log
......................CAVIAR did not run for  ENSG00000116198.8


Getting data for ENSG00000069424.10
Getting cis SNPs for ENSG00000069424.10
Getting most significant cis STR for ENSG00000069424.10
Matrix of corr was sent to file for ENSG00000069424.10


CAVIAR -l /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000069424.10/LDFILE -z /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000069424.10/ZFILE -o /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp/ENSG00000069424.10/caviar -c 1 -f 1 > log
chr1	ENSG00000069424.10	6090509	323    0.05968
Name: 2, dtype: float64	SNP_6067261	0.0754638




In [28]:
# Highest score in column 2 of the last CAVIAR result loaded into `post`.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
post.sort_values(2, ascending=False).values[0][2]

  if __name__ == '__main__':


0.0754638

In [None]:

# Shell invocation of GetCausalCAVIAR.py with the same inputs as the
# configuration cell above (run from a terminal, not as Python).
python ~/projects/GTEX_eSTRs/gtex-estrs/Scripts/GetCausalCAVIAR.py \
    --expr /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/Corr_Expr.csv \
    --exprannot ~/projects/GTEX_eSTRs/data/Lin_Reg/Gene_Exp_Annotation.txt \
    --chrom 1 \
    --distfromgene 10000 \
    --strgt /storage/szfeupe/Runs/GTEx_estr/Normalized_Genotypes/STR_Norm_Gen.chr1 \
    --snpgt /storage/szfeupe/Runs/GTEx_estr/SNP_Analysis/SNP_raw_gt_chr1 \
    --out /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/OUT_caviar \
    --restrict_to_estrs /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/PQValue.tsv \
    --linreg_str /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/Lin_Reg_OutFin.txt \
    --linreg_snp /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/SNP_Analysis/Lin_Reg_OutFin.txt \
    --tmpdir /storage/szfeupe/Runs/GTEx_estr/Analysis_by_Tissue/WholeBlood/HH/Testingcenter/tmp \
    --debug