In [43]:
#!/usr/bin/python2.7

import argparse
import math
import numpy as np
import os
import pandas as pd
import random
import shutil
import sys
import gzip

#sys.path.append("/san/melissa/workspace/str-qtl/lmm/")
#from LMMSimulationsUtils2 import *

"""
Analyze heritability of gene expression due to SNPs/STRs
Notes:
** The reported SE for STRs when treating the STR as fixed effect are the SE on *beta*, not on *beta^2* **
"""

GCTA_MIN_VE = 0.000001 # Lowest VE that GCTA reports when constrained

def PROGRESS(msg, printit=True):
    if printit: # false for some messages when not in debug mode
        sys.stderr.write("%s\n"%msg.strip())

def GetMAF(x):
    """
    Get SNP MAF
    """
    x=x.convert_objects(convert_numeric=True)
    vals = x.values
#    print(len(vals))
    maf = sum(vals)*1.0/(2*len(vals))
    return min([maf, 1-maf])

def GRM(snpdata):
    """
    Return n by n GRM, scaled to have mean diagonal element 1
    """
#    snpdata.apply(pd.to_numeric, errors='coerce')
    p, n = snpdata.shape
    print(p,'  ',n)
    K = np.zeros((n, n))
    for i in range(p):
#        print(snpdata.iloc[i,:])
        gt = snpdata.iloc[i,:].apply(lambda x: float(x))
        var = np.var(gt)
        #print(var)
        if var == 0: continue
        m = np.mean(gt)
        x = np.reshape((np.array(gt)-m).transpose(), (n, 1))
        gt_m = 1/(var*p)*x.dot(x.transpose())
        K = K + gt_m
#    print(K )
    # Make sure diagonal had mean 1 (should anyway)
    diag_mean = np.mean(np.diagonal(K))
#    print(np.diagonal(K))
    K = K/diag_mean
    return K

def WriteGCTAPhenotypeFile(locus, exprfile):
    """
    Write the phenotype file 
    """
    n=len(locus)
    f = open(exprfile, "w")
    for i in range(n):
        f.write(' '.join([str(i),str(i),str(locus[i]),'\n']))
    f.close()

def WriteGCTAGRM(K, grmfile, p):
    """
    Calculate GRM and output file in GCTA format (.gz)
    Need to write:
    $grmfile.grm.gz: ind1, ind2, num nonmissing SNPs, relatedness (space-separated) (indices start at 1, rows into $grmfile.ind)
      only includes lower triangle of GRM
    $grmfile.ind: family ID, individual ID (space-separated)
    """
    n = K.shape[0]
    # Calculate GRM
    f = gzip.open("%s.grm.gz"%grmfile, "wb")
    for i in range(n):
        for j in range(i, n):
            val = K[i, j]
#it was     f.write(" ".join(map(str, [j+1, i+1, p, val]))+"\n")
            f.write(bytes(" ".join(map(str, [j+1, i+1, p, val]))+"\n" , 'UTF-8'))
    f.close()
    # Write ind file
    f = open("%s.grm.id"%grmfile, "w")
    for i in range(n):
        f.write(" ".join(map(str, [i, i]))+"\n")
    f.close()

def ParseGCTAResults(gctafile, include_str):
    """
    return cis_snp_h2, cis_snp_h2_se, cis_str_h2, cis_str_h2_se, logL
    """
    f = open(gctafile, "r")
    lines = f.readlines()
    cis_str_h2 = None
    cis_str_h2_se = None
    for line in lines:
        items = line.strip().split()
        if len(items) < 1: continue
        if include_str == "NO" or include_str == "FE" or include_str == "SAMPLES":
            if items[0] == "V(G)":
                cis_snp_h2 = items[1]
                cis_snp_h2_se = items[2]
        else:
            if items[0] == "V(G1)":
                cis_snp_h2 = items[1]
                cis_snp_h2_se = items[2]
            if items[0] == "V(G2)":
                cis_str_h2 = items[1]
                cis_str_h2_se = items[2]
        if items[0] == "logL":
            logL = items[1]
        if include_str == "FE":
            if items[0] == "Fix_eff":
                ind = lines.index(line)+2
                items = lines[ind].strip().split()
                cis_str_h2 = float(items[0])**2
                cis_str_h2_se = items[1]
        if items[0] == "Pval":
            Pval = items[1]
    if include_str == "FE":
        return cis_snp_h2, cis_snp_h2_se, cis_str_h2, cis_str_h2_se, logL, Pval
    else:
        return cis_snp_h2, cis_snp_h2_se, cis_str_h2, cis_str_h2_se, logL, 'N/A'

def GetPermutedLocusSTRs(locus_str):
    """
    Return locus_str dataframe with STR genotypes permuted
    """
    gts = list(locus_str.iloc[:,0])
    random.shuffle(gts)
    locus_str_perm = pd.DataFrame({locus_str.columns[0]: gts})
    locus_str_perm.index = locus_str.index
    return locus_str_perm

def z(vals):
    vals = list(map(float, list(vals)))
    m = np.mean(vals)
    s = math.sqrt(np.var(vals))
    return [(item-m)*1.0/s for item in vals]

def ZNorm(locus_vars):
    """
    Znormalize variants
    """
    columns = locus_vars.columns
    for c in columns:
        locus_vars[c] = z(locus_vars[c])
    return locus_vars


def WriteGCTACovarFile(locus, strcovarfile):
    """ Write GCTA covariable file using normalized genotype
    """
    f = open(strcovarfile, "w")
    n=locus.shape[0]
    for i in range(n):
        N_geno = list(locus.iloc[i,:].values)
        f.write(" ".join([str(i), str(i)]+[str(m) for m in N_geno])+"\n")
    f.close()

In [44]:
T = ['WholeBlood','Cells-Transformedfibroblasts','Muscle-Skeletal','Lung','Adipose-Subcutaneous','Artery-Tibial','Esophagus-Mucosa']
BASEDIR = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"
SNPS = "/storage/szfeupe/Runs/650GTEx_estr/SNP_Analysis/"
STRS = "/storage/szfeupe/Runs/650GTEx_estr/Genotypes/STR_Norm_lized_Geno.table"
GENO = pd.read_csv(STRS, sep='\t',low_memory=False)
#Gene_table = pd.read_csv('/storage/szfeupe/Runs/GTEx_estr/FEATURES/Genes_only_table', sep='\t')

tissue = T[4]
EXPRFILE = BASEDIR+tissue+"/Corr_Expr.csv"
EXPRANNOTFILE = '/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
CHROM = "chr1"
REML_NO_CONSTRAIN=True
DISTFROMGENE = 10000
STRGTFILE = STRS
SNPGTFILE = "/storage/szfeupe/Runs/650GTEx_estr/SNP_Analysis/chr1.tab"
OUTFILE = 'here'
SNPMAF = 0.05
LMM_METHOD = "GCTA"
INCLUDE_STR = "FE"
ESTR_RESULTS_FILE = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"+tissue+"/PQValues"
ESTR_GENES_ONLY = 0.1
UNLINKED_CTRL=False

In [45]:
# Load expression and annotation
expr = pd.read_csv(EXPRFILE,sep=',' ,low_memory=False) 
expr_annot = pd.read_csv(EXPRANNOTFILE,low_memory=False )
expr_annot.index = expr_annot["probe.id"].values
# Load SNP genotypes
snpgt = pd.read_csv(SNPGTFILE, sep="\t",  low_memory=False)
# Load STR genotypes
strgt = pd.read_csv(STRGTFILE, sep="\t",low_memory=False)
strgt = strgt.loc[strgt['chrom']==CHROM]
# Restrict to STR samples
str_samples = list(set(strgt.columns[2:].values).intersection(set(snpgt.columns[2:].values)))
expr = expr.loc[str_samples,:]
snpgt = snpgt[["chrom","start"] + str_samples]
strgt = strgt[["chrom","start"] + str_samples]
# Load STR results
estr_results = pd.read_csv(ESTR_RESULTS_FILE, sep='\t')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [46]:
UNLINKED_CTRL=False
REML_NO_CONSTRAIN=False
ESTR_GENES_ONLY = 1
genelist = ['ENSG00000159445.8']
expr_annot = expr_annot.loc[genelist]
estr_results = estr_results.loc[estr_results['gene'].isin(genelist)]
expr = expr[genelist]

# For each gene, pull out data and perform specified method
for i in range(expr_annot.shape[0]):
    gene = expr_annot.index.values[i]
    ensgene = expr_annot["gene.id"].values[i]
    print("Getting data for %s"%gene)
    genedir ='/home/szfeupe/projects/GTEX_eSTRs/'
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]
# Pull out STRs
    samples_to_keep = str_samples
    best_str_start = None
    if INCLUDE_STR != "NO":
        try:
            if UNLINKED_CTRL:
                print('unlinked')
                possible_starts = list(strgt[(strgt["start"] >= (start-DISTFROMGENE)) & (strgt["start"] <= (end+DISTFROMGENE))].start)
                best_str_start = random.sample(possible_starts, 1)[0]
            else:
                print('Linked')
# make sure to match on Ensembl gene (gene is ILMN if using array)
                best_str_start = estr_results[estr_results["gene"]==ensgene].sort_values("p.wald")["str.start"].values[0]
        except:
            print("[%s]\tERROR: couldn't find STR LMM results"%gene)
            continue
        try:
            cis_strs = strgt[(strgt["start"] >= (start-DISTFROMGENE)) & (strgt["start"] <= (end+DISTFROMGENE))]
            locus_str = cis_strs[samples_to_keep].transpose()
            locus_str = strgt[(strgt["start"] == best_str_start)].iloc[[0],:][str_samples].transpose()
        except:
            print("[%s]\tERROR: couldn't find STR genotypes for position %s"%(gene, best_str_start))
            continue
        locus_str.index = str_samples
        locus_str.columns = ["STR_%s"%best_str_start]
        ###So far we only choose the best str as fixed effect.But we should consider all cis STRs 
        samples_to_keep = [str_samples[k] for k in range(len(str_samples)) if str(locus_str.iloc[:,0].values[k]) != "None"]
        locus_str = locus_str.loc[samples_to_keep,:]
# Make sure STRs are normalized
        try:
            locus_str = ZNorm(locus_str)
        except:
            print("[%s]\tERROR: couldn't Z normalize STR genotypes"%(gene))
            continue
    # Pull out SNPs
    cis_snps = snpgt[(snpgt["start"] >= (start-DISTFROMGENE)) & (snpgt["start"] <= (end+DISTFROMGENE))]
    locus_snp = cis_snps[samples_to_keep].transpose()
    locus_snp.index = samples_to_keep
    locus_snp = locus_snp.dropna(axis=1, how='any')
    locus_snp.columns = cis_snps["start"].apply(lambda x: "SNP_%s"%x)
    locus_snp_maf = locus_snp.apply(lambda x: GetMAF(x), 0)
    print(locus_snp.columns, ' cis SNPs')
#    
#    if len(locus_snp_maf) == 0:
#        continue    
#    
#    locus_snp = locus_snp.loc[:,[i for i in range(len(locus_snp_maf)) if locus_snp_maf[i]>=SNPMAF]]
    if locus_snp.shape[1] == 0:
        print("[%s]\tERROR: no common SNPs in region"%gene)
        continue
# Get expression
    y = pd.DataFrame({"expr":list(expr.loc[:,gene])})
    y.index = str_samples
    locus_y = y.loc[samples_to_keep,["expr"]]
# Z normalize
    locus_y = (locus_y - np.mean(locus_y))/math.sqrt(np.var(locus_y))
# Make SNP GRM
    locus_snp=locus_snp.apply(pd.to_numeric, errors='coerce')
    locus_snp = locus_snp.dropna(axis=1, how='any')
    K = GRM(locus_snp.transpose())
    if str(np.mean(K)) == "nan":
        print("[%s]\tERROR: nans in GRM"%gene)

# Write GRM
    if LMM_METHOD == "GCTA":
        exprfile = os.path.join(genedir, "expr.pheno")
        mgrmfile = os.path.join(genedir, "mgrm.txt")
        snpgrmfile = os.path.join(genedir, "snp.grm.txt")
        if REML_NO_CONSTRAIN:
            reml_command = "--reml-no-constrain"
        else: reml_command = "--reml"
        gcta_cmd = "/storage/resources/source/gcta64 %s --mgrm-gz %s --pheno %s --out %s/gcta "%(reml_command, mgrmfile, exprfile, genedir)
        g = open(mgrmfile, "w")
        g.write(snpgrmfile+"\n")
        WriteGCTAGRM(K, snpgrmfile, p=locus_snp.shape[1])
        if INCLUDE_STR == "FE": # --qcovar
            strcovarfile = os.path.join(genedir, "str.qcovar")
            WriteGCTACovarFile(locus_str, strcovarfile)
            gcta_cmd += " --qcovar %s --reml-est-fix"%strcovarfile
        if INCLUDE_STR == "RE":
            K_str = GRM(locus_str.transpose())
            strgrmfile = os.path.join(genedir, "str.grm.txt")
            g.write(strgrmfile+"\n")
            WriteGCTAGRM(K_str, strgrmfile, p=locus_str.shape[1])
        g.close()
        locus_y["expr"].fillna("NA", inplace=True)
        WriteGCTAPhenotypeFile(locus_y["expr"].values, exprfile)
        gcta_cmd += " > /dev/null 2>&1"
        os.system(gcta_cmd)
# Parse results
        gctafile = os.path.join(genedir, "gcta.hsq")
        if not os.path.exists(gctafile):
            print("[%s]\tERROR: GCTA could not analyze this gene"%gene)
            continue
        cis_snp_h2, cis_snp_h2_se, cis_str_h2, cis_str_h2_se, logL, pval = ParseGCTAResults(gctafile, INCLUDE_STR)
        # Output results
        if INCLUDE_STR == "NO" or INCLUDE_STR == "SAMPLES":
            word="\t".join(map(str, [CHROM, gene, locus_snp.shape[1], cis_snp_h2, cis_snp_h2_se, logL, len(samples_to_keep)]))
            print(word)
            
        else:
            print( "\t".join(map(str, [CHROM, gene, best_str_start, locus_snp.shape[1], cis_snp_h2, cis_snp_h2_se,\
                                               cis_str_h2, cis_str_h2_se, logL, len(samples_to_keep), pval]))+"\n")  #Figure out this , len(cis_str_h2_null)       
        break
  

Getting data for ENSG00000159445.8
Linked


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.


Index(['SNP_151836675', 'SNP_151836824', 'SNP_151837283', 'SNP_151838066',
       'SNP_151838092', 'SNP_151838723', 'SNP_151839838', 'SNP_151840085',
       'SNP_151841165', 'SNP_151841424', 'SNP_151841525', 'SNP_151841770',
       'SNP_151842485', 'SNP_151842679', 'SNP_151844396', 'SNP_151844650',
       'SNP_151845569', 'SNP_151845650', 'SNP_151845713', 'SNP_151846059',
       'SNP_151846517', 'SNP_151846719', 'SNP_151847126', 'SNP_151847180',
       'SNP_151847193', 'SNP_151847870', 'SNP_151848126', 'SNP_151850072',
       'SNP_151851456', 'SNP_151852017', 'SNP_151853278', 'SNP_151853754',
       'SNP_151854076', 'SNP_151855156', 'SNP_151856953', 'SNP_151857431',
       'SNP_151858888', 'SNP_151858926', 'SNP_151859050', 'SNP_151859153',
       'SNP_151859166', 'SNP_151860063', 'SNP_151861477', 'SNP_151862296',
       'SNP_151862661', 'SNP_151863001', 'SNP_151864267', 'SNP_151865234',
       'SNP_151865439', 'SNP_151865780', 'SNP_151867560', 'SNP_151868098',
       'SNP_151868892', '

In [39]:
#cis_snps.loc[cis_snps['start']==43766426]
#estr_results[estr_results["gene"]==ensgene].sort_values("p.wald")
locus_snp.shape

(617, 29)