In [1]:
import math
import numpy as np
import os
from collections import Counter
import pandas as pd
import random
import shutil
import statsmodels.formula.api as sm
import sys


def nonlinearRegression(data, norm=True, minsamples=120):
    """
    Perform non linear regression, return beta, p_beta, alpha, p_alpha, se_alpha, se_beta
    data.columns has to have [allele1='x1', allele2='x2', gene_expression='expr'] not normalized
    """
    if data.shape[0] <= minsamples: 
        return None, None, None, None, None, None
    if norm:
        data['x1'] = ZNorm(data['x1'].astype(int))
        data['x2'] = ZNorm(data['x2'].astype(int))
        data['expr'] = ZNorm(data['expr'].astype(int))
        if data['x1'].isnull().all()  or data['x2'].isnull().all() or data['expr'].isnull().all(): 
            return None, None, None, None, None, None  
    data['X']=data['x1']+data['x2']
    data['X2']=data['x1']**2 + data['x2']**2
    model = sm.ols(formula = 'expr ~ X + X2', data = data).fit()
    #model = sm.OLS(Locus_detail[['expr']],Locus_detail[['X','X2']] , missing='drop').fit()
    mod_ols = sm.ols(formula = 'expr ~ X + X2', data = Locus_detail).fit()
    alpha = list(mod_ols.params)[1]
    beta = list(mod_ols.params)[2]
    alpha_p = list(mod_ols.pvalues)[1]
    beta_p  = list(mod_ols.pvalues)[2]
    alpha_se=list(mod_ols.bse)[1]
    beta_se=list(mod_ols.bse)[2]
    return alpha, alpha_se, alpha_p, beta, beta_se, beta_p


def ZNorm(vals):
    m = np.mean(vals)
    sd = math.sqrt(np.var(vals))
#    print m, '  ', sd
    if sd == 0: return None
    return [(item-m)/sd for item in vals]

STRGTFILE= '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/Allele_Gentypes.table'
EXPRESSION='/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/Corr_Expr.csv'
EXPRANNOT='/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
distfromgene = 100000
CHROM='chr22'
NORM=True

In [2]:
# Load STR genotypes
print("Load STRs")
strgt = pd.read_csv(STRGTFILE, sep="\t", low_memory=False)
strgt = strgt[strgt["chrom"] == CHROM]
print("Load expression")
expr = pd.read_csv(EXPRESSION)
print("Load annotation")
expr_annot = pd.read_csv(EXPRANNOT)
expr_annot.index = expr_annot["probe.id"].values
expr_annot = expr_annot.loc[[item for item in expr.columns if item in expr_annot.index],:]
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
# Restrict to STR samples
str_samples = list(set(strgt.columns[2:].values))
samples_to_remove = []
for item in str_samples:
    if item not in expr.index: samples_to_remove.append(item) #str_samples.remove(item)
for item in samples_to_remove: str_samples.remove(item)
expr = expr.loc[str_samples,:]
strgt = strgt[['chrom','start']+str_samples]
print("There are %s samples"%str(strgt.shape))


Load STRs
Load expression
Load annotation
There are (2249, 272) samples


In [61]:
NORM=True
MINSAMPLES=150
for i in range(expr_annot.shape[0]):
    gene = expr_annot.index.values[i]
    print(" Getting data for %s"%gene)
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]
    cis_strs = strgt[(strgt["start"] >= (start-distfromgene)) & (strgt["start"] <= (end+distfromgene))]
    print(gene, '\t',start,'\t',end,'\t',"%s STRs tested \n"%str(cis_strs.shape[0]))
    y = pd.DataFrame({"expr":list(expr.loc[:, gene])})
    y.index = str_samples
    
    for j in range(cis_strs.shape[0]):
        locus_str = cis_strs.iloc[[j],:][str_samples].transpose()
        locus_str.index = str_samples
        locus_str.columns = ["STR_%s"%(cis_strs["start"].values[j])]
        test_str=locus_str.columns[0]
        locus_str['x1'] = locus_str[test_str].apply(lambda x: x.split(',')[0] )
        locus_str['x2'] = locus_str[test_str].apply(lambda x: x.split(',')[1] )
        samples_to_keep = [str_samples[k] for k in range(len(str_samples)) if (str(locus_str['x1'].values[k]) != "NA")and(str(locus_str['x2'].values[k]) != "NA")]   
        locus_str = locus_str.loc[samples_to_keep,:]
        
        locus_y = y.loc[samples_to_keep,:]
        Locus_detail = locus_str.join(locus_y)[['expr','x1','x2']]
        # Run regression
        alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval = nonlinearRegression(Locus_detail, norm=NORM, minsamples=MINSAMPLES)
        print(' '.join([CHROM, gene, test_str, str(alpha), str(alpha_se), str(alpha_pval), str(beta), str(beta_se), str(beta_pval)]) ) 
    break
#Locus_detail 

 Getting data for ENSG00000177663.9
ENSG00000177663.9 	 17565844 	 17596583 	 18 STRs tested 

chr22 ENSG00000177663.9 STR_17520209 -0.08777685638099877 0.052439470380822265 0.09551549171104882 0.004348578266959254 0.008137920492345154 0.5936087033955663
chr22 ENSG00000177663.9 STR_17548440 0.2555310837450853 0.2940432406431531 0.38561954963639533 0.10168459399217601 0.11997933526055324 0.3974714919591664
chr22 ENSG00000177663.9 STR_17563147 0.017119300110312412 0.042549075420629044 0.6877563458206708 0.0003800788160701652 0.012910697374990893 0.9765365780885986
2
chr22 ENSG00000177663.9 STR_17563928 None None None None None None
chr22 ENSG00000177663.9 STR_17567263 -0.03651590175096972 0.049154985069990664 0.45832676950953466 0.004229593477949099 0.010299317211768022 0.6817038329848328
chr22 ENSG00000177663.9 STR_17572568 0.009673598994085732 0.055275653955245876 0.8612369549110197 0.0003577837856613477 0.005183411527002665 0.9450331746588895
chr22 ENSG00000177663.9 STR_17596478 0.029