In [22]:
import math
import numpy as np
import os
from collections import Counter
import pandas as pd
import random
import shutil
import statsmodels.formula.api as sm
from statsmodels.stats.anova import anova_lm
import sys


def Regression(data,norm=True, minsamples=120, linear=True):
    """
    Perform non linear regression, return beta, p_beta, alpha, p_alpha, se_alpha, se_beta
    data.columns has to have [allele1='x1', allele2='x2', gene_expression='expr'] not normalized
    """
    if data.shape[0] <= minsamples: 
        return None, None, None, None, None, None
    m = np.mean(list(locus_str["x1"].astype(int))+list(locus_str["x2"].astype(int))) 
    sd= math.sqrt(np.var(locus_str["x1"].astype(int) + locus_str["x2"].astype(int) ))
    if norm:
        data['x1'] = ZNorm(data['x1'].astype(int), sd, m)
        data['x2'] = ZNorm(data['x2'].astype(int), sd, m)
        E=data['expr'].astype(float)
        data['expr'] = ZNorm(E, np.mean(E), math.sqrt(np.var(E)))
        if data['x1'].isnull().all()  or data['x2'].isnull().all() or data['expr'].isnull().all(): 
            return None, None, None, None, None, None  
    data['X']=data['x1']+data['x2']
    data['X2']=data['x1']**2 + data['x2']**2
    model = sm.ols(formula = 'expr ~ X + X2', data = data).fit()
    #model = sm.OLS(Locus_detail[['expr']],Locus_detail[['X','X2']] , missing='drop').fit()
    #print(data.head(3), '\n..(1)..\n',data.head(3))
    mod_ols = sm.ols(formula = 'expr ~ X + X2', data = data).fit()
    alpha = list(mod_ols.params)[1]
    beta = list(mod_ols.params)[2]
    alpha_p = list(mod_ols.pvalues)[1]
    beta_p  = list(mod_ols.pvalues)[2]
    alpha_se=list(mod_ols.bse)[1]
    beta_se=list(mod_ols.bse)[2]
    quad_rsq = mod_ols.rsquared
    if linear: 
        #print(data.head(3), '\n..(2)..\n',data.head(3))
        X=data['X'].astype(int).values
        mod_ols1 = sm.OLS(data['expr'], X, missing='drop').fit()
        pval = mod_ols1.pvalues[0]
        slope = mod_ols1.params[0]
        err = mod_ols1.bse[0]
        lin_rsq = mod_ols1.rsquared
        anova_results = anova_lm(mod_ols, mod_ols1)
        anova_pval = anova_results["Pr(>F)"].values[1]
        delta_aic = mod_ols.aic - mod_ols1.aic
        
        return alpha, alpha_se, alpha_p, beta, beta_se, beta_p, slope, err, pval,quad_rsq, lin_rsq, delta_aic, anova_pval
    else:
        return alpha, alpha_se, alpha_p, beta, beta_se, beta_p
    
    


def ZNorm(vals, sd, m):
#    print m, '  ', sd
    if sd == 0: return None
    return [(item-m)/sd for item in vals]

STRGTFILE= '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/Allele_Gentypes.table'
EXPRESSION='/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/Corr_Expr.csv'
EXPRANNOT='/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
distfromgene = 100000
CHROM='chr22'
NORM=True

In [2]:
# Load STR genotypes
print("Load STRs")
strgt = pd.read_csv(STRGTFILE, sep="\t", low_memory=False)
strgt = strgt[strgt["chrom"] == CHROM]
print("Load expression")
expr = pd.read_csv(EXPRESSION)
print("Load annotation")
expr_annot = pd.read_csv(EXPRANNOT)
expr_annot.index = expr_annot["probe.id"].values
expr_annot = expr_annot.loc[[item for item in expr.columns if item in expr_annot.index],:]
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
# Restrict to STR samples
str_samples = list(set(strgt.columns[2:].values))
samples_to_remove = []
for item in str_samples:
    if item not in expr.index: samples_to_remove.append(item) #str_samples.remove(item)
for item in samples_to_remove: str_samples.remove(item)
expr = expr.loc[str_samples,:]
strgt = strgt[['chrom','start']+str_samples]
print("There are %s samples"%str(strgt.shape))


Load STRs
Load expression
Load annotation
There are (2249, 272) samples


In [23]:
NORM=True
MINSAMPLES=150
for i in range(expr_annot.shape[0]):
    gene = expr_annot.index.values[i]
    print(" Getting data for %s"%gene)
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]
    cis_strs = strgt[(strgt["start"] >= (start-distfromgene)) & (strgt["start"] <= (end+distfromgene))]
    print(gene, '\t',start,'\t',end,'\t',"%s STRs tested \n"%str(cis_strs.shape[0]))
    y = pd.DataFrame({"expr":list(expr.loc[:, gene])})
    y.index = str_samples
    
    for j in range(cis_strs.shape[0]):
        locus_str = cis_strs.iloc[[j],:][str_samples].transpose()
        locus_str.index = str_samples
        locus_str.columns = ["STR_%s"%(cis_strs["start"].values[j])]
        test_str=locus_str.columns[0]
        locus_str['x1'] = locus_str[test_str].apply(lambda x: x.split(',')[0] )
        locus_str['x2'] = locus_str[test_str].apply(lambda x: x.split(',')[1] )
        samples_to_keep = [str_samples[k] for k in range(len(str_samples)) if (str(locus_str['x1'].values[k]) != "NA")and(str(locus_str['x2'].values[k]) != "NA")]   
        locus_str = locus_str.loc[samples_to_keep,:]        
        locus_y = y.loc[samples_to_keep,:]            
        Locus_detail = locus_str.join(locus_y)
        locus_str = locus_str.join(locus_y)[['expr','x1','x2']]
        print(locus_str.head(3), '\n...\n',Locus_detail.head(3))
        # Run regression
        alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval = Regression(locus_str, norm=NORM, minsamples=MINSAMPLES, linear=False)
        print(' '.join([CHROM, gene, test_str, str(alpha), str(alpha_se), str(alpha_pval), str(beta), str(beta_se), str(beta_pval)]) ) 
        locus_str = Locus_detail[['expr','x1','x2']]
        alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval,slope, err, pval, quad_rsq, lin_rsq, delta_aic, anova_pval = Regression(locus_str, norm=NORM, minsamples=MINSAMPLES, linear=True)
        print(' '.join([CHROM, gene, test_str, str(alpha), str(alpha_se), str(alpha_pval), str(beta), str(beta_se), str(beta_pval),str(slope), str(err), str(pval), str(quad_rsq), str(lin_rsq), str(delta_aic), str(anova_pval)])+'\n' ) 
    #Anova    
               
    break
#Locus_detail 

 Getting data for ENSG00000177663.9
ENSG00000177663.9 	 17565844 	 17596583 	 18 STRs tested 

                expr  x1 x2
GTEX-146FR -0.192793   0  0
GTEX-145MN  0.414669  -1  1
GTEX-QEG4  -0.168807   0  0 
...
            STR_17520209  x1 x2      expr
GTEX-146FR          0,0   0  0 -0.192793
GTEX-145MN         -1,1  -1  1  0.414669
GTEX-QEG4           0,0   0  0 -0.168807
chr22 ENSG00000177663.9 STR_17520209 11.211791731741432 7.033065083342418 0.11227468764549406 -2.827394502812847 2.898397072380947 0.33033577796732827


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


chr22 ENSG00000177663.9 STR_17520209 11.211791731741432 7.033065083342418 0.11227468764549406 -2.827394502812847 2.898397072380947 0.33033577796732827 24.357351733587684 4.481666125370415 1.3846704979146895e-07 0.018218778024452598 0.11293962267712121 -139.5617166177799 nan

                expr x1  x2
GTEX-146FR -0.192793  0   0
GTEX-145MN  0.414669  0  -7
GTEX-QEG4  -0.168807  0   0 
...
            STR_17548440 x1  x2      expr
GTEX-146FR          0,0  0   0 -0.192793
GTEX-145MN         0,-7  0  -7  0.414669
GTEX-QEG4           0,0  0   0 -0.168807
chr22 ENSG00000177663.9 STR_17548440 46.37333119442749 4.734115714414624 1.5188417083375316e-19 32.62763384233208 3.6351452593639744 5.238351164832721e-17
chr22 ENSG00000177663.9 STR_17548440 46.37333119442749 4.734115714414624 1.5188417083375316e-19 32.62763384233208 3.6351452593639744 5.238351164832721e-17 -106.84684238729452 19.620992862514967 1.1715106884334487e-07 0.0024912081537364905 0.09996118636718132 -158.06793019461566 nan

   

In [12]:
locus_str

Unnamed: 0,expr,x1,x2,X,X2
GTEX-146FR,59.752253,-0.193758,-0.193758,-0.387516,0.075084
GTEX-145MN,0.322749,-2.849384,2.461868,-0.387516,14.179787
GTEX-QEG4,57.405650,-0.193758,-0.193758,-0.387516,0.075084
GTEX-ZV68,66.082340,-0.193758,2.461868,2.268110,6.098337
GTEX-12ZZY,57.110356,-0.193758,-0.193758,-0.387516,0.075084
GTEX-111VG,57.119746,-0.193758,-0.193758,-0.387516,0.075084
GTEX-Y3I4,57.110986,-0.193758,-0.193758,-0.387516,0.075084
GTEX-131YS,67.424779,-0.193758,-0.193758,-0.387516,0.075084
GTEX-13OVL,-6.835643,-0.193758,-0.193758,-0.387516,0.075084
GTEX-12WSM,60.826336,-0.193758,-0.193758,-0.387516,0.075084


In [13]:
Locus_detail

Unnamed: 0,STR_17520209,x1,x2,expr
GTEX-146FR,00,0,0,-0.192793
GTEX-145MN,-11,-1,1,0.414669
GTEX-QEG4,00,0,0,-0.168807
GTEX-ZV68,01,0,1,-0.257497
GTEX-12ZZY,00,0,0,-0.165789
GTEX-111VG,00,0,0,-0.165885
GTEX-Y3I4,00,0,0,-0.165795
GTEX-131YS,00,0,0,-0.271218
GTEX-13OVL,00,0,0,0.487839
GTEX-12WSM,00,0,0,-0.203772
