In [None]:
Include the intercept to the model because
expression data, remove the s,d 
expression should be float
beta in linear compared to beta in quad


In [7]:
import math
import numpy as np
import os
from collections import Counter
import pandas as pd
import random
import shutil
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
import sys


def Regression(data,norm=True, minsamples=120, linear=True):
    """
    Perform non linear regression, return beta, p_beta, alpha, p_alpha, se_alpha, se_beta
    data.columns has to have [allele1='x1', allele2='x2', gene_expression='expr'] not normalized
    """
    if data.shape[0] <= minsamples: 
        return None, None, None, None, None, None
    m = np.mean(list(locus_str["x1"].astype(int))+list(locus_str["x2"].astype(int))) 
    sd= math.sqrt(np.var(locus_str["x1"].astype(int) + locus_str["x2"].astype(int) ))
    if norm:
        data['x1'] = ZNorm(data['x1'].astype(int), sd, m)
        data['x2'] = ZNorm(data['x2'].astype(int), sd, m)
        E=data['expr'].astype(float)
        data['expr'] = ZNorm(E, np.mean(E), math.sqrt(np.var(E)))
        if data['x1'].isnull().all()  or data['x2'].isnull().all() or data['expr'].isnull().all(): 
            return None, None, None, None, None, None  
    data['X']=data['x1']+data['x2']
    data['X2']=data['x1']**2 + data['x2']**2
    model = sm.ols(formula = 'expr ~ X + X2', data = data).fit()
    #model = sm.OLS(Locus_detail[['expr']],Locus_detail[['X','X2']] , missing='drop').fit()
    #print(data.head(3), '\n..(1)..\n',data.head(3))
    mod_ols = sm.ols(formula = 'expr ~ X + X2', data = data).fit()
    alpha = list(mod_ols.params)[1]
    beta = list(mod_ols.params)[2]
    alpha_p = list(mod_ols.pvalues)[1]
    beta_p  = list(mod_ols.pvalues)[2]
    alpha_se=list(mod_ols.bse)[1]
    beta_se=list(mod_ols.bse)[2]
    quad_rsq = mod_ols.rsquared
    if linear: 
        #print(data.head(3), '\n..(2)..\n',data.head(3))
        X=data['X'].astype(int).values
        mod_ols1 = sm.OLS(data['expr'], X, missing='drop').fit()
        pval = mod_ols1.pvalues[0]
        slope = mod_ols1.params[0]
        err = mod_ols1.bse[0]
        lin_rsq = mod_ols1.rsquared
        anova_results = anova_lm(mod_ols, mod_ols1)
        anova_pval = anova_results["Pr(>F)"].values[1]
        delta_aic = mod_ols.aic - mod_ols1.aic
        
        return alpha, alpha_se, alpha_p, beta, beta_se, beta_p, slope, err, pval,quad_rsq, lin_rsq, delta_aic, anova_pval
    else:
        return alpha, alpha_se, alpha_p, beta, beta_se, beta_p
    
    


def ZNorm(vals,m=None,sd=None):
    if m is None:
        m = np.mean(vals)
        sd = math.sqrt(np.var(vals))
    if sd == 0: return None
    return [(item-m)/sd for item in vals]

def LinearRegression(data, Y, norm=False, minsamples=0, alleles=False):
    """
    Perform linear regression, return beta, beta_se, p
    """

    if norm:
        if alleles:
            data['x1']=data['x1'].astype(float) ; data['x2']=data['x2'].astype(float)
            data['x1+x2'] = data[['x1', 'x2']].sum(axis=1)
            X = ZNorm(data['x1+x2'], None, None)
        else:
            X = ZNorm(data) 
            Y = ZNorm(Y, None, None)
        if X is None or Y is None: return None, None, None
        if np.var(X)==0: return None, None, None
        if len(X) <= minsamples: return None, None, None
    else:
        X = data
    mod_ols = sm.OLS(Y, X, missing='drop')
    res_ols = mod_ols.fit()
    #print(res_ols.pvalues)
    pval = res_ols.pvalues[0]
    #print 'P-value: ', pval 
    slope = res_ols.params[0]
    err = res_ols.bse[0]
    return res_ols, slope, err, pval

def QuadraticRegression(data,norm=True, minsamples=120):
    """
    Perform non linear regression, return beta, p_beta, alpha, p_alpha, se_alpha, se_beta
    data.columns has to have [allele1='x1', allele2='x2', gene_expression='expr'] not normalized
    """
    if data.shape[0] <= minsamples: 
        return None, None, None, None, None, None, None
    m = np.mean(list(locus_str["x1"].astype(int))+list(locus_str["x2"].astype(int))) 
    sd= math.sqrt(np.var(locus_str["x1"].astype(int) + locus_str["x2"].astype(int) ))
    if norm:
        data.loc[:,'x1'] =ZNorm(data['x1'].astype(int), sd, m)
        data.loc[:,'x2'] =ZNorm(data['x2'].astype(int), sd, m)       
        data.loc[:,'expr'] =ZNorm(data['expr'].astype(int))
        if data['x1'].isnull().all()  or data['x2'].isnull().all() or data['expr'].isnull().all(): 
            return None, None, None, None, None, None, None  
        
    data['X']=data['x1']+data['x2']
    data['X2']=data['x1']**2 + data['x2']**2
    
    mod_ols = sm.OLS('expr ,X ,X2', data = data).fit()
    #mod_ols = smf.ols(formula = 'expr ~ X + X2', data = data).fit()
    print(mod_ols.summary())
    alpha = list(mod_ols.params)[1]
    beta = list(mod_ols.params)[2]
    alpha_p = list(mod_ols.pvalues)[1]
    beta_p  = list(mod_ols.pvalues)[2]
    alpha_se=list(mod_ols.bse)[1]
    beta_se=list(mod_ols.bse)[2]
    return mod_ols, alpha, alpha_se, alpha_p, beta, beta_se, beta_p

def Modelcompare(model1, model2):
    """
    Performs ANOVA test to compares 2 models 
    Outputs rsquares , delta AIC (aic_model1- aic_model2) and pvalue
    """
    rsq1 = model1.rsquared
    rsq2 = model2.rsquared
    anova_output = sm.stats.anova_lm(model1 , model2)
    #print(anova_output, anova_output["Pr(>F)"])
    anova_pval = anova_output["Pr(>F)"].values[1]
    delta_aic = model1.aic - model2.aic  
    return rsq1, rsq2, delta_aic, anova_pval

STRGTFILE= '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/Allele_Gentypes.table'
EXPRESSION='/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/Corr_Expr.csv'
EXPRANNOT='/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
distfromgene = 100000
CHROM='chr1'
NORM=True

In [8]:
# Load STR genotypes
print("Load STRs")
strgt = pd.read_csv(STRGTFILE, sep="\t", low_memory=False)
strgt = strgt[strgt["chrom"] == CHROM]
print("Load expression")
expr = pd.read_csv(EXPRESSION)
print("Load annotation")
expr_annot = pd.read_csv(EXPRANNOT)
expr_annot.index = expr_annot["probe.id"].values
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
expr_annot = expr_annot.loc[[item for item in expr.columns if item in expr_annot.index],:]
# Restrict to STR samples
str_samples = list(set(strgt.columns[2:].values))
samples_to_remove = []
for item in str_samples:
    if item not in expr.index: samples_to_remove.append(item) #str_samples.remove(item)
for item in samples_to_remove: str_samples.remove(item)
expr = expr.loc[str_samples,:]
strgt = strgt[['chrom','start']+str_samples]
print("There are %s samples"%str(strgt.shape))


Load STRs
Load expression
Load annotation
There are (3051, 272) samples


In [9]:
expr_annot = expr_annot.loc[expr_annot.index == 'ENSG00000158109.10']
NORM=True; ALLELE=True
MINSAMPLES=150
#for i in range(150,expr_annot.shape[0],1):
for i in range(0,1,1):
    gene = expr_annot.index.values[i]
    print(" Getting data for %s"%gene)
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]
    cis_strs = strgt[(strgt["start"] >= (start-distfromgene)) & (strgt["start"] <= (end+distfromgene))]
    print(gene, '\t',start,'\t',end,'\t',"%s STRs tested \n"%str(cis_strs.shape[0]))
    y = pd.DataFrame({"expr":list(expr.loc[:, gene])})
    y.index = str_samples
    
    for j in range(cis_strs.shape[0]):
        locus_str = cis_strs.iloc[[j],:][str_samples].transpose()
        locus_str.index = str_samples
        locus_str.columns = ["STR_%s"%(cis_strs["start"].values[j])]
        test_str=locus_str.columns[0]
        str_start = cis_strs["start"].values[j]
        locus_str['x1'] = locus_str[test_str].apply(lambda x: x.split(',')[0] )
        locus_str['x2'] = locus_str[test_str].apply(lambda x: x.split(',')[1] )
        samples_to_keep = [str_samples[k] for k in range(len(str_samples)) if (str(locus_str['x1'].values[k]) != "NA")and(str(locus_str['x2'].values[k]) != "NA")]   
        locus_str = locus_str.loc[samples_to_keep,:]        
        locus_y = y.loc[samples_to_keep,:]            
        Locus_data = locus_str.join(locus_y)
        locus_str = locus_str.join(locus_y)[['expr','x1','x2']]
        #print(locus_str.head(3), '\n...\n',Locus_detail.head(3))
        # Run regression

        data = Locus_data[['expr','x1','x2']]
        quad_model, alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval=QuadraticRegression(data,norm=NORM, minsamples=120)
        data = Locus_data[['x1','x2']]
#        lin_model, slope, err, pval = LinearRegression(data, locus_y["expr"].values, norm=NORM, minsamples=MINSAMPLES, alleles=ALLELE)
#        quad_rsq, lin_rsq, delta_aic, anova_pval = Modelcompare(lin_model , quad_model)
#        print("\t".join(map(str, [CHROM, gene, test_str, str_start,alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval, slope, err, pval,quad_rsq, lin_rsq, delta_aic, anova_pval] ) )+"\n")                
        print("\t".join(map(str, [CHROM, gene, str_start,alpha, alpha_se, alpha_pval, beta, beta_se, beta_pval] ) )+"\n")                
        
    break
#Locus_detail 

 Getting data for ENSG00000158109.10
ENSG00000158109.10 	 3541566 	 3546691 	 3 STRs tested 

chr1	ENSG00000158109.10	3501711	None	None	None	None	None	None



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>

In [44]:
data = Locus_data[['expr','x1','x2']]
data
QuadraticRegression(data,norm=NORM, minsamples=120)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


(None, None, None, None, None, None, None)

In [5]:
expr_annot
#expr_annot.loc[expr_annot.index =='ENSG00000143727.11']

Unnamed: 0,gene.chr,gene.start,gene.stop,gene.id,gene.strand,probe.chr,probe.start,probe.stop,probe.id,probe.strand
ENSG00000184731.5,chr2,38814,46870,ENSG00000184731.5,-,chr2,38814,46870,ENSG00000184731.5,-
ENSG00000035115.17,chr2,217730,266398,ENSG00000035115.17,-,chr2,217730,266398,ENSG00000035115.17,-
ENSG00000143727.11,chr2,264393,278283,ENSG00000143727.11,+,chr2,264393,278283,ENSG00000143727.11,+
ENSG00000189292.11,chr2,279558,288851,ENSG00000189292.11,-,chr2,279558,288851,ENSG00000189292.11,-
ENSG00000151353.10,chr2,667335,676704,ENSG00000151353.10,-,chr2,667335,676704,ENSG00000151353.10,-
ENSG00000172554.7,chr2,946554,1371385,ENSG00000172554.7,+,chr2,946554,1371385,ENSG00000172554.7,+
ENSG00000115705.16,chr2,1377995,1547483,ENSG00000115705.16,+,chr2,1377995,1547483,ENSG00000115705.16,+
ENSG00000130508.6,chr2,1635659,1748624,ENSG00000130508.6,-,chr2,1635659,1748624,ENSG00000130508.6,-
ENSG00000186487.13,chr2,1792885,2335032,ENSG00000186487.13,-,chr2,1792885,2335032,ENSG00000186487.13,-
ENSG00000032389.8,chr2,3192696,3381653,ENSG00000032389.8,-,chr2,3192696,3381653,ENSG00000032389.8,-
