In [18]:
import math
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

def ZNorm(vals,m=None,sd=None):
    if m is None:
        m = np.mean(vals)
        sd = math.sqrt(np.var(vals))
    if sd == 0: return None
    return [(item-m)/sd for item in vals]


OUTDIR = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Lung/QuadraticReg/'
STRGTFILE= '/storage/szfeupe/Runs/650GTEx_estr/Genotypes/Allele_Gentypes.table'
EXPRESSION='/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Adipose-Subcutaneous/Corr_Expr.csv'
EXPRANNOT='/storage/resources/dbase/human/hg19/gencode_gene_annotations_hg19.csv'
distfromgene = 100000
CHROM='chr21'


In [19]:
# Load STR genotypes
print("Load STRs")
strgt = pd.read_csv(STRGTFILE, sep="\t", low_memory=False)
strgt = strgt[strgt["chrom"] == CHROM]
print("Load expression")
expr = pd.read_csv(EXPRESSION)
print("Load annotation")
expr_annot = pd.read_csv(EXPRANNOT)
expr_annot.index = expr_annot["probe.id"].values
expr_annot = expr_annot[expr_annot["gene.chr"] == CHROM]
expr_annot = expr_annot.loc[[item for item in expr.columns if item in expr_annot.index],:]
# Restrict to STR samples
str_samples = list(set(strgt.columns[2:].values))
samples_to_remove = []
for item in str_samples:
    if item not in expr.index: samples_to_remove.append(item) #str_samples.remove(item)
for item in samples_to_remove: str_samples.remove(item)
expr = expr.loc[str_samples,:]
strgt = strgt[['chrom','start']+str_samples]
print("There are %s samples"%str(strgt.shape))

Load STRs
Load expression
Load annotation
There are (2345, 272) samples


In [None]:
    #MODEL 1
        qmod_ols = sm.OLS('expr ,X ,X2', data = data).fit()
        print(qmod_ols.fvalue, qmod_ols.f_pvalue)
    #MODEL 2
        lmod_ols = sm.OLS('expr ,X', missing='drop').fit()
        print(lmod_ols...fvalue, lmod_ols.f_pvalue)
    #MODEL 3
        mod_ols = sm.OLS('expr, X2', missing='drop').fit
        print(mod_ols.fvalue, mod_ols.f_pvalue)
        
        
    #F-Test and ANOVA
        sm.stats.anova_lm(model1 , model2)
        formula = 'expr ~ X + X2'
        mod_ols = ols(formula, data).fit()
        hypo_tested = '(X2 = 0)'
        f_test = mod_ols.f_test(hypo_tested)
        print('>>>> Is beta = 0 ? **** ',f_test)
        test2 = np.identity(len(mod_ols.params))[1:,:]
        print('***are alpha and beta significatly !=0 ? **** ', mod_ols.f_test(test2))

In [None]:
#expr_annot = expr_annot.loc[expr_annot.index == 'ENSG00000158109.10']
List_Genes_for_Quad_test = []
NORM=True; ALLELE=True
MINSAMPLES=150
#for i in range(150,expr_annot.shape[0],1):
for i in range(0,expr_annot.shape[0],1):
    gene = expr_annot.index.values[i]
    print(" Getting data for %s"%gene)
    start = expr_annot["gene.start"].values[i]
    end = expr_annot["gene.stop"].values[i]
    cis_strs = strgt[(strgt["start"] >= (start-distfromgene)) & (strgt["start"] <= (end+distfromgene))]
    print(gene, '\t',start,'\t',end,'\t',"%s STRs tested \n"%str(cis_strs.shape[0]))
    y = pd.DataFrame({"expr":list(expr.loc[:, gene])})
    y.index = str_samples
    print('#****\tTesting %s STRs for this gene'%str(cis_strs.shape[0]))
    for j in range(cis_strs.shape[0]):
        locus_str = cis_strs.iloc[[j],:][str_samples].transpose()
        locus_str.index = str_samples
        locus_str.columns = ["STR_%s"%(cis_strs["start"].values[j])]
        test_str=locus_str.columns[0]
        str_start = cis_strs["start"].values[j]
        locus_str['x1'] = locus_str[test_str].apply(lambda x: x.split(',')[0] )
        locus_str['x2'] = locus_str[test_str].apply(lambda x: x.split(',')[1] )
        samples_to_keep = [str_samples[k] for k in range(len(str_samples)) if (str(locus_str['x1'].values[k]) != "NA")and(str(locus_str['x2'].values[k]) != "NA")]   
        locus_str = locus_str.loc[samples_to_keep,:]        
        locus_y = y.loc[samples_to_keep,:]            
        Locus_data = locus_str.join(locus_y)
        locus_str = locus_str.join(locus_y)[['expr','x1','x2']]
        #print(locus_str.head(3), '\n...\n',Locus_detail.head(3))

    # Prepare table for regression
        data = Locus_data[['expr','x1','x2']]
        m = np.mean(list(locus_str["x1"].astype(int))+list(locus_str["x2"].astype(int))) 
        sd= math.sqrt(np.var(locus_str["x1"].astype(int) + locus_str["x2"].astype(int) ))
        data.loc[:,'x1'] =ZNorm(data['x1'].astype(int), sd, m)
        data.loc[:,'x2'] =ZNorm(data['x2'].astype(int), sd, m)       
        data.loc[:,'expr'] =ZNorm(data['expr'].astype(int))
        if data['x1'].isnull().all()  or data['x2'].isnull().all() or data['expr'].isnull().all():
            print ('One or more values are NULL! skipping STR...')
            continue
        else:
            n=1
        data['X']=data['x1']+data['x2']                          #alpha(x1+x2)
        data['X2']=data['x1']**2 + data['x2']**2                 #beta(sq(x1)+sq(x2))
    #REGRESSION    
    ###Each time we test if each coefficient is statistically different from 0
    #Decide of model
    #MODEL 1
        formula = 'expr ~ X + X2'
        qmod_ols = ols(formula, data).fit()
        hypo_tested = '(X2 = 0)'
        f_test = qmod_ols.f_test(hypo_tested)
        #print('>>>> Is beta = 0 ? **** ',qmod_ols.f_pvalue, '\t', f_test.pvalue) 
        
    #Results
        F = f_test.fvalue
        p_value = f_test.pvalue
        if p_value <=0.05:
    #MODEL 2
            lmod_ols = ols('expr ~ X', data=data, missing='drop').fit()
    #Which one is better?
            Anova_results = anova_lm(lmod_ols , qmod_ols)
            print('Adding X2 improve expression prediction model ? ****\n', Anova_results)
            if Anova_results["Pr(>F)"].values[1] <=0.05:
                test2 = np.identity(len(mod_ols.params))[1:,:]
                print('are alpha and beta significatly !=0 ? **** ', mod_ols.f_test(test2))
                List_Genes_for_Quad_test.append(gene+','+CHROM+','+str(str_start)+str(F[0][0])+','+str(str_start)+','+str(Anova_results["Pr(>F)"].values[1]))  

f=open(OUTDIR+'Candidates_genes_4_quad_model_anova.csv', 'w')
f.write('gene,chrom,str_start,F-stat,F_pvalue,anova\n')
f.write('\n'.join(List_Genes_for_Quad_test))
f.close()
print('\n'.join(List_Genes_for_Quad_test[:10]))

 Getting data for ENSG00000188992.7
ENSG00000188992.7 	 15481134 	 15583166 	 24 STRs tested 

#****	Testing 24 STRs for this gene
Adding X2 improve expression prediction model ? ****
    df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
0     231.0  232.981392      0.0       NaN       NaN       NaN
1     230.0  226.556328      1.0  6.425064  6.522725  0.011297
are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.46130967]]), p=0.23383079392298134, df_denom=260, df_num=2>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


 Getting data for ENSG00000185272.9
ENSG00000185272.9 	 15588451 	 15600693 	 19 STRs tested 

#****	Testing 19 STRs for this gene
Adding X2 improve expression prediction model ? ****
    df_resid         ssr  df_diff   ss_diff         F    Pr(>F)
0     264.0  265.980073      0.0       NaN       NaN       NaN
1     263.0  261.216759      1.0  4.763314  4.795831  0.029407
are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.46130967]]), p=0.23383079392298134, df_denom=260, df_num=2>
Adding X2 improve expression prediction model ? ****
    df_resid         ssr  df_diff    ss_diff          F    Pr(>F)
0     264.0  265.096525      0.0        NaN        NaN       NaN
1     263.0  251.050962      1.0  14.045563  14.714076  0.000157
are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.46130967]]), p=0.23383079392298134, df_denom=260, df_num=2>
Adding X2 improve expression prediction model ? ****
    df_resid         ssr  df_diff  ss_diff         F    Pr(>F)
0     257.0

In [None]:
	Testing 24 STRs for this gene
>>>> Is beta = 0  <F test: F=array([[6.52272507]]), p=0.011296858467019884, df_denom=230, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[3.2708079]]), p=0.03974951595111735, df_denom=230, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.10913066]]), p=0.7414083557571096, df_denom=254, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.08510146]]), p=0.9184452762288461, df_denom=254, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.46402663]]), p=0.49636867527929995, df_denom=254, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.840262]]), p=0.4327935250652042, df_denom=254, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.45108972]]), p=0.5024331417413783, df_denom=252, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.22712927]]), p=0.7969807122039558, df_denom=252, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.00806919]]), p=0.9284955682303655, df_denom=249, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.0169991]]), p=0.9831457110016488, df_denom=249, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.00506218]]), p=0.9433379406372907, df_denom=242, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.00316182]]), p=0.9968432121146652, df_denom=242, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.04177419]]), p=0.8382071114146905, df_denom=266, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.02271119]]), p=0.9775466582342074, df_denom=266, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.05949083]]), p=0.8075009686794087, df_denom=252, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.03419285]]), p=0.9663895993328824, df_denom=252, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.07700193]]), p=0.7816482306456368, df_denom=233, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.05098065]]), p=0.9503076525123617, df_denom=233, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.01558227]]), p=0.9007525400588421, df_denom=268, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.0079662]]), p=0.9920656855439975, df_denom=268, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.01290217]]), p=0.9096624435152846, df_denom=233, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.16328395]]), p=0.8494470894001349, df_denom=233, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.02571204]]), p=0.8727387975678107, df_denom=243, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.17673329]]), p=0.8381108724452975, df_denom=243, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.37752443]]), p=0.5395930680014236, df_denom=211, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.96357397]]), p=0.3831992359300672, df_denom=211, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.79748217]]), p=0.37266361794823344, df_denom=263, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.73212273]]), p=0.48186454814047475, df_denom=263, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.09959487]]), p=0.7526000531110633, df_denom=232, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.05545228]]), p=0.946069707478107, df_denom=232, df_num=2>
>>>> Is beta = 0  <F test: F=array([[1.62106483]]), p=0.20428099751945564, df_denom=221, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.27495301]]), p=0.28149126229575955, df_denom=221, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.0125244]]), p=0.9109948645236097, df_denom=220, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.08480519]]), p=0.9187212535448299, df_denom=220, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.46730506]]), p=0.49484068675531523, df_denom=259, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.23402244]]), p=0.7915112061619995, df_denom=259, df_num=2>
>>>> Is beta = 0  <F test: F=array([[2.39828343e-06]]), p=0.9987655303195572, df_denom=265, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.00015523]]), p=0.9998447802626191, df_denom=265, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.01395513]]), p=0.906053359256373, df_denom=263, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.59454549]]), p=0.5525532466712586, df_denom=263, df_num=2>
>>>> Is beta = 0  <F test: F=array([[1.37307403]]), p=0.24237381426602309, df_denom=256, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.95306552]]), p=0.38692095496831014, df_denom=256, df_num=2>
>>>> Is beta = 0  <F test: F=array([[2.84616894]]), p=0.09279128677025261, df_denom=260, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.6516116]]), p=0.19374585430774355, df_denom=260, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.23192216]]), p=0.6305522872240783, df_denom=234, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[0.14029507]]), p=0.8691748022216459, df_denom=234, df_num=2>
>>>> Is beta = 0  <F test: F=array([[0.34339139]]), p=0.5583865013828422, df_denom=260, df_num=1>
***are alpha and beta significatly !=0 ? ****  <F test: F=array([[1.46130967]]), p=0.23383079392298134, df_denom=260, df_num=2>
