In [106]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy as sp
from scipy import stats
import scipy.stats.mstats as mst
import patsy
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Read 901_CleanedData.csv into `data`, the module-level DataFrame that
# ancova_lm() and the DISC recoding below operate on.
data = pd.read_csv('901_CleanedData.csv', low_memory=False)

In [107]:
def newIV(row):
    """Dichotomize ``DISC_SCORE0`` into a binary indicator.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable by the key ``'DISC_SCORE0'``.

    Returns
    -------
    int or float
        0 when the score is below 2, 1 when it is 2 or above, and NaN when
        the score is missing.
    """
    score = row['DISC_SCORE0']

    # A NaN score fails every comparison, so ``score < 2`` would be False and
    # the row would silently be coded as 1. Propagate the missing value so
    # downstream model code can treat the row as missing instead.
    if pd.isnull(score):
        return np.nan

    return 0 if score < 2 else 1

# Run newIV over every row (axis=1) and store the result as the DISC column.
disc_indicator = data.apply(newIV, axis=1)
data['DISC'] = disc_indicator

In [108]:
def ancova_lm(formula, frame=None):
    """Fit an OLS model from a formula and print its Type III ANOVA table.

    Parameters
    ----------
    formula : str
        Patsy-style model formula, e.g. ``'y ~ C(group) + x'``.
    frame : pandas.DataFrame, optional
        Data the formula is evaluated against. When omitted, the
        notebook-level ``data`` frame is used, which is what the
        one-argument call has always done.

    Returns
    -------
    None
        The table is printed rather than returned.
    """
    if frame is None:
        frame = data

    # Fit once through the formula interface. The earlier version fitted the
    # same model twice (one result was never used) and rebuilt the design
    # matrices by hand with patsy before calling OLS directly.
    fitted = smf.ols(formula, frame).fit()

    # NOTE(review): Type III sums of squares depend on how categorical terms
    # are coded; the formulas passed in use plain C(...) (patsy's default
    # coding). Confirm that is the intended coding for models that contain
    # interactions before interpreting main-effect rows.
    table = anova_lm(fitted, typ=3)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("")
    print("F stats")
    print(table)
    return

In [109]:
# Main-effects model: the DISC indicator plus every covariate, no interactions.
main_effects = ['C(DISC)', 'C(RACE)', 'C(DEGREE)', 'C(INCOME0)', 'C(CVD)', 'BMI0']
formula = 'LV_AGE_DIFF0 ~ ' + ' + '.join(main_effects)
ancova_lm(formula)


F stats
                   sum_sq    df           F        PR(>F)
Intercept    25795.505369     1  162.535877  2.316773e-36
C(DISC)        401.636411     1    2.530686  1.117490e-01
C(RACE)       9573.290607     3   20.106903  6.561023e-13
C(DEGREE)     7540.037670     4   11.877327  1.398991e-09
C(INCOME0)    1271.330835     3    2.670192  4.597151e-02
C(CVD)        5763.510207     1   36.315520  1.866361e-09
BMI0         64332.494495     1  405.355051  4.734401e-85
Residual    515478.817336  3248         NaN           NaN


In [110]:
# Reduced model: only the DISC indicator and BMI0 as predictors.
predictors = ('C(DISC)', 'BMI0')
formula = 'LV_AGE_DIFF0 ~ ' + ' + '.join(predictors)
ancova_lm(formula)


F stats
                  sum_sq    df           F         PR(>F)
Intercept  109924.199813     1  647.977507  1.588688e-130
C(DISC)       712.256286     1    4.198585   4.053764e-02
BMI0       113104.752294     1  666.726122  6.427339e-134
Residual   552863.275577  3259         NaN            NaN


In [111]:
# Same two predictors plus their product term, to check whether the BMI0
# slope differs between the two DISC groups.
terms = ['C(DISC)', 'BMI0', 'C(DISC)*BMI0']
formula = 'LV_AGE_DIFF0 ~ ' + ' + '.join(terms)
ancova_lm(formula)


F stats
                     sum_sq    df           F        PR(>F)
Intercept      76108.726867     1  448.687005  2.100116e-93
C(DISC)          446.538345     1    2.632496  1.047937e-01
BMI0           75883.623957     1  447.359946  3.768221e-93
C(DISC):BMI0     223.619425     1    1.318313  2.509793e-01
Residual      552639.656152  3258         NaN           NaN


In [112]:
# Term layout, numbering the factors 1 = DISC, 2 = RACE, 3 = BMI0:
#   1, 12, 13, 123
#   2, 23
#   3
disc_terms = ['C(DISC)', 'C(DISC)*C(RACE)', 'C(DISC)*BMI0', 'C(DISC)*C(RACE)*BMI0']
race_terms = ['C(RACE)', 'C(RACE)*BMI0']

# Each fragment ends with ' + ' so the pieces can be concatenated directly.
disc_var = ' + '.join(disc_terms) + ' + '
race_var = ' + '.join(race_terms) + ' + '

formula = ''.join(['LV_AGE_DIFF0 ~ ', disc_var, race_var, 'BMI0'])
ancova_lm(formula)


F stats
                             sum_sq    df          F        PR(>F)
Intercept               5892.039974     1  36.187406  1.991854e-09
C(DISC)                  243.366781     1   1.494697  2.215779e-01
C(RACE)                 3920.377354     3   8.025986  2.506674e-05
C(DISC):C(RACE)          339.836205     3   0.695729  5.545801e-01
BMI0                    3287.963089     1  20.193831  7.242187e-06
C(DISC):BMI0             428.649788     1   2.632658  1.047835e-01
C(RACE):BMI0            1542.702299     3   3.158295  2.373324e-02
C(DISC):C(RACE):BMI0     593.097948     3   1.214219  3.029085e-01
Residual              528514.299446  3246        NaN           NaN


In [113]:
# Factors whose term groups are assembled in this cell:
#   disc, race, degree, bmi
# Every fragment ends in ' + ' so fragments can be glued together directly.

# DEGREE terms that involve neither DISC nor RACE.
degree_var = 'C(DEGREE) + C(DEGREE)*BMI0 + '

# RACE terms without DISC: main effect and two-way products, then three-way.
race_var1 = 'C(RACE) + C(RACE)*C(DEGREE) + C(RACE)*BMI0 + '
race_var2 = 'C(RACE)*C(DEGREE)*BMI0 + '
race_var = ''.join((race_var1, race_var2))

# DISC terms: main effect and two-way, three-way, then the four-way product.
disc_var1 = 'C(DISC) + C(DISC)*C(RACE) + C(DISC)*C(DEGREE) + C(DISC)*BMI0 + '
disc_var2 = 'C(DISC)*C(RACE)*C(DEGREE) + C(DISC)*C(RACE)*BMI0 + C(DISC)*C(DEGREE)*BMI0 + '
disc_var3 = 'C(DISC)*C(RACE)*C(DEGREE)*BMI0 + '
disc_var = ''.join((disc_var1, disc_var2, disc_var3))

formula = ''.join(('LV_AGE_DIFF0 ~ ', disc_var, race_var, degree_var, 'BMI0'))
ancova_lm(formula)


F stats
                                       sum_sq    df         F    PR(>F)
Intercept                          411.317199     1  2.607398  0.106465
C(DISC)                             89.258806     1  0.565824  0.451979
C(RACE)                            800.251556     3  1.690969  0.166766
C(DEGREE)                           48.495582     4  0.076855  0.989325
C(DISC):C(RACE)                     92.414877     3  0.195277  0.899660
C(DISC):C(DEGREE)                  484.429702     4  0.767717  0.546122
C(RACE):C(DEGREE)                 2369.714179    12  1.251830  0.240882
C(DISC):C(RACE):C(DEGREE)          888.765874    12  0.469501  0.933226
BMI0                               223.781567     1  1.418583  0.233726
C(DISC):BMI0                        58.255345     1  0.369289  0.543435
C(RACE):BMI0                       727.431086     3  1.537096  0.202799
C(DEGREE):BMI0                      51.585673     4  0.081752  0.987999
C(DISC):C(RACE):BMI0                34.721006     3  0.

In [114]:
# DISC two-way interactions: DISC crossed with each of the other predictors,
# added on top of the main-effects model.
partners = ['C(RACE)', 'C(DEGREE)', 'C(INCOME0)', 'C(CVD)', 'BMI0']
disc_int = ' + '.join('C(DISC)*' + other for other in partners)

base = 'LV_AGE_DIFF0 ~ C(DISC) + C(RACE) + C(DEGREE) + C(INCOME0) + C(CVD) + BMI0'
formula = base + ' + ' + disc_int
ancova_lm(formula)


F stats
                           sum_sq    df           F        PR(>F)
Intercept            24835.457334     1  157.253132  2.936707e-35
C(DISC)               2439.867096     1   15.448749  8.654707e-05
C(RACE)               6882.137125     3   14.525437  2.133383e-09
C(DEGREE)             5703.446692     4    9.028270  2.999554e-07
C(INCOME0)             286.150000     3    0.603948  6.124102e-01
C(CVD)                2002.494315     1   12.679392  3.750313e-04
C(DISC):C(RACE)       2185.793638     3    4.613336  3.171617e-03
C(DISC):C(DEGREE)     1604.234849     4    2.539423  3.805606e-02
C(DISC):C(INCOME0)     560.869225     3    1.183770  3.143461e-01
C(DISC):C(CVD)         269.209139     1    1.704578  1.917831e-01
BMI0                 41615.088936     1  263.498392  5.003531e-57
C(DISC):BMI0            45.561576     1    0.288487  5.912288e-01
Residual            511071.155584  3236         NaN           NaN


In [115]:
# DISC two-way interactions - complete?
# Only the DISC x RACE and DISC x DEGREE products are added to the main effects.
disc_int = '+ ' + ' + '.join(['C(DISC)*C(RACE)', 'C(DISC)*C(DEGREE)'])
main_part = 'LV_AGE_DIFF0 ~ C(DISC) + C(RACE) + C(DEGREE) + C(INCOME0) + C(CVD) + BMI0'
formula = ''.join([main_part, disc_int])
ancova_lm(formula)


F stats
                          sum_sq    df           F        PR(>F)
Intercept           27460.454554     1  173.820086  1.055710e-38
C(DISC)              2800.855879     1   17.728949  2.616478e-05
C(RACE)              6742.995810     3   14.227345  3.283732e-09
C(DEGREE)            5569.818593     4    8.814005  4.480374e-07
C(INCOME0)           1359.965182     3    2.869451  3.513765e-02
C(CVD)               5245.651810     1   33.204099  9.074440e-09
C(DISC):C(RACE)      2023.333256     3    4.269120  5.127560e-03
C(DISC):C(DEGREE)    2021.658020     4    3.199189  1.242647e-02
BMI0                63996.555369     1  405.087495  5.460317e-85
Residual           512019.843862  3241         NaN           NaN


In [116]:
# Two-way products among the covariates RACE, DEGREE, CVD and BMI0
# (no product involves DISC or INCOME0), added to the main-effects model.
pairwise = [
    'C(RACE)*C(DEGREE)',
    'C(RACE)*C(CVD)',
    'C(RACE)*BMI0',
    'C(DEGREE)*C(CVD)',
    'C(DEGREE)*BMI0',
]
inter = '+ ' + ' + '.join(pairwise)

formula = 'LV_AGE_DIFF0 ~ C(DISC) + C(RACE) + C(DEGREE) + C(INCOME0) + C(CVD) + BMI0' + inter
ancova_lm(formula)


F stats
                          sum_sq    df          F        PR(>F)
Intercept            3964.529607     1  25.431886  4.836086e-07
C(DISC)               289.305352     1   1.855852  1.731995e-01
C(RACE)              4221.803020     3   9.027419  5.963856e-06
C(DEGREE)            1115.818718     4   1.789454  1.280657e-01
C(INCOME0)           1044.031707     3   2.232438  8.241221e-02
C(CVD)                 99.715502     1   0.639661  4.238927e-01
C(RACE):C(DEGREE)    4701.182182    12   2.513117  2.731448e-03
C(RACE):C(CVD)       1730.181943     3   3.699623  1.129317e-02
C(DEGREE):C(CVD)     2151.133856     4   3.449803  8.052339e-03
BMI0                 3027.408468     1  19.420389  1.083023e-05
C(RACE):BMI0         2374.084082     3   5.076469  1.656678e-03
C(DEGREE):BMI0       1916.981618     4   3.074290  1.540230e-02
Residual           502271.610368  3222        NaN           NaN


In [117]:
# NOTE(review): this cell builds the same formula as the preceding cell and
# its recorded output is identical; it looks like a re-run that could be
# removed.
inter = (
    '+ C(RACE)*C(DEGREE) + C(RACE)*C(CVD) + C(RACE)*BMI0'
    ' + C(DEGREE)*C(CVD) + C(DEGREE)*BMI0'
)
main_part = 'LV_AGE_DIFF0 ~ C(DISC) + C(RACE) + C(DEGREE) + C(INCOME0) + C(CVD) + BMI0'
formula = main_part + inter
ancova_lm(formula)


F stats
                          sum_sq    df          F        PR(>F)
Intercept            3964.529607     1  25.431886  4.836086e-07
C(DISC)               289.305352     1   1.855852  1.731995e-01
C(RACE)              4221.803020     3   9.027419  5.963856e-06
C(DEGREE)            1115.818718     4   1.789454  1.280657e-01
C(INCOME0)           1044.031707     3   2.232438  8.241221e-02
C(CVD)                 99.715502     1   0.639661  4.238927e-01
C(RACE):C(DEGREE)    4701.182182    12   2.513117  2.731448e-03
C(RACE):C(CVD)       1730.181943     3   3.699623  1.129317e-02
C(DEGREE):C(CVD)     2151.133856     4   3.449803  8.052339e-03
BMI0                 3027.408468     1  19.420389  1.083023e-05
C(RACE):BMI0         2374.084082     3   5.076469  1.656678e-03
C(DEGREE):BMI0       1916.981618     4   3.074290  1.540230e-02
Residual           502271.610368  3222        NaN           NaN


In [139]:
# NOTE(review): `inter` is assigned here but is not part of the formula
# below; only `inter2` is appended. Confirm whether it was meant to be
# included or is left over from an earlier run of this cell.
inter = '+ C(DEGREE)*C(CVD) + C(DEGREE)*BMI0'
# Full crossing of DEGREE, INCOME0, CVD and BMI0.
inter2 = '+ C(DEGREE)*C(INCOME0)*C(CVD)*BMI0'

# Main effects for every predictor followed by the four-way crossing above.
formula = 'LV_AGE_DIFF0 ~ C(DISC) + C(RACE) + C(DEGREE) + C(INCOME0) + C(CVD) + BMI0'  + inter2
ancova_lm(formula)


F stats
                                         sum_sq    df          F        PR(>F)
Intercept                            877.711493     1   5.588046  1.814316e-02
C(DISC)                              293.379012     1   1.867829  1.718203e-01
C(RACE)                             8692.318847     3  18.446865  7.304884e-12
C(DEGREE)                            143.879126     4   0.229006  9.222284e-01
C(INCOME0)                           666.723170     3   1.414922  2.364307e-01
C(CVD)                               552.686836     1   3.518741  6.076992e-02
C(DEGREE):C(INCOME0)                1454.995215    12   0.771949  6.801920e-01
C(DEGREE):C(CVD)                    1292.106784     4   2.056585  8.390067e-02
C(INCOME0):C(CVD)                    491.874959     3   1.043859  3.719348e-01
C(DEGREE):C(INCOME0):C(CVD)         3243.238858    12   1.720703  5.632553e-02
BMI0                                1311.448792     1   8.349482  3.884039e-03
C(DEGREE):BMI0                       243.23