In [9]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.7.2     ✔ stringr 1.2.0
✔ readr   1.1.1     ✔ forcats 0.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


# Polynomial terms and multicollinearity

Note that in a polynomial regression the power terms tend to be
highly correlated, which can lead to a multicollinearity problem.
Often, the correlation between predictors can be reduced by
“centering” the variables (i.e., subtracting the mean from the
variable in the polynomial regression model).

More info:  
Centering for Multicollinearity Between Main effects and Quadratic terms  
https://www.theanalysisfactor.com/centering-for-multicollinearity-between-main-effects-and-interaction-terms/

In [1]:
# Small example predictor used to illustrate how x and x^2 are correlated.
x1 <- c(2, 4, 4, 5, 6, 7, 7, 8, 8, 8)

In [5]:
# Correlation between the raw predictor and its square.
cor(x1, x1^2)

In [6]:
# Center the predictor by subtracting its mean.
x1_c <- x1 - mean(x1)

In [7]:
# Correlation between the centered predictor and its square, for comparison
# with the uncentered correlation.
cor(x1_c, x1_c^2)

# Polynomial regression

In [10]:
# Directory holding the course data files.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless it is changed to point at the local copy of the data.
workdir <- "/media/clint/Data/GoogleDrive/DellXPS/2018_DukeSpring/BIOS705_ApplyBiostatMethod_II/"
# Read quet.csv, taking column names from the first row of the file.
dat_sbp <- read_csv(file.path(workdir, "quet.csv"), col_names = TRUE)

Parsed with column specification:
cols(
  SBP = col_integer(),
  QUET = col_double(),
  AGE = col_integer(),
  SMK = col_integer()
)


In [11]:
# Display the loaded data.
dat_sbp

SBP,QUET,AGE,SMK
135,2.876,45,0
122,3.251,41,0
130,3.1,49,0
148,3.768,52,0
146,2.979,54,1
129,2.79,47,1
162,3.668,60,1
160,3.612,48,1
144,2.368,44,1
180,4.637,64,1


Center AGE and add squared terms

In [45]:
# Add the polynomial predictors used by the models below:
#   AGE2  - square of the raw age
#   AGEc  - age centered at its sample mean
#   AGEc2 - square of the centered age
dat <- mutate(
    dat_sbp,
    AGE2  = AGE^2,
    AGEc  = AGE - mean(AGE),
    AGEc2 = AGEc^2
)

In [46]:
# Fit SBP on age in four parameterisations:
#   model01 / model02 - linear in raw / centered age
#   model11 / model12 - quadratic in raw / centered age
model01 <- lm(SBP ~ AGE,          data = dat)
model02 <- lm(SBP ~ AGEc,         data = dat)
model11 <- lm(SBP ~ AGE  + AGE2,  data = dat)
model12 <- lm(SBP ~ AGEc + AGEc2, data = dat)

In [47]:
# Coefficient table and fit statistics for the linear model on raw AGE.
summary(model01)


Call:
lm(formula = SBP ~ AGE, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.548  -6.990  -2.481   5.765  23.892 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  59.0916    12.8163   4.611 6.98e-05 ***
AGE           1.6045     0.2387   6.721 1.89e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9.245 on 30 degrees of freedom
Multiple R-squared:  0.6009,	Adjusted R-squared:  0.5876 
F-statistic: 45.18 on 1 and 30 DF,  p-value: 1.894e-07


In [48]:
# Coefficient table and fit statistics for the linear model on centered AGE.
summary(model02)


Call:
lm(formula = SBP ~ AGEc, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.548  -6.990  -2.481   5.765  23.892 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 144.5312     1.6344  88.432  < 2e-16 ***
AGEc          1.6045     0.2387   6.721 1.89e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9.245 on 30 degrees of freedom
Multiple R-squared:  0.6009,	Adjusted R-squared:  0.5876 
F-statistic: 45.18 on 1 and 30 DF,  p-value: 1.894e-07


In [49]:
# Coefficient table and fit statistics for the quadratic model on raw AGE.
summary(model11)


Call:
lm(formula = SBP ~ AGE + AGE2, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-14.290  -6.248  -2.973   6.888  24.480 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 160.56851  102.09044   1.573    0.127
AGE          -2.24966    3.85419  -0.584    0.564
AGE2          0.03600    0.03593   1.002    0.325

Residual standard error: 9.245 on 29 degrees of freedom
Multiple R-squared:  0.6143,	Adjusted R-squared:  0.5877 
F-statistic: 23.09 on 2 and 29 DF,  p-value: 1.002e-06


In [50]:
# Coefficient table and fit statistics for the quadratic model on centered AGE.
summary(model12)


Call:
lm(formula = SBP ~ AGEc + AGEc2, data = dat)

Residuals:
    Min      1Q  Median      3Q     Max 
-14.290  -6.248  -2.973   6.888  24.480 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 142.84392    2.34671  60.870  < 2e-16 ***
AGEc          1.58395    0.23958   6.611 3.02e-07 ***
AGEc2         0.03600    0.03593   1.002    0.325    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9.245 on 29 degrees of freedom
Multiple R-squared:  0.6143,	Adjusted R-squared:  0.5877 
F-statistic: 23.09 on 2 and 29 DF,  p-value: 1.002e-06


## Print out the ANOVA table

In [24]:
# Build an overall ANOVA table (model / error / total) for a fitted model.
#
# Args:
#   fit: a fitted model object exposing `fitted.values`, `residuals` and
#        `df.residual` components (e.g. the result of `lm()`).
#
# Returns:
#   A tibble with one row per source of variation ("Model", "Error", "Total")
#   and the columns DF, Sum_Sq, Mean_Sq and F_val. F_val is the overall
#   F statistic on the "Model" row and NA on the other two rows.
anova_table <- function(fit){

    # Fail with a clear message if tibble is unavailable; require() would only
    # warn and return FALSE, deferring the failure to the tibble() call.
    if (!requireNamespace("tibble", quietly = TRUE)) {
        stop("Package 'tibble' is required by anova_table().", call. = FALSE)
    }

    # Recover the observed response from fitted values plus residuals.
    y   <- fit$fitted.values + fit$residuals

    # Total sum of squares about the mean of y, residual sum of squares, and
    # the model (regression) sum of squares as their difference.
    SST <- sum((y - mean(y))^2)
    SSE <- sum(fit$residuals^2)
    SSR <- SST - SSE

    # Degrees of freedom: total, error, and model as their difference.
    df_t <- length(y) - 1
    df_e <- fit$df.residual
    df_r <- df_t - df_e

    # Mean_Sq refers to the Sum_Sq and DF columns defined just above it.
    tibble::tibble(
        Source  = c("Model", "Error", "Total"),
        DF      = c(df_r, df_e, df_t),
        Sum_Sq  = c(SSR, SSE, SST),
        Mean_Sq = Sum_Sq / DF,
        F_val   = c((SSR / df_r) / (SSE / df_e), NA, NA))
}

In [51]:
# ANOVA table for the linear model on raw AGE.
anova_table(model01)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,1,3861.63,3861.63037,45.17692
Error,30,2564.338,85.47795,
Total,31,6425.969,207.28931,


In [52]:
# ANOVA table for the linear model on centered AGE.
anova_table(model02)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,1,3861.63,3861.63038,45.17692
Error,30,2564.338,85.47795,
Total,31,6425.969,207.28931,


In [53]:
# ANOVA table for the quadratic model on raw AGE.
anova_table(model11)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,2,3947.425,1973.71256,23.09326
Error,29,2478.544,85.46702,
Total,31,6425.969,207.28931,


In [54]:
# ANOVA table for the quadratic model on centered AGE.
anova_table(model12)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,2,3947.425,1973.71256,23.09326
Error,29,2478.544,85.46702,
Total,31,6425.969,207.28931,


Variance inflation factor (VIF)

In [31]:
library(car)


Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    recode

The following object is masked from ‘package:purrr’:

    some



In [55]:
# Print the call and coefficients of the quadratic model on raw AGE.
model11


Call:
lm(formula = SBP ~ AGE + AGE2, data = dat)

Coefficients:
(Intercept)          AGE         AGE2  
    160.569       -2.250        0.036  


In [56]:
# Variance inflation factors for AGE and AGE2 in the uncentered quadratic model.
vif(model11)

In [57]:
# Print the call and coefficients of the quadratic model on centered AGE.
model12


Call:
lm(formula = SBP ~ AGEc + AGEc2, data = dat)

Coefficients:
(Intercept)         AGEc        AGEc2  
    142.844        1.584        0.036  


In [58]:
# Variance inflation factors for AGEc and AGEc2 in the centered quadratic model.
vif(model12)

In [29]:
# Influence diagnostics for the linear model on raw AGE.
res <- influence.measures(model01)
# Per-observation matrix of diagnostics: DFBETAS for each coefficient, DFFIT,
# covariance ratio, Cook's distance and hat values.
res$infmat

Unnamed: 0,dfb.1_,dfb.AGE,dffit,cov.r,cook.d,hat
1,0.1000842,-0.0911791265,0.11848745,1.1455549,0.007219577,0.076625
2,-0.1186099,0.1115689657,-0.12781197,1.2227598,0.008418227,0.13129167
3,-0.1140172,0.0952310087,-0.180566524,1.0650009,0.01645543,0.04329167
4,0.03301953,-0.0195345842,0.108763962,1.0793156,0.006044864,0.03229167
5,9.723248e-05,0.0005643513,0.005182618,1.1050408,1.389247e-05,0.031625
6,-0.1140727,0.1008127027,-0.149529403,1.1069274,0.01142014,0.05729167
7,-0.1141357,0.1323330811,0.188490188,1.0990456,0.01804029,0.061625
8,0.479625,-0.4141341436,0.680577557,0.6603051,0.1834616,0.049625
9,0.4533439,-0.4173575632,0.519244664,0.9771642,0.1272406,0.08829167
10,-0.5940296,0.6523371889,0.773404727,0.8769397,0.2644721,0.10829167
