Links

- Understanding Diagnostic Plots for Linear Regression Analysis
    - http://data.library.virginia.edu/diagnostic-plots/
- Package olsrr
    - https://cran.r-project.org/web/packages/olsrr/vignettes/
    - https://cran.r-project.org/web/packages/olsrr/vignettes/variable_selection.html
- Multiple (Linear) Regression
    - https://www.statmethods.net/stats/regression.html
- Variable Selection
    - http://www.stat.columbia.edu/~martin/W2024/R10.pdf

$$Y_i = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \dots + \beta_p X_p + \epsilon_i$$

$$\begin{bmatrix}
1 & 2 & 3 & 4
\end{bmatrix}$$

$$\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 X_1 + \hat{\beta}_2 X_2 + \hat{\beta}_3 X_3 + \dots + \hat{\beta}_p X_p$$

In [1]:
library(tidyverse)
library(car)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.7.2     ✔ stringr 1.2.0
✔ readr   1.1.1     ✔ forcats 0.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    recode

The following object is masked from ‘package:purrr’:

    some



# Import data

In [2]:
# Directory that holds the data file read below. This is an absolute path on
# the author's machine; change it before running the notebook elsewhere.
workdir <- "/media/clint/Data/GoogleDrive/DellXPS/2018_DukeSpring/BIOS705_ApplyBiostatMethod_II/"

In [3]:
# Read quet.csv from the working directory; the first row supplies the
# column names.
dat_sbp <- file.path(workdir, "quet.csv") %>%
    read_csv(col_names = TRUE)

Parsed with column specification:
cols(
  SBP = col_integer(),
  QUET = col_double(),
  AGE = col_integer(),
  SMK = col_integer()
)


In [4]:
# Display the imported table.
dat_sbp

SBP,QUET,AGE,SMK
135,2.876,45,0
122,3.251,41,0
130,3.1,49,0
148,3.768,52,0
146,2.979,54,1
129,2.79,47,1
162,3.668,60,1
160,3.612,48,1
144,2.368,44,1
180,4.637,64,1


# Linear Algebra

In [15]:
#' Move selected columns to the front of a table
#'
#' @param dat A data frame, tibble, or matrix with column names.
#' @param col_name Character vector of column names to place first, in the
#'   order given.
#' @return `dat` with the `col_name` columns first, followed by the remaining
#'   columns in their original order. The result stays two-dimensional even
#'   when `dat` has a single column.
shift_col <- function(dat, col_name) {
    col_name_all <- colnames(dat)

    # Fail early with a readable message instead of a subsetting error
    missing_cols <- setdiff(col_name, col_name_all)
    if (length(missing_cols) > 0) {
        stop("Column(s) not found in `dat`: ",
             paste(missing_cols, collapse = ", "),
             call. = FALSE)
    }

    col_name_remain <- col_name_all[!(col_name_all %in% col_name)]

    # drop = FALSE keeps a one-column data.frame or matrix from collapsing
    # into a plain vector
    dat[, c(col_name, col_name_remain), drop = FALSE]
}

# Design Matrix

In [26]:
# Response vector as an n x 1 matrix
Y <- as.matrix(select(dat_sbp, SBP))

# Design matrix: every predictor plus a leading column of ones for the
# intercept term
X <- dat_sbp %>%
    select(-SBP) %>%
    mutate(Intercept = 1) %>%
    shift_col("Intercept") %>%
    as.matrix()

head(X)

Intercept,QUET,AGE,SMK
1,2.876,45,0
1,3.251,41,0
1,3.1,49,0
1,3.768,52,0
1,2.979,54,1
1,2.79,47,1


# $\hat{\beta}$

$$\hat{\beta} = (X^T X)^{-1} X^T Y$$

In [27]:
# Least-squares estimate. Solving the normal equations (X'X) beta = X'Y
# directly gives the same result as (X'X)^{-1} X'Y without forming the
# inverse explicitly, which is cheaper and numerically more stable.
beta <- solve(crossprod(X), crossprod(X, Y))
print(beta)

                SBP
Intercept 45.103192
QUET       8.592449
AGE        1.212715
SMK        9.945568


# $\hat{Y}$

In [40]:
# Reference fit from lm() on the same data; used here and in the cells below
# to check the hand-computed quantities.
fit <- lm(SBP ~ QUET + AGE + SMK, data = dat_sbp)

# Fitted values: Y_hat = X (X'X)^{-1} X'Y, obtained by solving the normal
# equations rather than inverting X'X.
Y_hat <- X %*% solve(crossprod(X), crossprod(X, Y))

# Side-by-side comparison: matrix algebra (left) vs. lm() (right)
cbind(Y_hat, fit$fitted.values)

Unnamed: 0,SBP,Unnamed: 2
1,124.3872,124.3872
2,122.7585,122.7585
3,131.1628,131.1628
4,140.5407,140.5407
5,146.1323,146.1323
6,136.0193,136.0193
7,159.3287,159.3287
8,144.295,144.295
9,128.7551,128.7551
10,172.5057,172.5057


# MSE

In [50]:
# Number of estimated coefficients (intercept + predictors), taken from the
# design matrix instead of being hard-coded
p   <- ncol(X)

# Residual sum of squares divided by the residual degrees of freedom (n - p).
# Equivalent scalar form: sum((Y_hat - Y)^2) / (nrow(Y) - p)
mse <- crossprod(Y_hat - Y) / (nrow(Y) - p)
print(mse)

# Residual standard error
print(sqrt(mse))

         SBP
SBP 54.86225
         SBP
SBP 7.406906


In [57]:
# Residual standard error as stored in the lm summary, for comparison with
# the square root of the hand-computed MSE.
summary(fit)$sigma

# Notes: attributes of an lm object

In [53]:
# List the attributes attached to the fitted model object.
attributes(fit)

In [56]:
# List the attributes attached to the summary of the fitted model.
attributes(summary(fit))

# ANOVA Table

I could not find a built-in function that prints the overall ANOVA table (Model / Error / Total) for a linear regression, so the helper below builds it from a fitted `lm` object.

In [120]:
#' Overall ANOVA table (Model / Error / Total) for a fitted linear model
#'
#' @param fit A fitted model object providing `fitted.values`, `residuals`
#'   and `df.residual`, such as the result of `lm()`.
#' @return A tibble with rows "Model", "Error" and "Total" and columns
#'   `Source`, `DF`, `Sum_Sq`, `Mean_Sq` and `F_val`. `F_val` holds the
#'   overall F statistic in the "Model" row and `NA` elsewhere.
anova_lm <- function(fit) {
    # Recover the response from the fit itself
    y <- fit$fitted.values + fit$residuals

    # NOTE: the total sum of squares is taken about mean(y) with n - 1
    # degrees of freedom, which corresponds to a model with an intercept.
    SST <- sum((y - mean(y))^2)
    SSE <- sum(fit$residuals^2)
    SSR <- SST - SSE

    df_t <- length(y) - 1
    df_e <- fit$df.residual
    df_r <- df_t - df_e

    DF     <- c(df_r, df_e, df_t)
    Sum_Sq <- c(SSR, SSE, SST)

    # Namespace-qualified call: raises a clear error if tibble is missing
    # and does not attach the package as a side effect.
    tibble::tibble(
        Source  = c("Model", "Error", "Total"),
        DF      = DF,
        Sum_Sq  = Sum_Sq,
        Mean_Sq = Sum_Sq / DF,
        F_val   = c((SSR / df_r) / (SSE / df_e), NA, NA))
}

In [124]:
# Overall Model / Error / Total table for the fitted model.
anova_lm(fit)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,3,4889.826,1629.9419,29.70972
Error,28,1536.143,54.86225,
Total,31,6425.969,207.28931,


# Type I SS

Functions like `anova()` and `aov()` in R return Type I (sequential) sums of squares: each term is adjusted only for the terms that precede it in the model formula.

In [149]:
# Sequential (Type I) ANOVA table for the fitted model.
# To show only the term rows (dropping the final residual row), use:
#anova_fit <- anova(fit)
#anova_fit[-nrow(anova_fit),]
anova(fit)

Unnamed: 0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
QUET,1,3537.9457,3537.94574,64.4878,9.588062e-09
AGE,1,582.6465,582.64651,10.62017,0.002932591
SMK,1,769.2335,769.23345,14.02118,0.000830032
Residuals,28,1536.1431,54.86225,,


In [195]:
#' Type I (sequential) ANOVA table with overall Regression and Total rows
#'
#' @param fit A fitted linear model accepted by `anova()`, e.g. from `lm()`.
#' @return The table returned by `anova(fit)` with two extra rows: a leading
#'   "Regression" row that pools all model terms (summed Df and Sum Sq, the
#'   resulting Mean Sq, the overall F statistic and its p-value) and a
#'   trailing "Total" row holding the summed Df and Sum Sq (other cells NA).
anova_type1 <- function(fit) {
    anova_fit <- anova(fit)
    n_row     <- nrow(anova_fit)

    # The last row of an lm ANOVA table is the residual (error) line
    err   <- anova_fit[n_row, , drop = FALSE]
    terms <- anova_fit[-n_row, , drop = FALSE]

    # Pool all model terms into a single regression line
    reg_df <- sum(terms[["Df"]])
    reg_ss <- sum(terms[["Sum Sq"]])
    reg_ms <- reg_ss / reg_df
    reg_f  <- reg_ms / err[["Mean Sq"]]

    reg <- list(
        "Df"      = reg_df,
        "Sum Sq"  = reg_ss,
        "Mean Sq" = reg_ms,
        "F value" = reg_f,
        # Upper tail taken directly: 1 - pf(...) rounds to 0 for very
        # small p-values
        "Pr(>F)"  = pf(reg_f, df1 = reg_df, df2 = err[["Df"]],
                       lower.tail = FALSE))

    # A mean square, F and p-value are not defined for the total line
    total <- c(
        "Df"      = sum(anova_fit[["Df"]]),
        "Sum Sq"  = sum(anova_fit[["Sum Sq"]]),
        "Mean Sq" = NA,
        "F value" = NA,
        "Pr(>F)"  = NA)

    res <- rbind(reg, anova_fit, total)
    rownames(res)[1]         <- "Regression"
    rownames(res)[nrow(res)] <- "Total"
    res
}

In [196]:
# Type I table for the fitted model, extended with Regression and Total rows.
anova_type1(fit)

Unnamed: 0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
Regression,3,4889.8257,1629.9419,29.70972,7.602273e-09
QUET,1,3537.9457,3537.94574,64.4878,9.588062e-09
AGE,1,582.6465,582.64651,10.62017,0.002932591
SMK,1,769.2335,769.23345,14.02118,0.000830032
Residuals,28,1536.1431,54.86225,,
Total,31,6425.9688,,,


# Type II SS

In [42]:
# Type II sums of squares via car::Anova(). Uses `dat_sbp`, the table
# imported in this notebook (`sbp` is never defined here).
Anova(lm(SBP ~ QUET + AGE + SMK, data = dat_sbp), type = 2)

Unnamed: 0,Sum Sq,Df,F value,Pr(>F)
QUET,200.1415,1,3.648072,0.0664267764
AGE,769.4592,1,14.025294,0.0008288266
SMK,769.2335,1,14.021179,0.000830032
Residuals,1536.1431,28,,


# Type III SS

In [39]:
# Type III sums of squares via car::Anova(). Uses `dat_sbp`, the table
# imported in this notebook (`sbp` is never defined here).
Anova(lm(SBP ~ QUET + AGE + SMK, data = dat_sbp), type = 3)

Unnamed: 0,Sum Sq,Df,F value,Pr(>F)
(Intercept),963.0974,1,17.554828,0.000252077
QUET,200.1415,1,3.648072,0.0664267764
AGE,769.4592,1,14.025294,0.0008288266
SMK,769.2335,1,14.021179,0.000830032
Residuals,1536.1431,28,,


In [197]:
# Column names of the imported table (`sbp` is never defined here).
colnames(dat_sbp)

In [207]:
# Overall ANOVA table for the full model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
print("FULL")
anova_lm(lm(SBP ~ QUET + AGE + SMK, data = dat_sbp))

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,3,4889.826,1629.9419,29.70972
Error,28,1536.143,54.86225,
Total,31,6425.969,207.28931,


In [203]:
# Overall ANOVA table for the QUET + AGE model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
anova_lm(lm(SBP ~ QUET + AGE, data = dat_sbp))

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,2,4120.592,2060.29612,25.91706
Error,29,2305.377,79.49574,
Total,31,6425.969,207.28931,


In [None]:
# Overall ANOVA table for the QUET + AGE model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
anova_lm(lm(SBP ~ QUET + AGE, data = dat_sbp))

In [None]:
# Overall ANOVA table for the QUET + AGE model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
anova_lm(lm(SBP ~ QUET + AGE, data = dat_sbp))

In [213]:
# Overall ANOVA table for the QUET-only model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
print("QUET")
anova_lm(lm(SBP ~ QUET, data = dat_sbp))

[1] "QUET"


Source,DF,Sum_Sq,Mean_Sq,F_val
Model,1,3537.946,3537.94574,36.75122
Error,30,2888.023,96.26743,
Total,31,6425.969,207.28931,


In [212]:
# Overall ANOVA table for the AGE-only model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
print("AGE")
anova_lm(lm(SBP ~ AGE, data = dat_sbp))

[1] "AGE"


Source,DF,Sum_Sq,Mean_Sq,F_val
Model,1,3861.63,3861.63037,45.17692
Error,30,2564.338,85.47795,
Total,31,6425.969,207.28931,


In [214]:
# Overall ANOVA table for the SMK-only model, fitted on the imported table
# `dat_sbp` (`sbp` is never defined here).
print("SMK")
anova_lm(lm(SBP ~ SMK, data = dat_sbp))

[1] "SMK"


Source,DF,Sum_Sq,Mean_Sq,F_val
Model,1,393.0982,393.0982,1.954782
Error,30,6032.8706,201.0957,
Total,31,6425.9687,207.2893,
