In [1]:
library(tidyverse)
library(olsrr)
library(car)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.7.2     ✔ stringr 1.2.0
✔ readr   1.1.1     ✔ forcats 0.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘car’

The following object is masked from ‘package:dplyr’:

    recode

The following object is masked from ‘package:purrr’:

    some



# Import data

In [2]:
# Directory from which the data file is read.
# NOTE(review): machine-specific absolute path; must be edited to run elsewhere.
workdir <- "/media/clint/Data/GoogleDrive/DellXPS/2018_DukeSpring/BIOS705_ApplyBiostatMethod_II/"

In [3]:
# Read quet.csv from workdir; the first row of the file supplies the column names.
dat_sbp <- read_csv(file.path(workdir, "quet.csv"), col_names = TRUE)

Parsed with column specification:
cols(
  SBP = col_integer(),
  QUET = col_double(),
  AGE = col_integer(),
  SMK = col_integer()
)


In [4]:
# Show the imported data.
dat_sbp

SBP,QUET,AGE,SMK
135,2.876,45,0
122,3.251,41,0
130,3.1,49,0
148,3.768,52,0
146,2.979,54,1
129,2.79,47,1
162,3.668,60,1
160,3.612,48,1
144,2.368,44,1
180,4.637,64,1


# SST = SSR + SSE

In [5]:
# Linear model of SBP on every other column of dat_sbp (the `.` in the formula).
fit <- lm(SBP ~ ., data = dat_sbp)

In [6]:
# Observed response, fitted values taken from `fit`, and the response mean.
y <- dat_sbp[["SBP"]]
yhat <- fit[["fitted.values"]]
ybar <- mean(y)

In [7]:
# Total sum of squares: squared deviations of y from its mean.
sst <- as.numeric(crossprod(y - ybar))
print(sst)

[1] 6425.969


In [8]:
# Regression sum of squares: squared deviations of the fitted values from the mean of y.
ssr <- as.numeric(crossprod(yhat - ybar))
print(ssr)

[1] 4889.826


In [9]:
# Error sum of squares: squared differences between y and the fitted values.
sse <- as.numeric(crossprod(y - yhat))
print(sse)

[1] 1536.143


In [10]:
# Print SST next to SSR + SSE; the two values agree when SST = SSR + SSE holds.
print(sst)
print(ssr + sse)

[1] 6425.969
[1] 6425.969


# ANOVA Table

In [11]:
# Build an ANOVA summary table (Model / Error / Total) for a fitted linear
# model.
#
# Args:
#   fit: a fitted model object exposing `fitted.values`, `residuals` and
#        `df.residual` components (e.g. the result of `lm()`).
#
# Returns:
#   A tibble with columns Source, DF, Sum_Sq, Mean_Sq and F_val. F_val holds
#   the overall F statistic (MSR / MSE) on the "Model" row and NA elsewhere.
#
# Note: SST is centred on the mean of the response and df_t is n - 1, so the
# decomposition presumes a model that includes an intercept.
anova_lm <- function(fit){

    # Fail loudly when tibble is unavailable; `require()` would only return
    # FALSE and let the later tibble() call fail with a less helpful message.
    if (!requireNamespace("tibble", quietly = TRUE)) {
        stop("Package 'tibble' is required by anova_lm()", call. = FALSE)
    }

    # Reconstruct the response from the fit: y = fitted + residual.
    err   <- fit$residuals
    y     <- fit$fitted.values + err
    ybar  <- mean(y)

    SST   <- crossprod(y - ybar)[1]
    SSE   <- crossprod(err)[1]
    SSR   <- SST - SSE

    df_t <- length(y) - 1
    df_e <- fit$df.residual
    df_r <- df_t - df_e

    # Mean_Sq refers to the Sum_Sq and DF columns defined just before it.
    tibble::tibble(
        Source = c("Model", "Error", "Total"),
        DF = c(df_r, df_e, df_t),
        Sum_Sq = c(SSR, SSE, SST),
        Mean_Sq = Sum_Sq / DF,
        F_val = c((SSR / df_r) / (SSE / df_e), NA, NA))
}

In [12]:
# My calculation: ANOVA table from the hand-written anova_lm()
anova_lm(fit)

Source,DF,Sum_Sq,Mean_Sq,F_val
Model,3,4889.826,1629.9419,29.70972
Error,28,1536.143,54.86225,
Total,31,6425.969,207.28931,


In [17]:
# Same model summarised by a package function (ols_regress), for comparison
# with the anova_lm(fit) table
ols_regress(fit)

                        Model Summary                          
--------------------------------------------------------------
R                       0.872       RMSE                7.407 
R-Squared               0.761       Coef. Var           5.125 
Adj. R-Squared          0.735       MSE                54.862 
Pred R-Squared          0.662       MAE                 5.483 
--------------------------------------------------------------
 RMSE: Root Mean Square Error 
 MSE: Mean Square Error 
 MAE: Mean Absolute Error 

                               ANOVA                                 
--------------------------------------------------------------------
                Sum of                                              
               Squares        DF    Mean Square      F         Sig. 
--------------------------------------------------------------------
Regression    4889.826         3       1629.942     29.71    0.0000 
Residual      1536.143        28         54.862            

# SSI

In [32]:
# Collect the sums of squares, degrees of freedom, mean squares and overall
# F statistic of a fitted linear model in one named list.
#
# Args:
#   fit: fitted model with `fitted.values`, `residuals` and `df.residual`
#        components (e.g. from `lm()`).
#
# Returns:
#   A list with elements y, yhat, ybar, err, SST, SSE, SSR, n, df_t, df_e,
#   df_r, MST, MSR, MSE and F_val (= MSR / MSE), in that order.
get_ss <- function(fit){
    fitted_vals <- fit[["fitted.values"]]
    resid_vals  <- fit[["residuals"]]

    # The response is rebuilt from the fit as fitted + residual.
    observed   <- fitted_vals + resid_vals
    grand_mean <- mean(observed)

    ss_total <- as.numeric(crossprod(observed - grand_mean))
    ss_error <- as.numeric(crossprod(resid_vals))
    ss_model <- ss_total - ss_error

    n_obs    <- length(observed)
    df_total <- n_obs - 1
    df_error <- fit[["df.residual"]]
    df_model <- df_total - df_error

    ms_total <- ss_total / df_total
    ms_model <- ss_model / df_model
    ms_error <- ss_error / df_error

    list(
        y     = observed,
        yhat  = fitted_vals,
        ybar  = grand_mean,
        err   = resid_vals,
        SST   = ss_total,
        SSE   = ss_error,
        SSR   = ss_model,
        n     = n_obs,
        df_t  = df_total,
        df_e  = df_error,
        df_r  = df_model,
        MST   = ms_total,
        MSR   = ms_model,
        MSE   = ms_error,
        F_val = ms_model / ms_error
    )
}

# Partial F statistic for the terms present in a full model but absent from a
# reduced model.
#
# Args:
#   fit_full:   fitted model containing all terms.
#   fit_reduce: fitted model with some terms removed.
#   k:          degrees of freedom of the removed terms (used as the divisor
#               of the extra sum of squares).
#
# Returns:
#   A list with Sum_Sq (SSR of full minus SSR of reduced), Df (= k) and
#   F_value (= Sum_Sq / k / MSE of the full model).
partial_lm <- function(fit_full, fit_reduce, k){
    full_ss    <- get_ss(fit_full)
    reduced_ss <- get_ss(fit_reduce)

    # Extra regression sum of squares gained by the additional terms.
    extra_ss <- full_ss$SSR - reduced_ss$SSR

    list(
        Sum_Sq  = extra_ss,
        Df      = k,
        F_value = extra_ss / k / full_ss$MSE
    )
}

In [59]:
# Partial sums of squares for every way of dropping k predictors: fit each
# reduced model, fit the full model, and compare them with partial_lm().
library(gtools)
k <- 1
var_y <- "SBP"
var_x <- c("QUET", "AGE", "SMK")
n <- length(var_x)
dat <- dat_sbp

# All ways of keeping n - k of the n predictors. The pool size must be the
# number of candidate predictors; the previous `length(vars)` referred to an
# object that is not defined in this notebook.
comb_var <- combinations(
    n = n,
    r = n - k,
    v = var_x)
print(comb_var)

# Turn each row of predictor names into a formula string "SBP ~ A+B".
comb_var <- apply(comb_var, 1, function(x){
    str_var <- paste(x, collapse = "+")
    paste(var_y, "~", str_var)
})
print(comb_var)

# Fit one reduced model per formula, keyed by the formula string.
comb_formula <- lapply(comb_var, as.formula)
comb_lm <- lapply(comb_formula, function(one_formula){
    lm(one_formula, data = dat)
})
names(comb_lm) <- comb_var
print(comb_lm)

# Full model with every predictor in var_x.
str_var <- paste(var_x, collapse = "+")
str_for <- paste(var_y, "~", str_var)
str_for <- as.formula(str_for)
fit_full <- lm(str_for, data = dat)
print(fit_full)

# One (Sum_Sq, Df, F_value) vector per reduced model.
res <- list()
for (idx in seq_along(comb_lm)) {
    fit_reduce   <- comb_lm[[idx]]
    name_lm      <- names(comb_lm)[idx]
    res[[name_lm]] <- unlist(partial_lm(fit_full, fit_reduce, k))
}
# Rows are the reduced-model formulas; columns are Sum_Sq, Df, F_value.
print(t(as_tibble(res)))

     [,1]   [,2]  
[1,] "AGE"  "QUET"
[2,] "AGE"  "SMK" 
[3,] "QUET" "SMK" 
[1] "SBP ~ AGE+QUET" "SBP ~ AGE+SMK"  "SBP ~ QUET+SMK"
$`SBP ~ AGE+QUET`

Call:
lm(formula = one_formula, data = dat)

Coefficients:
(Intercept)          AGE         QUET  
     55.323        1.045        9.751  


$`SBP ~ AGE+SMK`

Call:
lm(formula = one_formula, data = dat)

Coefficients:
(Intercept)          AGE          SMK  
     48.050        1.709       10.294  


$`SBP ~ QUET+SMK`

Call:
lm(formula = one_formula, data = dat)

Coefficients:
(Intercept)         QUET          SMK  
     63.876       22.116        8.571  



Call:
lm(formula = str_for, data = dat)

Coefficients:
(Intercept)         QUET          AGE          SMK  
     45.103        8.592        1.213        9.946  

                   [,1] [,2]      [,3]
SBP ~ AGE+QUET 769.2335    1 14.021179
SBP ~ AGE+SMK  200.1415    1  3.648072
SBP ~ QUET+SMK 769.4592    1 14.025294


In [33]:
# Single partial test by hand: the full model adds SMK to QUET + AGE, so one
# term is dropped in the reduced model (k = 1).
fit_full   <- lm(SBP ~ QUET + AGE + SMK, data = dat_sbp)
fit_reduce <- lm(SBP ~ QUET + AGE,       data = dat_sbp)
partial_lm(fit_full, fit_reduce, k = 1)

In [14]:
# Type II ANOVA table for `fit` from car::Anova, for comparison with the
# partial_lm() results.
Anova(fit, type = "II")

Unnamed: 0,Sum Sq,Df,F value,Pr(>F)
QUET,200.1415,1,3.648072,0.0664267764
AGE,769.4592,1,14.025294,0.0008288266
SMK,769.2335,1,14.021179,0.000830032
Residuals,1536.1431,28,,


In [30]:
# Show the data again.
dat_sbp

SBP,QUET,AGE,SMK
135,2.876,45,0
122,3.251,41,0
130,3.1,49,0
148,3.768,52,0
146,2.979,54,1
129,2.79,47,1
162,3.668,60,1
160,3.612,48,1
144,2.368,44,1
180,4.637,64,1


In [28]:
# NOTE(review): no object named `partial` is defined in the visible cells and
# this cell has no recorded output; it looks like a leftover. Confirm and remove.
partial

In [13]:
# Open the help page for Anova.
?Anova

0,1
Anova {car},R Documentation

0,1
mod,"lm, aov, glm, multinom, polr mlm, coxph, coxme, lme, mer, merMod, svyglm, rlm, or other suitable model object."
error,"for a linear model, an lm model object from which the error sum of squares and degrees of freedom are to be calculated. For F-tests for a generalized linear model, a glm object from which the dispersion is to be estimated. If not specified, mod is used."
type,"type of test, ""II"", ""III"", 2, or 3."
singular.ok,"defaults to TRUE for type-II tests, and FALSE for type-III tests (where the tests for models with aliased coefficients will not be straightforwardly interpretable); if FALSE, a model with aliased coefficients produces an error."
test.statistic,"for a generalized linear model, whether to calculate ""LR"" (likelihood-ratio), ""Wald"", or ""F"" tests; for a Cox or Cox mixed-effects model, whether to calculate ""LR"" (partial-likelihood ratio) or ""Wald"" tests; in the default case or for linear mixed models fit by lmer, whether to calculate Wald ""Chisq"" or ""F"" tests. For a multivariate linear model, the multivariate test statistic to compute — one of ""Pillai"", ""Wilks"", ""Hotelling-Lawley"", or ""Roy"", with ""Pillai"" as the default. The summary method for Anova.mlm objects permits the specification of more than one multivariate test statistic, and the default is to report all four."
error.estimate,"for F-tests for a generalized linear model, base the dispersion estimate on the Pearson residuals (""pearson"", the default); use the dispersion estimate in the model object (""dispersion""); or base the dispersion estimate on the residual deviance (""deviance""). For binomial or Poisson GLMs, where the dispersion is fixed to 1, setting error.estimate=""dispersion"" is changed to ""pearson"", with a warning."
white.adjust,"if not FALSE, the default, tests use a heteroscedasticity-corrected coefficient covariance matrix; the various values of the argument specify different corrections. See the documentation for hccm for details. If white.adjust=TRUE then the ""hc3"" correction is selected."
SSPE,"For Anova for a multivariate linear model, the error sum-of-squares-and-products matrix; if missing, will be computed from the residuals of the model; for the print method for the summary of an Anova of a multivariate linear model, whether or not to print the error SSP matrix (defaults to TRUE)."
SSP,"if TRUE (the default), print the sum-of-squares and cross-products matrix for the hypothesis and the response-transformation matrix."
error.df,"The degrees of freedom for error; if missing, will be taken from the model."
