# Workbook follows Section 3.2 of [Faraway 2002](https://cran.r-project.org/doc/contrib/Faraway-PRA.pdf)

Economic data taken from [the accompanying R package](https://cran.r-project.org/web/packages/faraway/)

Originally from Belsley, Kuh, and Welsch (1980)

In [40]:
# Notebook-wide default size for every figure drawn after this cell.
# Approach taken from:
# http://blog.revolutionanalytics.com/2015/09/resizing-plots-in-the-r-kernel-for-jupyter-notebooks.html

library(repr)

# One option per call: plot width first, then plot height.
options(repr.plot.width = 4)
options(repr.plot.height = 3)

**3.2.1. F test on all predictors.** Test against the null hypothesis that every predictor coefficient is zero, $\beta_1 = \dots = \beta_p = 0$ (the intercept $\beta_0$ is left unconstrained).

In [41]:
# Load the `savings` data set from the faraway package and display it.
# The package is named explicitly so this cell works even when
# library(faraway) has not been run earlier in the session.
data(savings, package = "faraway")
savings
# Columns:
# sr:    aggregate personal saving / disposable income
# pop15: % of population under 15
# pop75: % of population over 75
# dpi:   per-capita disposable income
# ddpi:  % change in dpi

Unnamed: 0,sr,pop15,pop75,dpi,ddpi
Australia,11.43,29.35,2.87,2329.68,2.87
Austria,12.07,23.32,4.41,1507.99,3.93
Belgium,13.17,23.8,4.43,2108.47,3.82
Bolivia,5.75,41.89,1.67,189.13,0.22
Brazil,12.88,42.19,0.83,728.47,4.56
Canada,8.79,31.72,2.85,2982.88,2.43
Chile,0.6,39.74,1.34,662.86,2.67
China,11.9,44.75,0.67,289.52,6.51
Colombia,4.98,46.64,1.06,276.65,3.08
Costa Rica,10.78,47.64,1.14,471.24,2.8


In [42]:
# Per-column minimum, quartiles, median, mean and maximum of savings;
# a quick look at the range of each variable before fitting a model.
summary(savings) # summary statistics

       sr             pop15           pop75            dpi         
 Min.   : 0.600   Min.   :21.44   Min.   :0.560   Min.   :  88.94  
 1st Qu.: 6.970   1st Qu.:26.21   1st Qu.:1.125   1st Qu.: 288.21  
 Median :10.510   Median :32.58   Median :2.175   Median : 695.66  
 Mean   : 9.671   Mean   :35.09   Mean   :2.293   Mean   :1106.76  
 3rd Qu.:12.617   3rd Qu.:44.06   3rd Qu.:3.325   3rd Qu.:1795.62  
 Max.   :21.100   Max.   :47.64   Max.   :4.700   Max.   :4001.89  
      ddpi       
 Min.   : 0.220  
 1st Qu.: 2.002  
 Median : 3.000  
 Mean   : 3.758  
 3rd Qu.: 4.478  
 Max.   :16.710  

In [43]:
# fitting linear model
# Full model: regress the savings rate (sr) on all four predictors.
# The fit is stored in gfit, which later cells use as the alternative
# (larger) model in the F tests.
gfit <- lm(sr ~ pop15 + pop75 + dpi + ddpi, data = savings)

In [44]:
# Coefficient table with a t-test per predictor, followed by the overall
# F-statistic and its p-value for the full model.
summary(gfit)


Call:
lm(formula = sr ~ pop15 + pop75 + dpi + ddpi, data = savings)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.2422 -2.6857 -0.2488  2.4280  9.7509 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 28.5660865  7.3545161   3.884 0.000334 ***
pop15       -0.4611931  0.1446422  -3.189 0.002603 ** 
pop75       -1.6914977  1.0835989  -1.561 0.125530    
dpi         -0.0003369  0.0009311  -0.362 0.719173    
ddpi         0.4096949  0.1961971   2.088 0.042471 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.803 on 45 degrees of freedom
Multiple R-squared:  0.3385,	Adjusted R-squared:  0.2797 
F-statistic: 5.756 on 4 and 45 DF,  p-value: 0.0007904


In [45]:
# computing F-statistic and p-value 'by hand'
#
# Alternative model: the full fit gfit. Null model: intercept only, whose
# fitted value is mean(sr), so its RSS is the total sum of squares.
RSS_alt <- sum(residuals(gfit)^2)
RSS_nul <- sum((savings$sr - mean(savings$sr))^2)

# Degrees of freedom are read from the fit instead of being hard-coded,
# so the cell stays correct if the model formula changes.
df_resid <- df.residual(gfit)         # n - p (45 for this fit)
df_model <- nobs(gfit) - 1 - df_resid # predictors tested (4 for this fit);
                                      # assumes the model has an intercept

F_statistic <- ((RSS_nul - RSS_alt) / df_model) / (RSS_alt / df_resid)
F_statistic
# Upper-tail probability computed directly; 1 - pf(...) loses precision
# when the p-value is very small.
pf(F_statistic, df_model, df_resid, lower.tail = FALSE)

**3.2.2. F test on one predictor.** Test against the null hypothesis that a single predictor's coefficient is zero, $\beta_i = 0$.

In [33]:
# List the column names of savings (response and candidate predictors).
names(savings)

In [36]:
# Reduced model: the same regression as gfit but without pop15, i.e. the
# fit under the null hypothesis that the pop15 coefficient is zero.
g2 <- lm(sr ~ pop75 + dpi + ddpi, data = savings)
summary(g2)


Call:
lm(formula = sr ~ pop75 + dpi + ddpi, data = savings)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.0577 -3.2144  0.1687  2.4260 10.0763 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5.4874944  1.4276619   3.844  0.00037 ***
pop75       0.9528574  0.7637455   1.248  0.21849    
dpi         0.0001972  0.0010030   0.197  0.84499    
ddpi        0.4737951  0.2137272   2.217  0.03162 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.164 on 46 degrees of freedom
Multiple R-squared:  0.189,	Adjusted R-squared:  0.1361 
F-statistic: 3.573 on 3 and 46 DF,  p-value: 0.02093


In [39]:
# compare two models (ANOVA)
#
# Null model: g2 (pop15 dropped). Alternative model: gfit (all predictors).
# Both residual sums of squares are computed here so the cell does not
# depend on values left over from other cells, and residuals() is used in
# place of the partially matched g2$res.
RSS_nul <- sum(residuals(g2)^2)
RSS_alt <- sum(residuals(gfit)^2)

# Degrees of freedom taken from the fits rather than hard-coded.
df_alt <- df.residual(gfit)        # 45 for this fit
df_diff <- df.residual(g2) - df_alt # number of coefficients tested (1 here)

F_statistic <- ((RSS_nul - RSS_alt) / df_diff) / (RSS_alt / df_alt)
RSS_nul
F_statistic
# Upper-tail probability computed directly; more accurate than 1 - pf(...)
# for small p-values.
pf(F_statistic, df_diff, df_alt, lower.tail = FALSE)

In [38]:
# alternative method for ANOVA
# Passing the two nested fits (smaller model first) to anova() reports each
# model's residual df and RSS, the difference in sums of squares, and the
# F-statistic with its p-value for the extra term (pop15).
anova(g2, gfit)

Res.Df,RSS,Df,Sum of Sq,F,Pr(>F)
46,797.7249,,,,
45,650.713,1.0,147.0119,10.16659,0.002603019


In [None]:
# can use a t-statistic for each predictor, which is
# beta_hat / SE(beta_hat)
# note that pop15 has a t-statistic of -0.4612 / 0.1446 ~ -3.189
# and that the corresponding two-sided p-value (t-distribution with
# n - p = 45 degrees of freedom) is 0.0026
#
# note that, here,
# F-statistic = (t-statistic)^2, i.e. (-3.189)^2 ~ 10.17,
# which matches the F value and p-value (0.0026) reported by anova(),
# so the two approaches are identical