In [3]:
---
title: "Chapter 3 - Linear Regression"
author: "Dan"
date: "26 January 2018"
output: html_document
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
# Document-wide knitr default: echo = TRUE shows each chunk's source code
# alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
# MASS supplies the Boston data set used throughout this document (?Boston).
library(MASS)

# Keep a working copy under the name `data`, then list the column names.
data <- Boston
names(Boston)

# Goal: predict medv (median value of houses in each suburb) from the
# 13 remaining columns.
```

```{r}
# Simple linear regression: medv as the response, lstat as the only predictor.
lm.fit <- lm(formula = medv ~ lstat, data = Boston)

# Print the coefficient table and overall fit statistics.
summary(object = lm.fit)
```
```{r}
# List the names of the components stored in the fitted model object.
names(lm.fit)
```

```{r}
# 95% confidence intervals for the coefficients
# (level = 0.95 is also what confint() uses when level is omitted).
confint(lm.fit, level = 0.95)
```

```{r}
# Predict medv from lm.fit at lstat = 5, 10 and 15, with confidence intervals.
predict(
  lm.fit,
  newdata = data.frame(lstat = c(5, 10, 15)),
  interval = "confidence"
)
```
```{r}
# Same three lstat values, now with prediction intervals. These include the
# irreducible error, so they are wider than the confidence intervals.
predict(
  lm.fit,
  newdata = data.frame(lstat = c(5, 10, 15)),
  interval = "prediction"
)
```
```{r}
# Plotting the linear regression line ----

# attach() places the Boston columns on the search path. It is retained
# because later statements in this file refer to those columns by bare name.
attach(Boston)

# Scatterplot with the fitted line in red
plot(x = lstat, y = medv)
abline(lm.fit, col = "red")

# Same plot, thicker line (lwd = line width)
plot(x = lstat, y = medv)
abline(lm.fit, lwd = 3)

# pch selects the plotting character
plot(x = lstat, y = medv, pch = 20)  # a numeric code changes the symbol shape
plot(x = lstat, y = medv, pch = "*") # any single character also works
plot(x = 1:20, y = 1:20, pch = 1:20) # one point per pch code from 1 to 20

## Diagnostic plots produced by plot() on an lm object ----

# Called on its own, plot(lm.fit) steps through the diagnostic plots one
# at a time.
plot(lm.fit)

# A 2x2 layout shows them all at once.
par(mfrow = c(2, 2)) # graphics parameter: 2 rows by 2 columns
plot(lm.fit)

# residuals() returns the residuals of the fit and rstudent() the studentized
# residuals; either can be plotted against the fitted values.
par(mfrow = c(1, 1)) # back to a single panel
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))

# On the basis of the residual plots there is some evidence of non-linearity.
# Leverage statistics can be computed for any number of predictors with
# hatvalues().
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit)) # index of the observation with the largest leverage

## Multiple regression ----

# Two predictors
lm.fit <- lm(formula = medv ~ lstat + age, data = Boston)
summary(lm.fit)

# A dot on the right-hand side means "all remaining variables"
lm.fit <- lm(formula = medv ~ ., data = Boston)
summary(lm.fit)

# Individual components of a summary object can be accessed by name
# (see ?summary.lm for what is available).

# The car package provides vif() (variance inflation factors)
library(car)
vif(lm.fit) # Max value is 9, moderate collinearity but can be ignored < 10

# "All but one" syntax: drop age, which had a high p-value
lm.fit <- lm(formula = medv ~ . - age, data = Boston)
summary(lm.fit)

# Interaction term syntax:
#   lstat:black  adds only the interaction term (lstat x black)
#   lstat*black  adds lstat, black and their interaction
summary(lm(formula = medv ~ lstat * age, data = Boston))

# Non-linear transformations of the predictors ----

# A squared term has to be wrapped in I() because ^ has a special meaning
# inside a formula. The data frame is passed explicitly so the fit does not
# depend on Boston having been attach()ed earlier.
lm.fit2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(lm.fit2)

# The small p-value on the quadratic term suggests it improves the model.
# anova() quantifies how much better the quadratic fit is than the linear one.
lm.fit <- lm(medv ~ lstat, data = Boston)

# anova() tests H0: the two models fit the data equally well. An F-statistic
# of 135 with a p-value near 0 is clear evidence that model 2 (with the
# polynomial term) is superior.
anova(lm.fit, lm.fit2)

# Diagnostics for the quadratic fit. (This previously plotted lm.fit5, which
# is not created until further down, so a fresh run stopped here with
# "object 'lm.fit5' not found".)
par(mfrow = c(2, 2))
plot(lm.fit2)

# Now residuals vs fitted has no discernible pattern.
# For higher-order polynomials use poly() inside lm().
lm.fit5 <- lm(medv ~ poly(lstat, 5), data = Boston)

# Low p-values for all terms; the fit keeps improving up to the 5th-order
# polynomial. Adjusted R-squared = 0.679
summary(lm.fit5)
summary(lm.fit) # adjusted R-squared = 0.543

## Log transformation
summary(lm(medv ~ log(rm), data = Boston))

## Qualitative predictors ----

library(ISLR)
data(Carseats)

# Working copy of the data. Columns are referenced through the data frame
# (carseats$...) instead of attach(), so nothing here depends on, or adds to,
# the search path.
carseats <- Carseats

## Attempting to predict sales in 400 locations based on a number of predictors.

?Carseats

# ShelveLoc records the shelving location in the store: Bad, Good or Medium
levels(carseats$ShelveLoc)

# For a qualitative variable such as ShelveLoc, R generates dummy variables
# automatically. The model below uses every predictor plus two interactions.
lm.fit <- lm(Sales ~ . + Income:Advertising + Price:Age, data = carseats)

# Coefficients for ShelveLocGood and ShelveLocMedium suggest the shelving
# effects are strong
summary(lm.fit)

# Residual standard error relative to mean sales, taken from the model rather
# than typed in by hand (the original notes recorded 1.01 / 7.49 = 13.5%).
mean(carseats$Sales)
sigma(lm.fit) / mean(carseats$Sales)

# contrasts() shows the dummy coding R uses for the factor
contrasts(carseats$ShelveLoc)

## Applied Exercises ##

data(Auto)

# attach() is retained only because later statements in this file still refer
# to Auto columns by bare name. Everything in this section passes the data
# frame explicitly so the results cannot be affected by search-path masking.
attach(Auto)

# 8a: simple regression of mpg on horsepower
lm.fit <- lm(mpg ~ horsepower, data = Auto)
summary(lm.fit)

## There is a relationship between the predictor and the response
## Relationship is pretty strong, R-squared = 0.605

mean(Auto$mpg)

# %age error = RSE / mean of response = 4.906 / 23.446 = 20.9%
# relationship is negative

# Prediction at horsepower = 98, with prediction and confidence intervals
predict(lm.fit, data.frame(horsepower = 98), interval = "prediction")
predict(lm.fit, data.frame(horsepower = 98), interval = "confidence")

# predicted mpg = 24.467
# prediction interval (14.809, 34.124)
# 95% confidence interval (23.97, 24.96)

# Scatterplot with the least squares line
plot(mpg ~ horsepower, data = Auto)
abline(lm.fit, col = "red", lwd = 1.5)

# Diagnostic plots for the fit
plot(lm.fit)

# Residuals plot suggests highly non-linear relationship

## 9
data(Auto)

# Scatterplot matrix of every variable in the data set
plot(Auto)

library(dplyr)
names(Auto)

# Correlation matrix of every column except `name`. The column is dropped by
# name rather than by position, and the matrix is printed (it used to be
# computed and never shown).
Auto_no_name <- Auto[, names(Auto) != "name"]
cormat <- cor(Auto_no_name)
cormat

# Multiple regression of mpg on everything except name. The fit is stored in
# lm.fit so that the diagnostic plots below describe THIS model; previously
# plot(lm.fit) still pointed at the mpg ~ horsepower fit from exercise 8.
lm.fit <- lm(mpg ~ . - name, data = Auto)
summary(lm.fit)

# There is a relationship between the predictors and the response - F test p-value <0.05

# Significant predictors: displacement, weight, year, origin

?Auto
table(Auto$year)

# The year coefficient suggests mpg increases by about 0.75 for each later
# model year.

plot(lm.fit)

# Residual plot has a U-shape so relationship is likely non-linear

# data points 323, 327 are outliers
# data point 14 has very high leverage, car has high horsepower

Auto[14, ]

# Interaction between displacement and weight
lm.fit <- lm(mpg ~ year + origin + displacement * weight, data = Auto)
summary(lm.fit)

# adding displacement and weight interaction took R2 from 0.81 to 0.85 and interaction is statistically significant

# Add a log transformation of year
lm.fit <- lm(mpg ~ year + displacement * weight + log(year), data = Auto)
summary(lm.fit)
plot(lm.fit)

# Refit without the high-leverage observation 14
Auto2 <- Auto[-14, ]

lm.fit <- lm(mpg ~ year + displacement * weight + log(year), data = Auto2)
summary(lm.fit)
plot(lm.fit)

# log year is statistically significant

# Cubic polynomial in displacement
lm.fit <- lm(mpg ~ poly(displacement, 3) + weight + year + origin, data = Auto2)
summary(lm.fit)

# Q10

data("Carseats")

# Model A: Sales on Price, Urban and US. The data frame is passed explicitly
# to every call, so this section no longer needs attach(Carseats).
lm.fit <- lm(Sales ~ Price + Urban + US, data = Carseats)

summary(lm.fit)
?Carseats

# Price - sales drop by 54 (sales are in thousands) for every dollar increase in price

# UrbanYes - sales decrease by 21 for urban locations

# USYes - sales increase by 1201 when a store is in the US

# Y = 13.04 - 0.05 * Price - 0.02 * UrbanYes + 1.2 * USYes

# The null hypothesis can be rejected for Price and US but NOT for Urban,
# which is why Urban is dropped from the next model.

# Model B: Urban removed
lm.fit <- lm(Sales ~ Price + US, data = Carseats)
mean(Carseats$Sales)
summary(lm.fit)

# Model A has a RSE of 2.472 so %age error = 2.472/7.5 = 33% and R^2 = 0.2335

# Model B has a RSE of 2.469 so %age error = 33% and R^2 = 0.2345

# NOTE(review): plain R^2 cannot rise when a predictor is removed, so the two
# figures above are presumably adjusted R^2 - confirm against summary().

# Model B is a slightly better fit

confint(lm.fit, level = 0.95)

# Confidence intervals
# Price = (-0.065, -0.044)
# USYes = (0.692, 1.708)

# Diagnostics and leverage for Model B
par(mfrow = c(1, 1))
plot(lm.fit)
library(car)
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))

# Leverage plot shows one value with leverage of over 0.04. Average leverage
# for the data set = (p + 1) / n = 3/400 = 0.0075, so one data point of 0.04
# leverage could be a problem.

## Q11 ----

# Simulated data: y is 2 * x plus standard normal noise.
set.seed(1)
x <- rnorm(n = 100)
y <- 2 * x + rnorm(n = 100)

# Regress y on x; the `+ 0` term fits the model without an intercept.
lm.fit <- lm(formula = y ~ x + 0)
summary(lm.fit)

# Coefficient estimate = 1.9939
# RSE = 0.9586
# t-stat = 18.73
# p-value = <2e-16

# The coefficient on x is highly significant: a one unit increase in x raises
# the response y by 1.9939 on average. We reject the null that Beta = 0.

# Reverse regression: x on y, again without an intercept.
lm.fit <- lm(formula = x ~ y + 0)
summary(lm.fit)

# Coefficient estimate = 0.39111
# RSE = 0.4246
# t-stat = 18.73
# p-value = <2e-16

# The coefficient on y is highly significant: when y increases by one unit the
# expected increase in x is 0.3911. We reject the null that Beta = 0.

# Q13

set.seed(1)
x <- rnorm(100, mean = 0, sd = 1)
eps <- rnorm(100, mean = 0, sd = 0.25)

# NOTE(review): the noise enters as eps * x, i.e. it is scaled by x. The
# comments below describe the model as Beta0 = -1, Beta1 = 0.5 plus error; if
# plain additive noise was intended this should be `+ eps`. Left unchanged
# because the figures recorded in the comments come from this version.
Y <- -1 + 0.5*(x) + eps*(x)
length(Y)

# length = 100
# Beta0 = -1, Beta1 = 0.5

plot(x, Y) # relationship looks reasonably linear

lm.fit <- lm(Y ~ x)
summary(lm.fit)

# The model shows that x is statistically significant, Beta values are close at -1 and 0.548

abline(lm.fit, lty = 1) # least squares regression line
abline(-1, 0.5, col = "red", lty = 2) # population regression line

# Quadratic fit
lm.fit2 <- lm(Y ~ x + I(x^2))
summary(lm.fit2)

# abline() can only draw a straight line: given this three-coefficient fit it
# uses the intercept and slope, ignores the x^2 term and warns. Draw the
# fitted curve from predictions over the sorted x values instead.
x_sorted <- sort(x)
lines(
  x_sorted,
  predict(lm.fit2, newdata = data.frame(x = x_sorted)),
  col = "green",
  lty = 3
)

legend("topleft", legend = c("Least Squares","Population","Polynomial"), col=c("black","red","green"), lty=1:3)

anova(lm.fit, lm.fit2) # ANOVA gives high p-value so model fit is not improved

### Again with less noise in the data: lower the standard deviation of eps

set.seed(1)
x <- rnorm(100, mean = 0, sd = 1)
eps <- rnorm(100, mean = 0, sd = 0.05)

# NOTE(review): as in the first run, the noise enters as eps * x (scaled by
# x), not as a plain additive term - confirm this is intended.
Y <- -1 + 0.5*(x) + eps*(x)
length(Y)

# length = 100
# Beta0 = -1, Beta1 = 0.5

plot(x, Y) # relationship looks reasonably linear

lm.fit <- lm(Y ~ x)
summary(lm.fit)

# NOTE(review): the original comment here quoted the estimates from the
# sd = 0.25 run (-1 and 0.548); read the values for this run off the
# summary above.

abline(lm.fit, lty = 1) # least squares regression line
abline(-1, 0.5, col = "red", lty = 2) # population regression line

# Quadratic fit
lm.fit2 <- lm(Y ~ x + I(x^2))
summary(lm.fit2)

# abline() can only draw a straight line: given this three-coefficient fit it
# uses the intercept and slope, ignores the x^2 term and warns. Draw the
# fitted curve from predictions over the sorted x values instead.
x_sorted <- sort(x)
lines(
  x_sorted,
  predict(lm.fit2, newdata = data.frame(x = x_sorted)),
  col = "green",
  lty = 3
)

legend("topleft", legend = c("Least Squares","Population","Polynomial"), col=c("black","red","green"), lty=1:3)

anova(lm.fit, lm.fit2)

# Here the polynomial is better
# (the original notes record the same conclusion for another run)

# Confidence intervals for the coefficients of the current linear fit
confint(lm.fit)

# Slope intervals recorded across runs:
# Noisy (0.3848, 0.5842)
# Original (0.4478, 0.5526)
# Less Noisy (0.4974, 0.5189)

# More noise = wider confidence intervals, makes sense with higher variance of error terms

# Q14 ----

# Simulate two predictors, with x2 built from x1, and a response.
set.seed(1)
x1 <- runif(n = 100)
x2 <- 0.5 * x1 + rnorm(n = 100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(n = 100)

## Y = 2 + 2(X1) + 0.3(X2) + e
## B0 = 2, B1 = 2, B2 = 0.3

# Append one extra observation (number 101) to each vector.
# NOTE(review): this happens before any model is fitted, so all three fits
# below include observation 101 - confirm that is the intended order.
x1 <- append(x1, 0.1)
x2 <- append(x2, 0.8)
y <- append(y, 6)

plot(x1, x2) # x2 is built from x1, so the two are positively correlated

# Model with both predictors
lm.fit <- lm(formula = y ~ x1 + x2)
summary(lm.fit)
plot(lm.fit)

# x2 has very large standard error and is not significantly different from zero

# B0 = 2.0113, B1 = 2.2993, B2 = -0.2352

# We can reject the null that x1 = 0 but not that x2 = 0

# Model with x1 only
lm.fit2 <- lm(formula = y ~ x1)
summary(lm.fit2)
plot(lm.fit2)

# We can reject the null that x1 is zero

# Model with x2 only
lm.fit3 <- lm(formula = y ~ x2)
summary(lm.fit3)
plot(lm.fit3)

# We can reject the null that x2 is zero

# These results do not contradict each other. Without the presence of other
# predictors, both B1 and B2 are statistically significant. In the presence
# of other predictors, B2 is no longer statistically significant.

# In the model with both predictors, the additions make them both
# statistically significant; data point 101 is high leverage, > 0.3, when
# average leverage = (p + 1) / n = 3/101 = 0.030 for p = 2 predictors
# (the original note had 4/101 = 0.039 - TODO confirm).

# Second model, x1 is still significant, data point 101 is an outlier or high leverage

# Third model, x2 is still significant, data point 101 is high leverage

plot(x1, y)
plot(x2, y)

# Q15

data(Boston)

names(Boston)

# Fit one simple regression of crim on each of the other columns. Every fit
# receives data = Boston explicitly, so this section no longer depends on
# Boston having been attach()ed much earlier in the file, and the thirteen
# copy-pasted calls collapse into one loop.
predictors <- setdiff(names(Boston), "crim")
simple_fits <- lapply(
  setNames(predictors, predictors),
  function(predictor) {
    lm(reformulate(predictor, response = "crim"), data = Boston)
  }
)

# Kept under its original name for the zn model.
lm1 <- simple_fits[["zn"]]

# print() is required inside a loop; a header line identifies each model.
for (predictor in predictors) {
  cat("\n== crim ~", predictor, "==\n")
  print(summary(simple_fits[[predictor]]))
}

# Original notes: zn is statistically significant, and every other predictor
# was marked significant except chas.

# Multiple regression on all predictors at once
lm.fit <- lm(crim ~ ., data = Boston)
summary(lm.fit)

# Reject the null for everything with a significance code


```

ERROR: Error in parse(text = x, srcfile = src): attempt to use zero-length variable name
