# Summary

## Random Forest
* Train the model: `rf.boston <- randomForest(medv ~ ., data = Boston, subset = train)`
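    * The "Mean of squared residuals" in the printed fit is the out-of-bag (OOB) mean squared error.
    * `mtry` is the number of predictors randomly sampled as candidates at each split.
    * `mse` is a vector with one entry per tree: entry `i` is the OOB MSE of the forest built from the first `i` trees.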
* Make prediction
    * `pred <- predict(fit, Boston[-train,])`
* The only tuning parameter is `mtry`, i.e. the number of predictors in each split.
    
## Boosting

* Train the model: `boost.boston <- gbm(medv ~ ., data = Boston[train,], distribution = "gaussian", n.trees = 10000, shrinkage = 0.01, interaction.depth = 4)`
    * `interaction.depth` is the number of splits in each tree.
    * `distribution = "gaussian"` means squared-error loss.
* Calling `summary(boost.boston)` prints the relative influence of each variable and draws the importance graph.
* Plot the fitted relationship with a single variable: `plot(boost.boston, i = "lstat")`
* Make prediction: `predmat <- predict(boost.boston, newdata = Boston[-train,], n.trees = n.trees)`
    * `n.trees` can be a sequence
* A trick: `apply((predmat-medv)^2, 2, mean)`
    * Note `predmat` is a matrix but `medv` is a vector, so `medv` is recycled down each column (its length equals the number of rows of `predmat`).
* The tuning parameters are: # of trees, shrinkage parameter and depth.

# Boosting

In [None]:
require(gbm)
require(tidyverse)

In [None]:
# Boosted regression trees for medv on the training rows: squared-error loss,
# 10000 trees with interaction depth 4 and a shrinkage (learning rate) of 0.01.
boost.boston <- gbm(
    medv ~ .,
    data = Boston[train, ],
    distribution = "gaussian",
    n.trees = 10000,
    interaction.depth = 4,
    shrinkage = 0.01
)
# Summarise the fit (relative influence of each predictor).
summary(boost.boston)

In [None]:
# Marginal effect of lstat in the boosted model.
# The argument is written out as `i.var`; the short form `i` only works
# through partial argument matching.
plot(boost.boston, i.var = "lstat")

In [None]:
# Marginal effect of rm in the boosted model.
# The argument is written out as `i.var`; the short form `i` only works
# through partial argument matching.
plot(boost.boston, i.var = "rm")

In [None]:
# Tree counts at which the boosted model is evaluated: 100, 200, ..., 10000.
n.trees <- seq(from = 100, to = 10000, by = 100)
# Held-out predictions, one column per entry of n.trees.
predmat <- predict(
    boost.boston,
    newdata = Boston[-train, ],
    n.trees = n.trees
)
dim(predmat)

In [None]:
# Test-set mean squared error for each column of predmat.
# medv is looked up inside the held-out rows and recycled down every column.
perr <- with(
    Boston[-train, ],
    apply((predmat - medv)^2, MARGIN = 2, FUN = mean)
)

In [None]:
# Test error as a function of the number of boosting trees.
ggplot(mapping = aes(x = n.trees, y = perr)) +
    geom_point(color = "blue")

# Random forest 

In [None]:
require(randomForest)
require(MASS)

In [None]:
# Fix the RNG state so the random draws that follow are reproducible.
set.seed(seed = 101)

In [None]:
# Number of rows and columns in the Boston data.
c(nrow(Boston), ncol(Boston))

In [None]:
# Draw 300 distinct row indices of Boston to use as the training set.
# seq_len() replaces 1:nrow() so a zero-row data frame can never yield the
# sequence c(1, 0); for a non-empty Boston the draw is unchanged.
train <- sample(seq_len(nrow(Boston)), 300)

In [None]:
# Size of the training index vector.
length(x = train)

In [None]:
# Random forest regressing medv on every other column, fitted on the training
# rows only and otherwise using the package defaults.
rf.boston <- randomForest(
    medv ~ .,
    data = Boston,
    subset = train
)
# Inspect the components stored in the fitted object.
str(rf.boston)

In [None]:
# Fit one random forest per value of mtry (1 .. number of predictors) and
# record both the out-of-bag error and the held-out test error for each.
n.pred <- ncol(Boston) - 1       # every column except the response medv
rf.ntree <- 400                  # forest size; also indexes the final OOB error
boston.test <- Boston[-train, ]  # loop-invariant, so subset once

oob.err <- double(n.pred)
test.err <- double(n.pred)
# The loop variable keeps the name `mtry` so that code reading it after the
# loop still sees the last value tried.
for (mtry in seq_len(n.pred)) {
    fit <- randomForest(medv ~ ., data = Boston, subset = train, mtry = mtry, ntree = rf.ntree)
    # fit$mse has one OOB error per tree; the last entry uses the whole forest.
    oob.err[mtry] <- fit$mse[rf.ntree]
    pred <- predict(fit, boston.test)
    test.err[mtry] <- mean((pred - boston.test$medv)^2)
    cat(mtry, "")
}

In [None]:
# Test and OOB error against mtry.
# The x values are taken from the length of the error vectors instead of the
# loop variable left over after the for loop, and the axes are labelled
# explicitly (matplot otherwise prints the deparsed argument expressions).
matplot(seq_along(test.err), cbind(test.err, oob.err), pch = 19, col = c("red", "blue"),
        type = "b", xlab = "mtry", ylab = "Mean squared error")
legend("topright", legend = c("Test", "OOB"), pch = 19, col = c("red", "blue"))