In [24]:
# https://www.geeksforgeeks.org/decision-tree-for-regression-in-r-programming/
library(rpart)
library(caret)
library(tidyverse)
library(Metrics)


Attaching package: ‘Metrics’


The following objects are masked from ‘package:caret’:

    precision, recall




## Decision Tree
At each node, the tree examines every feature and every candidate split point, and keeps the single split that gives the best improvement across all features.

In [2]:
# Load the modelling table from the CSV file in the working directory
ml_data <- read.csv(file = 'ml_predict_data.csv')

In [3]:
# Use the county-name column (read in as `X`) as the row index of ml_data
rownames(ml_data) <- ml_data$X
# Remove the now-redundant `X` column from the table
drops <- c('X')
ml_data <- ml_data[, !is.element(names(ml_data), drops)]

In [7]:
# Hold out a test set: createDataPartition() picks 80% of the row indices
# (sampled on avg_perc_change); the fixed seed keeps the split reproducible
set.seed(1)
validation_index <- createDataPartition(
  y = ml_data$avg_perc_change,
  p = 0.80,
  list = FALSE
)
# The selected 80% of rows are used to fit the models
training <- ml_data[validation_index, ]
# The remaining 20% of rows are held out for evaluation
testing <- ml_data[-validation_index, ]

## Unscaled

In [8]:
# Fit a regression tree ("anova" method) of avg_perc_change on every other
# column of the training set
fit <- rpart(
  formula = avg_perc_change ~ .,
  data = training,
  method = "anova"
)

In [40]:
# Render the fitted tree into a PNG file rather than the on-screen device
png(file = "decTree.png", width = 600, height = 600)

# Draw the tree, then add the node labels (use.n = TRUE adds observation counts)
plot(
  fit,
  uniform = TRUE,
  main = "avg_perc_change Decision Tree using Regression"
)
text(fit, use.n = TRUE, cex = .7)

# Close the device so the image is written to disk and later plots are not
# redirected into this file
invisible(dev.off())

In [13]:
# Predict avg_perc_change for the held-out rows with the unscaled tree.
# predict() on an rpart fit has no `method` argument (it would be silently
# ignored), so the output type is requested explicitly instead.
prediction <- predict(fit, newdata = testing, type = "vector")

In [22]:
# Observed target values and the unscaled tree's residuals on the test rows
y_test <- testing$avg_perc_change
residuals <- y_test - prediction

# Collect prediction, residual and actual value side by side
prediction_residual <- data.frame(
  prediction = prediction,
  residuals = residuals
)
act_pred_resid <- add_column(prediction_residual, actual = y_test)

In [27]:
# Root-mean-squared error of the unscaled tree on the held-out rows
with(act_pred_resid, rmse(actual, prediction))

## Scaled

In [28]:
#' Min-max scale a numeric vector to the [0, 1] interval
#'
#' @param x A numeric vector.
#' @param na.rm Ignore `NA` values when finding the range? Defaults to `FALSE`,
#'   in which case any `NA` in `x` makes the whole result `NA`.
#' @return A numeric vector the same length as `x`. A vector whose values are
#'   all equal maps to zeros rather than `NaN`; an empty input returns
#'   `numeric(0)`.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo
  # An exact zero range means every value is identical; dividing by it would
  # give 0 / 0 = NaN, so return the (all-zero) offsets instead
  if (!is.na(span) && span == 0) {
    return(x - lo)
  }
  (x - lo) / span
}

In [29]:
# Target vectors for the scaled-feature model
y_train <- training[["avg_perc_change"]]
y_test <- testing[["avg_perc_change"]]

# Min-max scale `values` using the range of `reference`
scale_with_reference <- function(values, reference) {
  lo <- min(reference)
  hi <- max(reference)
  (values - lo) / (hi - lo)
}

# Predictors are selected by position (columns 2 to 21).
# NOTE(review): assumes these positions hold every predictor and exclude the
# target column -- confirm against the CSV layout.
train_features <- training[, 2:21]
test_features <- testing[, 2:21]

# Training features are scaled to [0, 1] using their own range
X_train <- data.frame(lapply(train_features, normalize))
# Test features are scaled with the TRAINING range, column by column, so both
# sets share one scale (test values may therefore fall outside [0, 1])
X_test <- data.frame(Map(scale_with_reference, test_features, train_features))

In [34]:
# Fit a regression tree ("anova" method) on the min-max scaled predictors;
# the target is attached as column `y_train` so the formula can refer to it
train_scaled <- cbind(X_train, y_train)
fit_scaled <- rpart(
  formula = y_train ~ .,
  data = train_scaled,
  method = "anova"
)

In [35]:
# Score the scaled-feature tree on the scaled test features (X_test): the model
# was fit on scaled predictors, so the raw `testing` values would be compared
# against split thresholds on the wrong scale.
prediction_scaled <- predict(fit_scaled, newdata = X_test, type = "vector")
# X_test was rebuilt column by column and carries no row names; label the
# predictions with the row names of the test set they correspond to
names(prediction_scaled) <- row.names(testing)

In [37]:
# Observed target values for the held-out rows
y_test <- testing$avg_perc_change
# Residuals of the scaled-feature model: compare against its own predictions
# (prediction_scaled), not those of the unscaled tree
residuals <- y_test - prediction_scaled

# Collect prediction, residual and actual value side by side
prediction_residual_scaled <- data.frame(prediction_scaled, residuals)

act_pred_resid_scaled <- prediction_residual_scaled %>%
  add_column(actual = y_test)

In [39]:
# Root-mean-squared error of the scaled-feature tree on the held-out rows
with(act_pred_resid_scaled, rmse(actual, prediction_scaled))