# CART prediction function

### Packages

In [2]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/

Lade n"otiges Paket: grid

Lade n"otiges Paket: mvtnorm

Lade n"otiges Paket: modeltools

Lade n"otiges Paket: stats4

Lade n"otiges Paket: strucchange

Lade n"otiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade n"otiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade n"otiges Paket: ggplot2


Attache Paket: 

### Data

In [3]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)

### Helper functions

In [4]:
## tree fct with control component

build_tree <- function(data,trainsplit = 0.8, cp = 0.01, controls){ #minsplit=20, minbucket=5, maxdepth=20, 
    train_index <- sample(1:nrow(data), nrow(data)*trainsplit)
    # train dataset formation
    train_set <- data[train_index, ]
    # test dataset formation
    test_set <- data[-train_index, ]

    tree <- rpart(income ~ ., data = train_set, cp=cp, control = rpart.control(maxsurrogate = 0, maxcompete = 1))

    # Predict on the test set
    predictions <- predict(tree, test_set)#, type = "prob")

    # for factored variables this will give probabilities, so there is a need to create the actual predictions
    if (is.factor(data$income)){
        # Initialize predictions as just the probability predictions
        predictions_prob <- predictions
        predictions <- list(probabilities = predictions_prob)
        preds <- apply(predictions_prob, 1, function(row) {
            # Get the index of the max value in the row
            max_index <- which.max(row)
            # Return the column name using the index
            return(colnames(predictions_prob)[max_index])
        })
        # Add actual predictions to the predictions list
        predictions$classes <- as.factor(preds)
    }

    # plot the tree
    #rpart.plot(tree)

    # Results
    results <- list(train_set, test_set, tree, predictions)
    names(results) <- c("train_set", "test_set", "tree", "predictions")
    return(results)
}

In [5]:
## eval cont targets
evaluation_metrics_cont <- function(predictions, test_set){
  MAE <- mean(abs(predictions - test_set$income))
  MSE <- mean((predictions - test_set$income)^2)
  RMSE <- sqrt(MSE)

  SS_res <- sum((test_set$income - predictions)^2)
  SS_tot <- sum((test_set$income - mean(test_set$income))^2)
  R_squared <- 1 - (SS_res / SS_tot)

  MAPE <- mean(abs((test_set$income - predictions) / test_set$income)) * 100

  # Create the dataframe
  metrics_df <- data.frame(
        MAE = MAE, 
        MSE = MSE, 
        RMSE = RMSE,
        R_squared = R_squared, 
        MAPE = MAPE)

  return(metrics_df)
}

In [6]:
## eval factored targets
evaluation_metrics_factor <- function(predictions, test_set){
    # confusion matrix for the prediction on original data
    cm <- confusionMatrix(predictions, test_set$income,
                mode = "everything")

    # saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    f1 <- cm$byClass['F1']
    sens <- cm$byClass['Sensitivity']
    spec <- cm$byClass['Specificity']

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [33]:
## cross validation
cart_gridsearch <- function(data_train, data_val, predictions, complexity, tree, loss){
    ## needs to be created outside the function:
        ## create empty vector to safe trees
        #tree <- list()
        ## create vector to safe loss
        #loss <- rep(0, steps)
        ## create empty array to store cp values chosen
        #cp_val <- rep(0, nrun)
        ## create empty list to store evaluation dataframes
        #eval_list <- list()
        ## set the complexity parameters for trees
        # complexity <- 10^seq(log10(0.0001), log10(0.01), length.out = steps) # or change to preferences

    # optional parameters to prevent overfitting: minbucket, minsplit, maxdepth
    for (l in 1:length(complexity)){
        # create income prediction tree with train data
        tree[[l]] <- rpart(income ~ ., data = data_train, cp = complexity[l], control = rpart.control(maxsurrogate = 0, maxcompete = 1))
        # Predict on the validation set
        predictions <- predict(tree[[l]], data_val)

        # safe some loss information and sum over the k-fold loops
        if (is.numeric(data_train$income)) {
            # Mean Squared Error
            loss[l] <- loss[l] + mean((predictions - data_val$income)^2)
            }

        else if (is.factor(data_train$income)) {
        # Cross-Entropy Loss
            epsilon <- 1e-15  # to prevent log(0) which is undefined
            predicted_probs <- pmax(pmin(predictions[,2], 1 - epsilon), epsilon)
            n <- length(predicted_probs)
            loss[l] <- loss[l] + (-sum(as.numeric(data_val$income) * log(predicted_probs) + (1 - as.numeric(data_val$income)) * log(1 - predicted_probs)) / n)
            }

        else {
            break("The predicted target has to be numeric or factor.")
            }
        }
    return(loss)
}


## Just the CART prediction on original data

In [34]:
simulation <- function(data, nrun = 10, k_fold = 10, steps = 10){
    # create empty vector to safe trees
    tree <- list()
    # create vector to safe loss
    loss <- rep(0, steps)
    # create empty array to store cp values chosen
    cp_val <- rep(0, nrun)
    # create empty list to store evaluation dataframes
    eval_list <- list()

    # set the complexity parameters for trees
    complexity <- 10^seq(log10(0.0001), log10(0.01), length.out = steps)

    # for loss-calculation factored variables need to be converted to numeric
    if (is.factor(data$income)) {
        print("target is factor")
        data$income <- as.factor(as.numeric(data$income == ">50K"))
    }
    else {
        data$income <- as.numeric(data$income)
    }

    # set inital seed
    s <- 1234
    for (i in 1:nrun){
        # vary seed with each run
        s <- s + 1

        # Randomly split the data set into k-subsets (or k-fold)
        datalist <- split(data, sample(1:k_fold, nrow(data), replace=T)) #list of k same-sized elements that are slices of the data
        
        # leave-one-out CV for prediction
        for (j in 1:k_fold) {
            # split data in k folds
            data_val <- datalist[[j]]               # j-th of the k folds, validation set
            data_train <- bind_rows(datalist[-j])   #rest of the data without j-th of the k folds, training set
            loss <- cart_gridsearch(data_train, data_val, predictions, complexity, tree, loss)
            }
        # for which cp value was the loss the smallest
        min_loss <- which.min(loss)
        print(min_loss)
        cp_val[i] <- c(complexity[min_loss])

        tree_s <- build_tree(data = data, cp = cp_val[i])
    
        # evaluation metrics
        if (is.numeric(data$income)) {
            eval <- as.data.frame(evaluation_metrics_cont(tree_s$predictions, tree_s$test_set))
            }
        else if (is.factor(data$income)) {
            eval <- as.data.frame(evaluation_metrics_factor(tree_s$predictions$classes, tree_s$test_set))
            }
        else {
            break("The predicted target has to be numeric or factor.")
            }

        eval_list[[i]] <- eval
        print(c("run", i, "completed"))
        }

    # average over all runs
    sum_df <- Reduce(function(x, y) Map(`+`, x, y), eval_list)
    eval_avg <- lapply(sum_df, function(col) col / length(eval_list))

    # Convert the list back to a dataframe
    # Store row names
    rownames <- row.names(eval_list[[1]])

    # Convert the list back to a dataframe
    eval_avg <- as.data.frame(eval_avg)

    # Set back the row names
    row.names(eval_avg) <- rownames
    
    # returns
    results <- list(eval_avg = eval_avg,  cp_vals = cp_val)
    return(results)
}

In [35]:
cps_res <- simulation(data = cpspop, nrun = 10, k_fold = 10, steps = 10)

[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss calculated"
[1] "loss 

In [36]:
adult_res <- simulation(data = adult, nrun = 10, k_fold = 10, steps = 10)

[1] "target is factor"
[1] 1
[1] "run"       "1"         "completed"
[1] 1
[1] "run"       "2"         "completed"
[1] 1
[1] "run"       "3"         "completed"
[1] 1
[1] "run"       "4"         "completed"
[1] 1
[1] "run"       "5"         "completed"
[1] 1
[1] "run"       "6"         "completed"
[1] 1
[1] "run"       "7"         "completed"
[1] 1
[1] "run"       "8"         "completed"
[1] 1
[1] "run"       "9"         "completed"
[1] 1
[1] "run"       "10"        "completed"


In [37]:
cps_res$eval_avg

Unnamed: 0_level_0,MAE,MSE,RMSE,R_squared,MAPE
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,27515.01,1789078622,42294.83,0.253727,4627.156


In [38]:
adult_res$eval_avg

Unnamed: 0_level_0,Accuracy,F1,Sensitivity,Specificity
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
Accuracy,0.8447207,0.9018021,0.947803,0.5317472
