# CART prediction function

### Packages

In [14]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

### Data

In [15]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)

### Helper functions

In [50]:
build_tree <- function(data_train, data_val, cp = 0.01) {
    # Train the decision tree model with control parameters
    tree <- rpart(income ~ ., data = data_train, cp = cp, 
                  control = rpart.control(maxsurrogate = 0, maxcompete = 1))

    # Predict on the validation set
    predictions <- predict(tree, data_val)

    # For factored variables, this will give probabilities, so create the actual predictions
    if (is.factor(data_train$income)) {
        # Initialize predictions as just the probability predictions
        predictions_prob <- predictions
        predictions <- list(probabilities = predictions_prob)
        preds <- apply(predictions_prob, 1, function(row) {
            # Get the index of the max value in the row
            max_index <- which.max(row)
            # Return the column name using the index
            return(colnames(predictions_prob)[max_index])
        })
        # Add actual predictions to the predictions list
        predictions$classes <- as.factor(preds)
    }

    # Results
    results <- list(train_set = data_train, test_set = data_val, tree = tree, predictions = predictions)
    return(results)
}

In [17]:
## eval cont targets
evaluation_metrics_cont <- function(predictions, test_set){
  MAE <- mean(abs(predictions - test_set$income))
  MSE <- mean((predictions - test_set$income)^2)
  RMSE <- sqrt(MSE)

  SS_res <- sum((test_set$income - predictions)^2)
  SS_tot <- sum((test_set$income - mean(test_set$income))^2)
  R_squared <- 1 - (SS_res / SS_tot)

  MAPE <- mean(abs((test_set$income - predictions) / test_set$income)) * 100

  # Create the dataframe
  metrics_df <- data.frame(
        MAE = MAE, 
        MSE = MSE, 
        RMSE = RMSE,
        R_squared = R_squared, 
        MAPE = MAPE)

  return(metrics_df)
}

In [18]:
## eval factored targets
evaluation_metrics_factor <- function(predictions, test_set){
    # confusion matrix for the prediction on original data
    cm <- confusionMatrix(predictions, test_set$income,
                mode = "everything")

    # saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    f1 <- cm$byClass['F1']
    sens <- cm$byClass['Sensitivity']
    spec <- cm$byClass['Specificity']

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [28]:
cart_gridsearch <- function(data_train, data_val, complexity) {
    # Initialize variables
    loss <- rep(0, length(complexity))
    tree <- list()
    
    # Loop through complexity parameters
    for (l in 1:length(complexity)) {
        # Create decision tree model
        tree[[l]] <- rpart(income ~ ., data = data_train, cp = complexity[l], control = rpart.control(maxsurrogate = 0, maxcompete = 1))
        # Predict on the validation set
        predictions <- predict(tree[[l]], data_val)
        
        # Calculate loss
        if (is.numeric(data_train$income)) {
            # Mean Squared Error for regression
            loss[l] <- mean((predictions - data_val$income)^2)
        } else if (is.factor(data_train$income)) {
            # Cross-Entropy Loss for classification
            epsilon <- 1e-15  # to prevent log(0) which is undefined
            predicted_probs <- pmax(pmin(predictions[,2], 1 - epsilon), epsilon)
            n <- length(predicted_probs)
            loss[l] <- -sum(as.numeric(data_val$income) * log(predicted_probs) + (1 - as.numeric(data_val$income)) * log(1 - predicted_probs)) / n
        } else {
            stop("The predicted target has to be numeric or factor.")
        }
    }
    return(loss)
}

## Just the CART prediction on original data

In [67]:
cart_prediction <- function(data, nrun = 10, outer_k_fold = 5, inner_k_fold = 5, steps = 10) {
    # Initialize arrays and lists to store results
    cp_val <- rep(0, nrun * outer_k_fold)
    eval_list <- list()
    complexity <- 10^seq(log10(0.0001), log10(0.01), length.out = steps)

    # Convert factor variables to numeric for loss calculation
    if (is.factor(data$income)) {
        data$income <- as.factor(as.numeric(data$income == ">50K"))
    } else {
        data$income <- as.numeric(data$income)
    }

    # Set initial seed
    set.seed(1234)
    
    for (i in 1:nrun) {
        # Outer loop: Randomly split the data set into outer_k subsets
        outer_datalist <- split(data, sample(1:outer_k_fold, nrow(data), replace = TRUE))

        outer_fold_eval_list <- list()

        for (j in 1:outer_k_fold) {
            # Split data into outer folds
            data_outer_test <- outer_datalist[[j]]
            data_outer_train <- bind_rows(outer_datalist[-j])

            inner_datalist <- split(data_outer_train, sample(1:inner_k_fold, nrow(data_outer_train), replace = TRUE))
            inner_loss <- rep(0, steps)

            for (k in 1:inner_k_fold) {
                data_inner_val <- inner_datalist[[k]]
                data_inner_train <- bind_rows(inner_datalist[-k])

                fold_loss <- cart_gridsearch(data_inner_train, data_inner_val, complexity)
                inner_loss <- inner_loss + fold_loss
            }

            min_inner_loss <- which.min(inner_loss)
            best_cp <- complexity[min_inner_loss]

            # Train the model on the entire outer training set with the best hyperparameter
            tree_s <- build_tree(data_train = data_outer_train, data_val = data_outer_test, cp = best_cp)

            # Extract predictions and test_set from tree_s
            predictions <- tree_s$predictions
            test_set <- tree_s$test_set

            # Calculate evaluation metrics on the outer test set
            if (is.numeric(data$income)) {
                eval <- as.data.frame(evaluation_metrics_cont(predictions, test_set))
                print("eval done")
            } else if (is.factor(data$income)) {
                eval <- as.data.frame(evaluation_metrics_factor(predictions$classes, test_set))
            } else {
                stop("The predicted target has to be numeric or factor.")
            }

            # Store the evaluation metrics for this outer fold
            outer_fold_eval_list[[j]] <- eval
            cp_val[(i - 1) * outer_k_fold + j] <- best_cp
        }

        # Average the evaluation metrics over the outer folds for this run
        sum_outer_fold_df <- Reduce(function(x, y) Map(`+`, x, y), outer_fold_eval_list)
        eval_avg_outer_fold <- lapply(sum_outer_fold_df, function(col) col / length(outer_fold_eval_list))

        # Store the average evaluation metrics for this run
        eval_list[[i]] <- eval_avg_outer_fold
        print(c("run", i, "completed"))
    }

    # Average the evaluation metrics over all runs
    sum_run_df <- Reduce(function(x, y) Map(`+`, x, y), eval_list)
    eval_avg_run <- lapply(sum_run_df, function(col) col / length(eval_list))

    # Convert the list back to a dataframe
    rownames <- row.names(eval_list[[1]])
    eval_avg_run <- as.data.frame(eval_avg_run)
    row.names(eval_avg_run) <- rownames

    # Returns
    results <- list(eval_avg = eval_avg_run, cp_vals = cp_val)
    return(results)
}


In [64]:
cps_res <- cart_prediction(data = cpspop, outer_k_fold = 2, inner_k_fold = 2)

[1] "eval done"
[1] "eval done"
[1] "run"       "1"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "2"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "3"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "4"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "5"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "6"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "7"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "8"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "9"         "completed"
[1] "eval done"
[1] "eval done"
[1] "run"       "10"        "completed"


In [68]:
adult_res <- cart_prediction(data = adult, outer_k_fold = 2, inner_k_fold = 2)

[1] "run"       "1"         "completed"
[1] "run"       "2"         "completed"
[1] "run"       "3"         "completed"
[1] "run"       "4"         "completed"
[1] "run"       "5"         "completed"
[1] "run"       "6"         "completed"
[1] "run"       "7"         "completed"
[1] "run"       "8"         "completed"
[1] "run"       "9"         "completed"
[1] "run"       "10"        "completed"


In [37]:
cps_res$eval_avg

Unnamed: 0_level_0,MAE,MSE,RMSE,R_squared,MAPE
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,27515.01,1789078622,42294.83,0.253727,4627.156


In [38]:
adult_res$eval_avg

Unnamed: 0_level_0,Accuracy,F1,Sensitivity,Specificity
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
Accuracy,0.8447207,0.9018021,0.947803,0.5317472


### Save the results

In [None]:
# Bind results
cart_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# File pth for output
file <- "/user/emma.foessing01/u11969/results/cart_pred_results.RData" 
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE) # create dir if not there
# Save the results to an RData file 
save(results, file = output_file)