# Boosting prediction function


In [23]:
# Install xgboost only when it is not available yet, so re-running the
# notebook does not trigger a fresh download every time.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}


Die heruntergeladenen Binärpakete sind in 
	/var/folders/kj/dkjqkk2n3wq2zfbttgdpjrj80000gn/T//Rtmptupr5b/downloaded_packages


In [24]:
# Packages used in this notebook (each one listed once).
list_of_packages <- c(
  "synthpop", "insight", "party", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "xgboost",
  "data.table"
)

# Attach every package and report all failures in one place; require() on
# its own only returns FALSE, which is easy to overlook in a printed list.
packages_loaded <- vapply(
  list_of_packages,
  function(pkg) require(pkg, character.only = TRUE),
  logical(1)
)
if (!all(packages_loaded)) {
  warning(
    "Could not load package(s): ",
    paste(list_of_packages[!packages_loaded], collapse = ", "),
    call. = FALSE
  )
}

Lade nötiges Paket: xgboost




Attache Paket: 'xgboost'


Das folgende Objekt ist maskiert 'package:dplyr':

    slice




## Data

In [11]:
# Load the CPS objects stored in cpspop.RData and read the preprocessed adult
# data; both files are located relative to the project root given by here().
load(file = paste0(here(), "/cpspop.RData"))
adult <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

# Entries coded as "?" are treated as missing; incomplete rows are dropped.
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Store the categorical columns as factors.
factor_columns <- c(
  "workclass", "education", "marital_status", "relationship",
  "race", "sex", "native_country", "income"
)
adult[factor_columns] <- lapply(adult[factor_columns], as.factor)

In [12]:
# Show the first rows of the cleaned adult data as a quick sanity check
head(adult)

Unnamed: 0_level_0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
Unnamed: 0_level_1,<int>,<fct>,<int>,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<fct>,<fct>
1,82,2,132870,11,6,3,1,4,0,0,4356,18,38,<=50K
2,54,2,140359,5,0,6,4,4,0,0,3900,40,38,<=50K
3,41,2,264663,15,5,9,3,4,0,0,3900,40,38,<=50K
4,34,2,216864,11,0,7,4,4,0,0,3770,45,38,<=50K
5,38,2,150601,0,5,0,4,4,1,0,3770,40,38,<=50K
6,74,5,88638,10,4,9,2,4,0,0,3683,20,38,>50K


### Helper functions

In [13]:
## eval cont targets
# Regression metrics for a numeric target.
#   predictions: numeric vector of predicted values
#   test_set:    data frame whose `income` column holds the observed values
# Returns a one-row data frame with MAE, MSE, RMSE, R_squared and MAPE
# (MAPE is expressed in percent).
evaluation_metrics_cont <- function(predictions, test_set) {
  observed <- test_set$income
  residual <- predictions - observed

  mean_sq_error <- mean(residual^2)

  # R^2 = 1 - residual sum of squares / total sum of squares
  rss <- sum(residual^2)
  tss <- sum((observed - mean(observed))^2)

  data.frame(
    MAE = mean(abs(residual)),
    MSE = mean_sq_error,
    RMSE = sqrt(mean_sq_error),
    R_squared = 1 - (rss / tss),
    MAPE = mean(abs(residual / observed)) * 100
  )
}

In [14]:
## eval factored targets
# Classification metrics for a factor target.
#   predictions: predicted classes
#   test_set:    data frame whose `income` column holds the observed classes
# Returns a one-row data frame with Accuracy, F1, Sensitivity and Specificity
# taken from the confusion matrix.
evaluation_metrics_factor <- function(predictions, test_set) {
  conf_mat <- confusionMatrix(
    data = predictions,
    reference = test_set$income,
    mode = "everything"
  )
  overall_stats <- conf_mat$overall
  class_stats <- conf_mat$byClass

  # Single-bracket indexing keeps the element names, so the resulting row is
  # labelled by the first named entry.
  data.frame(
    Accuracy = overall_stats["Accuracy"],
    F1 = class_stats["F1"],
    Sensitivity = class_stats["Sensitivity"],
    Specificity = class_stats["Specificity"]
  )
}

In [15]:
# Turn a data frame into a matrix, replacing every factor column by its
# numeric level codes first; all other columns are passed through unchanged.
convert_to_numeric_matrix <- function(df) {
  factor_to_codes <- function(column) {
    if (is.factor(column)) as.numeric(column) else column
  }

  # df[] keeps the data-frame structure while swapping out the columns
  df[] <- lapply(df, factor_to_codes)
  as.matrix(df)
}

In [29]:
# Tune and evaluate an XGBoost model on `data` with repeated k-fold
# cross-validation.
#
# For each of the `nrun` runs the rows are split into `k_fold` folds, every
# (eta, max_depth) pair of the grid is scored by its mean validation loss over
# the folds (squared error for a numeric target, multi-class log-loss for a
# factor target), the best pair is refitted on all rows and the fit is passed
# to evaluation_metrics_cont() / evaluation_metrics_factor().
#
# Arguments:
#   data    data frame with the target in column `income`; all other columns
#           are used as features (factors are replaced by their level codes).
#   nrun    number of repetitions; run i uses seed 1234 + i.
#   k_fold  number of cross-validation folds (at least 2).
#   steps   number of grid points requested for eta (0.01 to 0.3) and for
#           max_depth (3 to 9, rounded).
#   nrounds number of boosting rounds for every fitted model.
#
# Returns a list with
#   eval_avg          evaluation metrics averaged over all runs,
#   best_params_list  list of the chosen eta / max_depth, one entry per run.
#
# Side effects: calls set.seed() and prints progress.
simulation_xgboost <- function(data, nrun = 10, k_fold = 10, steps = 10,
                               nrounds = 100) {
  stopifnot(
    is.data.frame(data),
    "income" %in% names(data),
    nrun >= 1, k_fold >= 2, steps >= 1, nrounds >= 1,
    nrow(data) >= k_fold
  )

  # Hyper-parameter grid. round() can map neighbouring steps onto the same
  # depth, so duplicates are dropped to avoid refitting identical models.
  eta_values <- seq(0.01, 0.3, length.out = steps)
  max_depth_values <- unique(round(seq(3, 9, length.out = steps)))

  # Record the task type BEFORE the target is recoded to numbers; every later
  # branch depends on it.
  is_classification <- is.factor(data$income)
  if (is_classification) {
    print("Target is factor")
    truth <- droplevels(data$income)
    class_levels <- levels(truth)
    num_class <- length(class_levels)
    stopifnot(num_class >= 2)
    # xgboost expects class labels coded 0, ..., num_class - 1
    label <- as.integer(truth) - 1L
    base_params <- list(objective = "multi:softprob", num_class = num_class)
  } else {
    label <- as.numeric(data$income)
    base_params <- list(objective = "reg:squarederror")
  }

  # The feature matrix does not change across runs or folds: build it once.
  features <- convert_to_numeric_matrix(
    data[, !colnames(data) %in% "income", drop = FALSE]
  )
  full_dmatrix <- xgb.DMatrix(data = features, label = label)

  # Arrange multi-class predictions as an (observations x classes) matrix.
  # A flat prediction vector is ordered observation by observation; output
  # that already has dimensions is used as it is.
  as_prob_matrix <- function(pred) {
    if (is.null(dim(pred))) {
      matrix(pred, ncol = num_class, byrow = TRUE)
    } else {
      pred
    }
  }

  # Validation loss of one fitted model on one fold.
  fold_loss <- function(pred, y) {
    if (!is_classification) {
      return(mean((pred - y)^2))
    }
    epsilon <- 1e-15 # keeps log() finite for probabilities of exactly 0 or 1
    p_true <- as_prob_matrix(pred)[cbind(seq_along(y), y + 1L)]
    -mean(log(pmax(pmin(p_true, 1 - epsilon), epsilon)))
  }

  best_params_list <- vector("list", nrun)
  eval_list <- vector("list", nrun)

  for (i in seq_len(nrun)) {
    # Vary seed with each run
    set.seed(1234 + i)

    # Balanced random fold assignment: every fold is guaranteed to be non-empty
    fold_id <- sample(rep_len(seq_len(k_fold), nrow(data)))

    # cv_loss[e, d] accumulates the validation loss of
    # (eta_values[e], max_depth_values[d]) over the folds of this run
    cv_loss <- matrix(
      0,
      nrow = length(eta_values),
      ncol = length(max_depth_values)
    )

    for (j in seq_len(k_fold)) {
      in_val <- fold_id == j
      val_y <- label[in_val]
      train_dmatrix <- xgb.DMatrix(
        data = features[!in_val, , drop = FALSE],
        label = label[!in_val]
      )
      val_dmatrix <- xgb.DMatrix(
        data = features[in_val, , drop = FALSE],
        label = val_y
      )

      for (e in seq_along(eta_values)) {
        for (d in seq_along(max_depth_values)) {
          params <- c(
            base_params,
            list(eta = eta_values[e], max_depth = max_depth_values[d])
          )
          model <- xgb.train(
            params = params, data = train_dmatrix,
            nrounds = nrounds, verbose = 0
          )
          predictions <- predict(model, val_dmatrix)
          cv_loss[e, d] <- cv_loss[e, d] + fold_loss(predictions, val_y)
        }
      }
    }
    cv_loss <- cv_loss / k_fold

    # Grid cell with the smallest mean validation loss
    best_cell <- arrayInd(which.min(cv_loss), dim(cv_loss))
    best_eta <- eta_values[best_cell[1]]
    best_max_depth <- max_depth_values[best_cell[2]]
    best_params_list[[i]] <- list(eta = best_eta, max_depth = best_max_depth)

    # Refit on all rows with the selected parameters.
    # NOTE(review): as in the original code, the metrics below are computed on
    # the same rows the final model was fitted on, i.e. they are in-sample.
    final_model <- xgb.train(
      params = c(base_params, list(eta = best_eta, max_depth = best_max_depth)),
      data = full_dmatrix, nrounds = nrounds, verbose = 0
    )
    final_predictions <- predict(final_model, full_dmatrix)

    # Evaluation metrics; the helpers read the observed target from `$income`
    if (is_classification) {
      prob_matrix <- as_prob_matrix(final_predictions)
      predicted_class <- factor(
        class_levels[max.col(prob_matrix, ties.method = "first")],
        levels = class_levels
      )
      eval <- evaluation_metrics_factor(
        predicted_class,
        data.frame(income = truth)
      )
    } else {
      eval <- evaluation_metrics_cont(
        final_predictions,
        data.frame(income = label)
      )
    }

    eval_list[[i]] <- as.data.frame(eval)
    print(c("Run", i, "completed"))
  }

  # Average over all runs (element-wise on the metric data frames; the row
  # names of the first run are kept)
  eval_avg <- Reduce(`+`, eval_list) / length(eval_list)

  list(eval_avg = eval_avg, best_params_list = best_params_list)
}

In [30]:
# Run the simulation on the CPS data with the default settings
# (nrun = 10, k_fold = 10, steps = 10); cpspop is presumably provided by
# cpspop.RData loaded in the data section.
cps_res <- simulation_xgboost(cpspop)

In [None]:
# Run the simulation on the adult data with the default settings; the function
# defined in this notebook is simulation_xgboost().
adult_res <- simulation_xgboost(adult)

### Save results

In [None]:
# Bind results
boosting_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# File path for output
output_file <- "/user/emma.foessing01/u11969/results/boosting_pred_results.RData"
# Create the target directory if it does not exist yet
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
# Save the combined results to an RData file
save(boosting_pred_results, file = output_file)