## Packages

In [None]:
# Optional: use a custom library location (disabled)
#.libPaths("/user/emma.foessing01/u11969/new_R_libs")

# Set the PKG_CXXFLAGS environment variable to request the C++14 standard
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Record the running R version in the output
print(R.version.string)

# Names of the packages this script attaches, in attach order
list_of_packages <- c(
  "synthpop", "jsonlite", "codetools", "insight", "party",
  "haven", "dplyr", "rpart", "rpart.plot", "randomForest",
  "pROC", "caret", "pracma", "here", "Hmisc",
  "purrr", "ranger", "bnlearn", "arulesCBA", "network",
  "igraph", "xgboost", "data.table", "doParallel", "parallel",
  "ExtDist", "e1071"
)

#' Attach a package, reporting (instead of aborting) when that is not possible.
#'
#' A package can be installed and still fail to load (for example because one
#' of its dependencies is broken), so the two situations are reported with
#' different messages and the underlying error text is shown.
#'
#' @param p Package name as a single character string.
#' @return The value of `library()` when the package was attached, otherwise
#'   `NULL` (invisibly) after emitting an explanatory message.
load_if_installed <- function(p) {
  # find.package() returns character(0) when the package cannot be located
  if (length(find.package(p, quiet = TRUE)) == 0L) {
    message(sprintf("Package '%s' is not installed.", p))
    return(invisible(NULL))
  }

  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
      invisible(NULL)
    }
  )
}

# Try to attach each listed package; a package that cannot be attached only
# produces a message and does not stop the run
lapply(X = list_of_packages, FUN = load_if_installed)

## Data

In [None]:
# CPS data: the .RData file is expected to provide the object `cpspop`
load(file = paste0(here(), "/cpspop.RData"))

# Move income, race, marital and educ to the end of the column order
cpspop <- cpspop[, c(
  setdiff(names(cpspop), c("income", "race", "marital", "educ")),
  "income", "race", "marital", "educ"
)]

# Adult data: read, clean and type the columns in one self-contained step
adult <- local({
  census <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

  # "?" marks a missing entry; rows containing any missing value are dropped
  census[census == "?"] <- NA
  census <- na.omit(census)

  # Categorical variables are stored as factors
  categorical <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  census[categorical] <- lapply(census[categorical], as.factor)

  # Keep the modelling columns in a fixed order
  census <- census[, c(
    "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
    "income", "sex", "race", "relationship", "marital_status",
    "workclass", "occupation", "education", "native_country"
  )]

  # Integer columns are stored as double; all other columns stay untouched
  census[] <- lapply(census, function(column) {
    if (!is.integer(column)) {
      return(column)
    }
    as.numeric(column)
  })

  census
})

## Eval Functions

In [None]:
#' Calculate evaluation metrics for a continuous target
#'
#' @param predictions Numeric vector of predicted values.
#' @param test_set Data frame (or list) holding the observed values in its
#'   `income` element.
#' @return A one-row data frame with the numeric columns `MAE`, `MSE`, `RMSE`,
#'   `R_squared` and `MAPE`. `R_squared` is `NA` when the observed values have
#'   zero total sum of squares; `MAPE` is `NA` when any observed value is zero.
#'   Both are always of type double so results from several folds can be
#'   combined without type changes.
evaluation_metrics_cont <- function(predictions, test_set) {
    observed <- test_set$income

    # Without this check R would silently recycle the shorter vector
    if (length(predictions) != length(observed)) {
        stop("`predictions` and `test_set$income` must have the same length.",
             call. = FALSE)
    }

    residuals <- predictions - observed

    # Mean absolute error, mean squared error and its root
    MAE <- mean(abs(residuals))
    MSE <- mean(residuals^2)
    RMSE <- sqrt(MSE)

    # R-squared is undefined when the observed values do not vary
    SS_res <- sum(residuals^2)
    SS_tot <- sum((observed - mean(observed))^2)
    R_squared <- if (is.na(SS_tot) || SS_tot == 0) {
        NA_real_
    } else {
        1 - (SS_res / SS_tot)
    }

    # MAPE divides by the observed values, so a zero makes it undefined
    has_zero <- any(observed == 0)
    MAPE <- if (is.na(has_zero) || has_zero) {
        NA_real_
    } else {
        mean(abs(residuals / observed)) * 100
    }

    data.frame(
        MAE = MAE,
        MSE = MSE,
        RMSE = RMSE,
        R_squared = R_squared,
        MAPE = MAPE
    )
}
#' Calculate evaluation metrics for a factor (categorical) target
#'
#' @param predictions Predicted class labels (factor or coercible to factor).
#' @param test_set Data frame (or object coercible to one) holding the
#'   observed class labels in its `income` column.
#' @return A one-row data frame with the columns `Accuracy`, `F1`,
#'   `Sensitivity` and `Specificity`. With two classes the per-class values
#'   reported by `caret::confusionMatrix()` are used; with more classes the
#'   per-class values are averaged, ignoring `NA`.
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)

    reference <- as.factor(test_set$income)
    predictions <- as.factor(predictions)

    # Align both factors by LABEL. Assigning `levels(predictions) <- ...`
    # would relabel positionally: predictions containing only the second
    # class would all be renamed to the first one. The reference levels come
    # first so that their order (and thus the first level) is kept.
    class_levels <- union(levels(reference), levels(predictions))
    reference <- factor(as.character(reference), levels = class_levels)
    predictions <- factor(as.character(predictions), levels = class_levels)

    # Confusion matrix of predictions against the observed labels
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    accuracy <- cm$overall["Accuracy"]

    if (length(class_levels) == 2) {
        # Binary classification: byClass is a named vector
        f1 <- cm$byClass["F1"]
        sens <- cm$byClass["Sensitivity"]
        spec <- cm$byClass["Specificity"]
    } else {
        # Multi-class classification: byClass has one row per class, so the
        # metrics are averaged over the classes
        f1 <- mean(cm$byClass[, "F1"], na.rm = TRUE)
        sens <- mean(cm$byClass[, "Sensitivity"], na.rm = TRUE)
        spec <- mean(cm$byClass[, "Specificity"], na.rm = TRUE)
    }

    data.frame(
        Accuracy = accuracy,
        F1 = f1,
        Sensitivity = sens,
        Specificity = spec
    )
}

## SVM pred Function

In [None]:
#' Nested cross-validation of a radial-kernel SVM that predicts `income`
#'
#' The outer folds estimate predictive performance; within each outer training
#' set an inner cross-validation selects the cost parameter. The outer folds
#' are processed in parallel on a temporary cluster.
#'
#' @param data Data frame with the target column `income` and the predictors.
#'   The categorical adult-data columns that are present are converted to
#'   factors. A numeric `income` is kept numeric (regression); any other type
#'   is converted to a factor (classification).
#' @param outer_folds Number of outer folds used for evaluation.
#' @param cost_steps Number of cost values in the tuning grid, spaced evenly
#'   on the log10 scale between 0.001 and 100.
#' @param inner_folds Number of inner folds used for hyperparameter tuning.
#' @param sigma Kernel width value(s) placed in the tuning grid.
#' @return A one-row data frame with the evaluation metrics averaged over the
#'   outer folds (`NA` values are ignored when averaging).
svm_pred <- function(data, outer_folds, cost_steps, inner_folds, sigma = 0.1) {
    if (!"income" %in% names(data)) {
        stop("`data` must contain an `income` column.", call. = FALSE)
    }

    # Ensure factor type for the categorical predictors that are present
    categorical <- intersect(
        c("workclass", "education", "marital_status", "relationship",
          "race", "sex", "native_country", "occupation"),
        names(data)
    )
    for (column in categorical) {
        data[[column]] <- as.factor(data[[column]])
    }

    # Converting a numeric target to a factor would make the regression
    # branches below unreachable, so only non-numeric targets are converted
    is_regression <- is.numeric(data$income)
    if (!is_regression) {
        data$income <- as.factor(data$income)
    }

    # Resampling summary matching the target type
    summaryFunctionType <- if (is_regression) defaultSummary else multiClassSummary

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Tuning grid: log-spaced cost values combined with the given sigma
    cost_values <- 10^seq(log10(0.001), log10(100), length.out = cost_steps)
    tunegrid <- expand.grid(C = cost_values, sigma = sigma)

    # Row indices of the held-out observations of each outer fold
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Leave one core free, but never ask for fewer than one worker or for
    # more workers than there are folds (detectCores() may return NA)
    available_cores <- detectCores()
    num_cores <- if (is.na(available_cores)) 1L else max(1L, available_cores - 1L)
    num_cores <- max(1L, min(num_cores, length(outer_cv_folds)))

    cl <- makeCluster(num_cores)
    # Release the workers and restore the sequential backend even when a
    # fold fails
    on.exit({
        stopCluster(cl)
        registerDoSEQ()
    }, add = TRUE)
    registerDoParallel(cl)

    # Outer loop: one task per outer fold, results stacked row-wise
    outer_results <- foreach(
        i = seq_along(outer_cv_folds),
        .combine = rbind,
        .packages = c("caret", "dplyr"),
        .export = c("evaluation_metrics_cont", "evaluation_metrics_factor")
    ) %dopar% {
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Inner CV selects the hyperparameters. train() then refits the
        # selected configuration on the complete outer training set, so this
        # object is already the final model for the fold and no second
        # training run is needed.
        model <- caret::train(income ~ .,
                              data = outer_trainData,
                              method = "svmRadial",
                              tuneGrid = tunegrid,
                              trControl = inner_control)

        # Evaluate on the held-out outer fold
        predictions <- predict(model, newdata = outer_testData)

        if (is_regression) {
            evaluation_metrics_cont(predictions, outer_testData)
        } else {
            evaluation_metrics_factor(predictions, outer_testData)
        }
    }

    # Average the evaluation metrics over the outer folds
    outer_results %>%
        summarise(across(everything(), function(metric) mean(metric, na.rm = TRUE)))
}



## Apply

In [None]:
# Identifier that selects which result file is read and written
s <- 1237

# Read the stored data set for that identifier
syndata <- readRDS(
  file = paste0(here(), "/results/", "adult", "_svm_", as.character(s), ".rds")
)

In [None]:
# Nested CV: 5 outer folds, 10 cost values, 3 inner folds
svm_eval <- svm_pred(
  data = syndata,
  outer_folds = 5,
  cost_steps = 10,
  inner_folds = 3
)

In [None]:
# Store the evaluation result once the evaluation has finished successfully
save(
  svm_eval,
  file = paste0(here(), "/results/adult_svm_eval_", as.character(s), ".RData")
)

In [None]:
# Display the evaluation result
svm_eval