## Packages

In [None]:
# Optional: use a user-specific library location (disabled)
#.libPaths("/user/emma.foessing01/u11969/new_R_libs")

# Pass -std=c++14 to the C++ compiler through the PKG_CXXFLAGS variable
Sys.setenv(PKG_CXXFLAGS = "-std=c++14")

# Record which R version produced the output
print(R.version.string)

# Packages used by this analysis; they are attached in exactly this order
list_of_packages <- c(
  "synthpop",
  "jsonlite",
  "codetools",
  "insight",
  "party",
  "haven",
  "dplyr",
  "rpart",
  "rpart.plot",
  "randomForest",
  "pROC",
  "caret",
  "pracma",
  "here",
  "Hmisc",
  "purrr",
  "ranger",
  "bnlearn",
  "arulesCBA",
  "network",
  "igraph",
  "xgboost",
  "data.table",
  "doParallel",
  "parallel",
  "ExtDist",
  "e1071"
)

# Attach one package by name, reporting problems via message() instead of
# aborting the script.
#
# p: package name as a length-one character string.
#
# Returns the value of library() when the package was attached, and NULL when
# the package is missing or failed to load.
load_if_installed <- function(p) {
  # Separate "not installed" from "installed but failed to load" so that the
  # message does not mislead (e.g. a missing system library or a broken
  # dependency is not an installation problem of the package itself).
  if (length(find.package(p, quiet = TRUE)) == 0L) {
    message(sprintf("Package '%s' is not installed.", p))
    return(invisible(NULL))
  }

  tryCatch(
    library(p, character.only = TRUE),
    error = function(e) {
      # Keep the underlying reason; it is needed to diagnose the failure
      message(sprintf(
        "Package '%s' could not be loaded: %s", p, conditionMessage(e)
      ))
      invisible(NULL)
    }
  )
}

# Load all required packages. Packages that cannot be loaded are reported via
# message() by load_if_installed() and skipped, so this call does not stop on a
# missing package. The list returned by lapply() is not assigned, so it is
# printed when the cell is run interactively.
lapply(list_of_packages, load_if_installed)

## Data

In [None]:
# Load the CPS data; the .RData file is expected to supply the `cpspop` object
# that is used immediately below
load(file = file.path(here(), "cpspop.RData"))

# Reorder columns so that income, race, marital and educ come last (in that
# order); all other columns keep their original relative order
cpspop <- local({
  trailing_cols <- c("income", "race", "marital", "educ")
  leading_cols <- setdiff(names(cpspop), trailing_cols)
  cpspop[, c(leading_cols, trailing_cols)]
})

adult <- read.csv(file = file.path(here(), "adult_preprocessed.csv"))

# Treat "?" entries as missing, then drop every row with a missing value
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors
adult <- local({
  categorical_cols <- c(
    "workclass", "education", "marital_status", "relationship", "race",
    "sex", "native_country", "income", "occupation"
  )
  adult[categorical_cols] <- lapply(adult[categorical_cols], as.factor)
  adult
})

# Keep only the modelling columns, in a fixed order
adult <- adult[, c(
  "age", "fnlwgt", "capital_gain", "capital_loss", "hours_per_week",
  "income", "sex", "race", "relationship", "marital_status",
  "workclass", "occupation", "education", "native_country"
)]

# Store integer columns as doubles; every other column is left untouched
adult[] <- lapply(adult, function(column) {
  if (!is.integer(column)) {
    return(column)
  }
  as.numeric(column)
})

## Eval Functions

In [None]:
## Calculate evaluation metrics for continuous targets
##
## predictions: numeric predictions, either one per observed value or a single
##              value that is applied to every observation.
## test_set:    data frame (or list) whose `income` element holds the observed
##              numeric target values.
##
## Returns a one-row data frame with the numeric columns MAE, MSE, RMSE,
## R_squared and MAPE. R_squared is NA when the observed values have zero
## variance; MAPE is NA when any observed value is zero. Both are also NA when
## the inputs contain missing values.
## Stops when the number of predictions is neither 1 nor the number of
## observed values (this would otherwise be recycled silently).
evaluation_metrics_cont <- function(predictions, test_set){
    actual <- test_set$income

    # Guard against silent recycling of mismatched inputs
    n_pred <- length(predictions)
    n_actual <- length(actual)
    if (n_pred != n_actual && n_pred != 1L) {
        stop(
            sprintf(
                "`predictions` has length %d but `test_set$income` has length %d.",
                n_pred, n_actual
            ),
            call. = FALSE
        )
    }

    # Residuals
    residuals <- predictions - actual

    # Mean Absolute Error (MAE)
    MAE <- mean(abs(residuals))

    # Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
    MSE <- mean(residuals^2)
    RMSE <- sqrt(MSE)

    # R-squared: undefined when the target has zero variance.
    # NA_real_ (not logical NA) keeps the column numeric in every case.
    SS_res <- sum(residuals^2)
    SS_tot <- sum((actual - mean(actual))^2)
    R_squared <- if (is.na(SS_tot) || SS_tot == 0) {
        NA_real_
    } else {
        1 - (SS_res / SS_tot)
    }

    # Mean Absolute Percentage Error (MAPE): undefined when any observed value
    # is zero (division by zero); NA observed values also yield NA.
    has_zero_actual <- any(actual == 0)
    MAPE <- if (is.na(has_zero_actual) || has_zero_actual) {
        NA_real_
    } else {
        mean(abs(residuals / actual)) * 100
    }

    data.frame(
        MAE = MAE,
        MSE = MSE,
        RMSE = RMSE,
        R_squared = R_squared,
        MAPE = MAPE
    )
}
## Calculate evaluation metrics for factored targets
##
## predictions: predicted class labels (factor, character or anything that
##              as.factor() accepts), one per row of `test_set`.
## test_set:    object coercible to a data frame whose `income` column holds
##              the observed class labels.
##
## Predictions and observations are matched BY LABEL. The common level set is
## the reference levels followed by any labels that occur only in the
## predictions, so the first reference level stays the first level passed to
## caret::confusionMatrix().
##
## Returns a one-row data frame with Accuracy, F1, Sensitivity and Specificity.
## With exactly two levels the values are taken directly from the confusion
## matrix; with more levels the per-class values are averaged (NA removed).
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)

    reference <- as.factor(test_set$income)
    predictions <- as.factor(predictions)

    if (length(predictions) != length(reference)) {
        stop(
            sprintf(
                "`predictions` has length %d but `test_set$income` has length %d.",
                length(predictions), length(reference)
            ),
            call. = FALSE
        )
    }

    # Align the two factors by label. Assigning levels(predictions) <- ...
    # would rename levels by position and silently relabel the predictions
    # whenever a class is missing from them (e.g. a model that only ever
    # predicts the second class would be scored as predicting the first).
    extra_labels <- setdiff(levels(predictions), levels(reference))
    if (length(extra_labels) > 0) {
        warning(
            "Predictions contain labels that do not occur in the reference: ",
            paste(extra_labels, collapse = ", "),
            call. = FALSE
        )
    }
    shared_levels <- c(levels(reference), extra_labels)
    reference <- factor(as.character(reference), levels = shared_levels)
    predictions <- factor(as.character(predictions), levels = shared_levels)

    # Confusion matrix for the prediction on original data
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    # Saving evaluation metrics
    accuracy <- cm$overall['Accuracy']

    if (length(shared_levels) == 2) {
        # Binary classification: byClass is a named vector
        f1 <- cm$byClass['F1']
        sens <- cm$byClass['Sensitivity']
        spec <- cm$byClass['Specificity']
    } else {
        # Multi-class classification: byClass has one row per class;
        # average each metric over the classes
        f1 <- mean(cm$byClass[,'F1'], na.rm = TRUE)
        sens <- mean(cm$byClass[,'Sensitivity'], na.rm = TRUE)
        spec <- mean(cm$byClass[,'Specificity'], na.rm = TRUE)
    }

    # Create the dataframe
    data.frame(
        Accuracy = accuracy,
        F1 = f1,
        Sensitivity = sens,
        Specificity = spec
    )
}

## SVM pred Function

In [None]:
# Nested cross-validation of a radial-kernel SVM (caret method "svmRadial")
# predicting the binary target `income`.
#
# data:        data frame with an `income` column that has exactly two classes;
#              all other columns are used as predictors.
# outer_folds: number of outer folds used for performance estimation.
# cost_steps:  number of candidate cost values C, log-spaced from 0.001 to 100
#              (sigma is fixed at 0.1).
# inner_folds: number of inner CV folds used to choose C on each outer
#              training set.
#
# For every outer fold: constant predictors are removed, factor levels of the
# test part are aligned with the training part, predictors are dummy-coded,
# near-zero-variance dummy columns are removed, C is tuned by inner CV, the
# model is refitted with the selected C and scored on the outer test part with
# evaluation_metrics_factor().
#
# Returns a one-row data frame with the metrics averaged over all outer folds
# that completed. Folds that raise an error are reported and skipped; the
# function stops if every fold fails or if `income` is not binary.
svm_pred <- function(data, outer_folds, cost_steps, inner_folds) {
    # Rename the levels of the 'income' variable to valid R variable names.
    # The renaming is positional, so it is only meaningful for two levels.
    data$income <- as.factor(data$income)
    if (nlevels(data$income) != 2) {
        stop(
            "svm_pred() requires a binary 'income' target; found ",
            nlevels(data$income), " level(s).",
            call. = FALSE
        )
    }
    levels(data$income) <- c("LessThan50K", "GreaterThan50K")

    # Use twoClassSummary for binary classification
    summaryFunctionType <- twoClassSummary

    # Set control args
    outer_control <- trainControl(
        method = "none", #no extra cross validation, nested CV manually
        summaryFunction = summaryFunctionType,
        classProbs = TRUE,
        verboseIter = FALSE,
        allowParallel = FALSE
    )

    inner_control <- trainControl(
        method = "cv",
        number = inner_folds,
        summaryFunction = summaryFunctionType,
        classProbs = TRUE,
        verboseIter = FALSE,
        allowParallel = FALSE#,
        #sampling = "up"  # Handle class imbalance
    )

    # Define the grid for hyperparameter tuning
    cost_values <- 10^seq(log10(0.001), log10(100), length.out = cost_steps)
    tunegrid <- expand.grid(C = cost_values, sigma = 0.1)

    # Create stratified outer CV folds
    outer_cv_folds <- createFolds(data$income, k = outer_folds, returnTrain = FALSE, list = TRUE)

    # Records of failed folds (filled from the error handler below)
    failed_combinations <- list()

    # Sequential outer loop: Cross-validation for model evaluation
    outer_results <- lapply(seq_along(outer_cv_folds), function(i) {
        tryCatch({
            # Split data into outer folds
            outer_test_index <- outer_cv_folds[[i]]
            outer_testData <- data[outer_test_index, ]
            outer_trainData <- data[-outer_test_index, ]

            # Diagnostic messages
            cat("\nFold", i, ":\n")
            cat("Number of observations in training data:", nrow(outer_trainData), "\n")
            cat("Number of observations in test data:", nrow(outer_testData), "\n")

            # Exclude the target variable from the constant columns check.
            # drop = FALSE keeps a data frame even with a single predictor.
            predictor_columns <- setdiff(names(outer_trainData), "income")
            constant_columns <- vapply(
                outer_trainData[, predictor_columns, drop = FALSE],
                function(x) length(unique(x)) == 1,
                logical(1)
            )
            cat("Number of constant predictors (excluding target):", sum(constant_columns), "\n")

            if (any(constant_columns)) {
                outer_trainData <- outer_trainData[, c("income", predictor_columns[!constant_columns])]
                outer_testData <- outer_testData[, c("income", predictor_columns[!constant_columns])]
                warning("Constant columns removed before model fitting.")
            }

            cat("Number of predictors after removing constants:", length(predictor_columns[!constant_columns]), "\n")

            # Ensure 'income' is a factor and levels are valid R variable names
            outer_trainData$income <- as.factor(outer_trainData$income)
            outer_testData$income <- as.factor(outer_testData$income)
            levels(outer_trainData$income) <- make.names(levels(outer_trainData$income))
            levels(outer_testData$income) <- levels(outer_trainData$income)  # Ensure consistency

            # Print the levels of the target variable
            cat("Levels of target variable 'income' in training data:\n")
            print(levels(outer_trainData$income))

            # Drop unused factor levels in training data
            outer_trainData <- droplevels(outer_trainData)

            # Align factor levels in test data to match those in training data.
            # Test values whose level does not occur in training become NA.
            factor_vars <- names(outer_trainData)[vapply(outer_trainData, is.factor, logical(1))]
            factor_vars <- setdiff(factor_vars, "income")  # Exclude the target variable
            for (var in factor_vars) {
                outer_testData[[var]] <- factor(outer_testData[[var]], levels = levels(outer_trainData[[var]]))
            }

            # Check class distribution
            cat("Class distribution in training data:\n")
            print(table(outer_trainData$income))

            # Create dummy variables for predictor variables
            predictor_vars <- predictor_columns[!constant_columns]
            predictor_vars <- setdiff(predictor_vars, "income")  # Ensure 'income' is excluded

            dummies <- dummyVars(~ ., data = outer_trainData[, predictor_vars, drop = FALSE], fullRank = TRUE)
            outer_trainData_dum <- data.frame(predict(dummies, newdata = outer_trainData[, predictor_vars, drop = FALSE]))
            outer_trainData_dum$income <- outer_trainData$income

            outer_testData_dum <- data.frame(predict(dummies, newdata = outer_testData[, predictor_vars, drop = FALSE]))
            outer_testData_dum$income <- outer_testData$income

            # Remove near-zero-variance predictors. The check runs on the
            # predictor columns only, so the target can never be removed
            # (a strongly imbalanced 'income' would otherwise qualify).
            predictor_idx <- which(names(outer_trainData_dum) != "income")
            nzv <- predictor_idx[nearZeroVar(outer_trainData_dum[, predictor_idx, drop = FALSE])]
            if(length(nzv) > 0) {
                # Record names of variables to be removed
                nzv_vars <- names(outer_trainData_dum)[nzv]
                cat("Removing zero variance predictors:\n")
                print(nzv_vars)
                outer_trainData_dum <- outer_trainData_dum[ , -nzv, drop=FALSE]
                outer_testData_dum <- outer_testData_dum[ , -nzv, drop=FALSE]
            }

            # Ensure that the predictors in train and test data match
            if(!identical(names(outer_trainData_dum), names(outer_testData_dum))) {
                stop("Predictor names in training and test data do not match after preprocessing.")
            }

            # Check for variables with empty names after preprocessing
            if (any(names(outer_trainData_dum) == "")) {
                stop("There are variables with empty names after preprocessing.")
            }

            # Test rows with missing values (e.g. a factor level that does not
            # occur in the training part) cannot be scored: predict() would
            # drop them and the predictions would no longer line up with the
            # observed classes. Remove them explicitly and report how many.
            incomplete_test_rows <- !complete.cases(outer_testData_dum)
            if (any(incomplete_test_rows)) {
                warning(sprintf(
                    "Fold %d: dropped %d test row(s) with missing values (e.g. factor levels unseen in the training data).",
                    i, sum(incomplete_test_rows)
                ))
                outer_testData_dum <- outer_testData_dum[!incomplete_test_rows, , drop = FALSE]
            }

            # Ensure 'income' remains a factor with valid levels
            outer_trainData_dum$income <- as.factor(outer_trainData_dum$income)
            outer_testData_dum$income <- as.factor(outer_testData_dum$income)
            levels(outer_trainData_dum$income) <- make.names(levels(outer_trainData_dum$income))
            levels(outer_testData_dum$income) <- levels(outer_trainData_dum$income)

            # Hyperparameter tuning using inner CV.
            # twoClassSummary reports ROC, Sens and Spec, so the selection
            # metric is named explicitly instead of relying on caret's
            # fallback from the default "Accuracy".
            model <- caret::train(
                income ~ .,
                data = outer_trainData_dum,
                method = "svmRadial",
                tuneGrid = tunegrid,
                trControl = inner_control,
                metric = "ROC"
                #preProcess = c("center", "scale")
            )

            # Store the best hyperparameters
            best_hyperparameters <- model$bestTune

            # Train the final model on the outer training set with the best hyperparameters
            final_model <- caret::train(
                income ~ .,
                data = outer_trainData_dum,
                method = "svmRadial",
                trControl = outer_control,
                tuneGrid = best_hyperparameters,
                metric = "ROC"
                #preProcess = c("center", "scale")
            )

            # Testing the final model on the outer test set
            predictions <- predict(final_model, newdata = outer_testData_dum)

            # Evaluate the model
            eval <- evaluation_metrics_factor(predictions, outer_testData_dum)

            # Return the evaluation metrics for this outer fold
            return(eval)
        }, error = function(e) {
            # Record the failed fold together with the full tuning grid that
            # was in use (the individual combination that failed is not known
            # here). `<<-` updates the list in the enclosing svm_pred() call.
            failed_combinations[[length(failed_combinations) + 1]] <<- list(
                fold = i,
                C = tunegrid$C,
                sigma = tunegrid$sigma,
                error = conditionMessage(e)
            )
            cat("Error in fold ", i, " for hyperparameters C =", tunegrid$C, "and sigma =", tunegrid$sigma, "\n")
            message("Error in fold ", i, ": ", conditionMessage(e))
            return(NULL)  # Returning NULL for failed folds
        })
    })

    # Filter out any failed folds (NULL values)
    outer_results <- outer_results[!vapply(outer_results, is.null, logical(1))]

    if (length(outer_results) == 0) {
        stop("No valid results from cross-validation. All folds failed.")
    }

    # Convert list of results to a dataframe
    outer_results_df <- do.call(rbind, outer_results)

    # Average the evaluation metrics over the outer folds
    eval_avg_outer_folds <- outer_results_df %>%
                            summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))

    # Print failed combinations
    if (length(failed_combinations) > 0) {
        cat("Failed hyperparameter combinations:\n")
        print(failed_combinations)
    }

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}


## Apply

In [None]:
# Numeric identifier that is part of the input and output file names
s <- 1235

# Read <project root>/results/adult_svm_<s>.rds
syndata <- readRDS(file.path(here(), "results", paste0("adult_svm_", s, ".rds")))

In [None]:
# Nested CV: 5 outer folds, 10 candidate cost values, 3 inner folds
svm_eval <- svm_pred(
  data = syndata,
  outer_folds = 5,
  cost_steps = 10,
  inner_folds = 3
)

In [None]:
# Once the evaluation has finished, store the result as
# <project root>/results/adult_svm_eval_<s>.RData
save(
  svm_eval,
  file = file.path(here(), "results", paste0("adult_svm_eval_", s, ".RData"))
)

In [None]:
# Display the metrics returned by svm_pred() (averages over the outer folds)
svm_eval