# MLP prediction function

### Packages

In [1]:
# Packages required by this notebook; each name is listed exactly once so that
# no package is checked and attached twice.
list_of_packages <- c(
  "synthpop", "insight", "party", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc",
  "xgboost", "data.table", "RSNNS", "nnet"
)

# Attach package `p` (given as a character string), installing it together
# with its dependencies first when its namespace cannot be loaded.
install_if_missing <- function(p) {
  namespace_available <- requireNamespace(p, quietly = TRUE)
  if (isFALSE(namespace_available)) {
    install.packages(p, dependencies = TRUE)
  }
  # character.only = TRUE makes library() treat `p` as a string, not a symbol
  library(p, character.only = TRUE)
}

# Run the install-and-attach helper once for every required package
lapply(X = list_of_packages, FUN = install_if_missing)

Find out more at https://www.synthpop.org.uk/

Lade nötiges Paket: grid

Lade nötiges Paket: mvtnorm

Lade nötiges Paket: modeltools

Lade nötiges Paket: stats4

Lade nötiges Paket: strucchange

Lade nötiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade nötiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade nötiges Paket: ggplot2


Attache Paket: 

### Data

In [2]:
# Load the saved workspace objects and the preprocessed adult data from the
# project root reported by here()
project_root <- here()
load(file = paste0(project_root, "/cpspop.RData"))
adult <- read.csv(file = paste0(project_root, "/adult_preprocessed.csv"))

# "?" marks a missing value: recode it to NA, then drop incomplete rows
adult[adult == "?"] <- NA
adult <- na.omit(adult)

# Convert the categorical columns to factors in one pass
factor_columns <- c(
  "workclass", "education", "marital_status", "relationship", "race",
  "sex", "native_country", "income", "occupation"
)
adult[factor_columns] <- lapply(adult[factor_columns], as.factor)

### Helper functions

In [3]:
## Evaluation metrics for a continuous target
# predictions: numeric vector of predicted values
# test_set:    data frame whose `income` column holds the observed values
# Returns a one-row data frame with MAE, MSE, RMSE, R_squared and MAPE (in %).
evaluation_metrics_cont <- function(predictions, test_set){
  observed <- test_set$income
  residual <- observed - predictions

  squared_error <- residual^2
  mean_squared_error <- mean(squared_error)

  # R^2 = 1 - residual sum of squares / total sum of squares
  total_sum_squares <- sum((observed - mean(observed))^2)
  r_squared <- 1 - (sum(squared_error) / total_sum_squares)

  data.frame(
    MAE = mean(abs(residual)),
    MSE = mean_squared_error,
    RMSE = sqrt(mean_squared_error),
    R_squared = r_squared,
    MAPE = mean(abs(residual / observed)) * 100
  )
}

In [4]:
## Evaluation metrics for a factor target
# predictions: predicted classes
# test_set:    data frame whose `income` column holds the observed classes
# Returns a data frame with the columns Accuracy, F1, Sensitivity, Specificity.
evaluation_metrics_factor <- function(predictions, test_set){
    # Confusion matrix of predicted vs. observed classes
    cm <- confusionMatrix(predictions, test_set$income, mode = "everything")

    overall_stats <- cm$overall
    class_stats <- cm$byClass

    # Pull the individual statistics out by name and collect them
    data.frame(
        Accuracy = overall_stats["Accuracy"],
        F1 = class_stats["F1"],
        Sensitivity = class_stats["Sensitivity"],
        Specificity = class_stats["Specificity"]
    )
}

In [5]:
# Nested cross-validation for a neural network (caret method "nnet") that
# predicts `income` from all other columns of `data`.
#
# Arguments:
#   data         data frame with a numeric or factor column `income`
#   outer_folds  number of outer folds used for performance estimation
#   size_steps   number of candidate `size` values, evenly spaced over 1..10
#   decay_steps  number of candidate `decay` values, log-spaced over 1e-4..1e-2
#   inner_folds  number of CV folds train() uses for hyperparameter tuning
#
# Returns a one-row data frame holding the evaluation metrics averaged over
# the outer folds. Stops with an error if `income` is neither numeric nor factor.
mlp_pred <- function(data, outer_folds, size_steps, decay_steps, inner_folds){
    # Fail before any training if the target type is not supported
    target_is_numeric <- is.numeric(data$income)
    if (!target_is_numeric && !is.factor(data$income)) {
        stop("The predicted target has to be numeric or factor.")
    }

    # adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (target_is_numeric) defaultSummary else multiClassSummary
    # metric: train() uses per default RMSE and Accuracy for numeric and factored targets

    # set control args
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    inner_control <- trainControl(method = "cv", number = inner_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the grid for hyperparameter tuning
    # NOTE(review): size_values is only integer-valued for some size_steps
    # (e.g. 10); confirm how nnet treats fractional sizes before changing it.
    size_values <- seq(1, 10, length.out = size_steps)
    decay_values <- 10^seq(log10(0.0001), log10(0.01), length.out = decay_steps)

    # Create grid
    tunegrid <- expand.grid(size = size_values, decay = decay_values)

    # One list of held-out row indices per outer fold
    outer_cv_folds <- createFolds(data$income, k = outer_folds)

    # Preallocate one result slot per outer fold
    outer_results <- vector("list", length(outer_cv_folds))

    # Outer loop: Cross-validation for model evaluation.
    # Iterate over the folds themselves: `outer_folds` is a single number, so
    # seq_along(outer_folds) would run the loop only once.
    for (i in seq_along(outer_cv_folds)) {

        # Split data into outer folds
        outer_test_index <- outer_cv_folds[[i]]
        outer_testData <- data[outer_test_index, ]
        outer_trainData <- data[-outer_test_index, ]

        # Hyperparameter tuning using inner CV
        # No need for inner loop because "train" does k-fold CV already
        mlp_model <- caret::train(income ~ .,
                                  data = outer_trainData,
                                  method = "nnet",
                                  tuneGrid = tunegrid,
                                  trControl = inner_control)

        # Store the best hyperparameters
        best_hyperparameters <- mlp_model$bestTune
        print("best HP")
        print(best_hyperparameters)

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ .,
                                    data = outer_trainData,
                                    method = "nnet",
                                    trControl = outer_control,
                                    tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)

        # Store the evaluation metrics for this outer fold
        outer_results[[i]] <- if (target_is_numeric) {
            evaluation_metrics_cont(predictions, outer_testData)
        } else {
            evaluation_metrics_factor(predictions, outer_testData)
        }
    }

    # Average the evaluation metrics over the outer folds.
    # The anonymous function replaces passing `na.rm` through across()'s `...`,
    # which is deprecated in current dplyr.
    eval_avg_outer_folds <- do.call(rbind, outer_results) %>%
        summarise(across(everything(), function(x) mean(x, na.rm = TRUE)))

    # Return the average evaluation metrics
    return(eval_avg_outer_folds)
}

In [6]:
# Nested CV on the adult data: 5 outer and 5 inner folds, 10 x 10 tuning grid
res <- mlp_pred(
  data = adult,
  outer_folds = 5,
  size_steps = 10,
  decay_steps = 10,
  inner_folds = 5
)

# weights:  86
initial  value 17835.609847 
final  value 10832.187275 
converged
# weights:  171
initial  value 13579.916589 
iter  10 value 10694.931363
final  value 10694.873243 
converged
# weights:  256
initial  value 18828.392069 
final  value 10832.189731 
converged
# weights:  341
initial  value 18384.463150 
iter  10 value 10715.580988
final  value 10705.936230 
converged
# weights:  426
initial  value 12180.498043 
iter  10 value 10697.106099
final  value 10696.143318 
converged
# weights:  511
initial  value 12088.117689 
iter  10 value 10773.765994
final  value 10773.747399 
converged
# weights:  596
initial  value 10842.529966 
iter  10 value 10737.825367
final  value 10737.438562 
converged
# weights:  681
initial  value 10659.195273 
iter  10 value 10250.187379
iter  20 value 9679.335737
iter  30 value 9014.297759
iter  40 value 8843.625060
iter  50 value 8357.883816
iter  60 value 8196.014430
iter  70 value 8174.713983
iter  80 value 8165.255274
iter  90 value 8151.70028

### Save the results

In [None]:
# Bind results
# NOTE(review): `cps_res` and `adult_res` are not assigned in the visible code
# (the mlp_pred() call stores its result in `res`) - confirm they exist
# before running this cell.
mlp_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# Save the results to an RData file.
# The path is built from here(), as for the input data: `dirname` is never
# assigned in the visible code and would resolve to the base R function
# dirname(), which paste0() cannot turn into a string.
save(mlp_pred_results, file = paste0(here(), "/results/mlp_pred_results.RData"))