# MLP prediction function

### Packages

In [3]:
# Packages attached for this notebook, in attach order (duplicates removed).
list_of_packages <- c(
  "synthpop", "insight", "party", "dplyr", "rpart", "rpart.plot",
  "randomForest", "pROC", "caret", "pracma", "here", "Hmisc",
  "xgboost", "data.table", "nnet"
)

# library() stops with an error when a package is not installed, whereas
# require() only returns FALSE and lets a later cell fail less obviously.
# invisible() suppresses the list that lapply() would otherwise print.
invisible(lapply(list_of_packages, library, character.only = TRUE))

Lade nötiges Paket: synthpop

Find out more at https://www.synthpop.org.uk/

Lade nötiges Paket: insight

Lade nötiges Paket: party

Lade nötiges Paket: grid

Lade nötiges Paket: mvtnorm

Lade nötiges Paket: modeltools

Lade nötiges Paket: stats4

Lade nötiges Paket: strucchange

Lade nötiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade nötiges Paket: sandwich

Lade nötiges Paket: dplyr


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


Lade nötiges Paket: rpart

Lade nötiges Paket: rpart.plot

Lade nötiges Paket: randomForest

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dpl

### Data

In [4]:
# Restore the objects saved in cpspop.RData (located in the project root
# returned by here()) into the current environment.
load(file = paste0(here(), "/cpspop.RData"))

# Read the preprocessed adult data, drop incomplete rows and convert the
# categorical columns to factors. Intermediate objects stay inside local().
adult <- local({
  raw <- read.csv(file = paste0(here(), "/adult_preprocessed.csv"))

  # "?" is the file's marker for a missing value; recode it and drop such rows
  raw[raw == "?"] <- NA
  cleaned <- na.omit(raw)

  categorical <- c(
    "workclass", "education", "marital_status", "relationship",
    "race", "sex", "native_country", "income"
  )
  cleaned[categorical] <- lapply(cleaned[categorical], as.factor)

  cleaned
})

### Helper functions

In [5]:
#' Regression metrics for a continuous `income` target.
#'
#' @param predictions Numeric vector of predicted values.
#' @param test_set Data frame (or list) whose `income` element holds the
#'   observed values, aligned with `predictions`.
#' @return A one-row data frame with columns `MAE`, `MSE`, `RMSE`,
#'   `R_squared` and `MAPE` (MAPE is expressed in percent).
evaluation_metrics_cont <- function(predictions, test_set) {
  observed <- test_set$income
  residuals <- observed - predictions
  squared_residuals <- residuals^2

  mean_sq_err <- mean(squared_residuals)

  # R^2 = 1 - residual sum of squares / total sum of squares around the mean
  total_ss <- sum((observed - mean(observed))^2)
  r_sq <- 1 - (sum(squared_residuals) / total_ss)

  data.frame(
    MAE = mean(abs(residuals)),
    MSE = mean_sq_err,
    RMSE = sqrt(mean_sq_err),
    R_squared = r_sq,
    # relative error per observation; undefined where the observed value is 0
    MAPE = mean(abs(residuals / observed)) * 100
  )
}

In [6]:
#' Classification metrics for a factor `income` target.
#'
#' Builds a caret confusion matrix of the predictions against
#' `test_set$income` and extracts four summary statistics from it.
#'
#' @param predictions Factor of predicted classes.
#' @param test_set Data frame (or list) whose `income` element holds the
#'   observed classes.
#' @return A one-row data frame with columns `Accuracy`, `F1`,
#'   `Sensitivity` and `Specificity`.
evaluation_metrics_factor <- function(predictions, test_set) {
  conf_mat <- confusionMatrix(
    predictions, test_set$income,
    mode = "everything"
  )

  overall_stats <- conf_mat$overall
  class_stats <- conf_mat$byClass

  # Single-bracket indexing keeps the element names, which data.frame()
  # uses for the row name of the result.
  data.frame(
    Accuracy = overall_stats["Accuracy"],
    F1 = class_stats["F1"],
    Sensitivity = class_stats["Sensitivity"],
    Specificity = class_stats["Specificity"]
  )
}

# Just the MLP prediction

In [7]:
#' Repeatedly tune and evaluate a neural network (caret method "nnet").
#'
#' For each of `nrun` runs the RNG is seeded with `1234 + i`, the
#' hyperparameters `size` and `decay` are tuned by k-fold cross-validation
#' over a `steps` x `steps` grid, and the tuned model is evaluated with
#' `evaluation_metrics_cont()` (numeric target) or
#' `evaluation_metrics_factor()` (factor target). The per-run metrics are
#' averaged over all runs.
#'
#' @param data Data frame with the target column `income` (numeric or
#'   factor); all remaining columns are used as predictors.
#' @param nrun Number of repetitions (at least 1).
#' @param k_fold Number of cross-validation folds.
#' @param steps Number of grid points per hyperparameter (at least 1).
#' @return A list with one element, `eval_avg`: a data frame holding the
#'   metrics averaged over the runs.
mlp_simulation <- function(data, nrun = 10, k_fold = 10, steps = 10) {
  stopifnot(
    is.numeric(nrun), length(nrun) == 1L, nrun >= 1,
    is.numeric(steps), length(steps) == 1L, steps >= 1
  )

  # Validate the target before the expensive grid search instead of after it.
  target <- data$income
  if (is.numeric(target)) {
    is_regression <- TRUE
    evaluate <- evaluation_metrics_cont
  } else if (is.factor(target)) {
    is_regression <- FALSE
    evaluate <- evaluation_metrics_factor
  } else {
    stop("The predicted target has to be numeric or factor.")
  }

  base_seed <- 1234

  # The resampling scheme and tuning grid do not depend on the run, so they
  # are built once.
  train_control <- trainControl(method = "cv", number = k_fold)

  # `size` is a number of hidden units, so the grid values are rounded to
  # whole numbers (they are already 1..10 for the default steps = 10).
  size_values <- unique(round(seq(1, 10, length.out = steps)))
  decay_values <- 10^seq(log10(0.0001), log10(0.01), length.out = steps)
  grid <- expand.grid(size = size_values, decay = decay_values)

  run_metrics <- vector("list", nrun)

  for (i in seq_len(nrun)) {
    set.seed(base_seed + i)

    # A linear output unit is only valid for regression; nnet refuses
    # linout = TRUE together with the entropy fit it uses for two-level
    # factors, so classification keeps the default logistic/softmax output.
    mlp_model <- train(
      income ~ .,
      data = data, method = "nnet",
      tuneGrid = grid, trControl = train_control,
      linout = is_regression, trace = FALSE
    )

    # NOTE(review): predictions are made on the same data used for tuning
    # and fitting, so the metrics below are in-sample.
    predictions <- predict(mlp_model, newdata = data)

    run_metrics[[i]] <- as.data.frame(evaluate(predictions, target))
    print(paste("run", i, "completed"))
  }

  # Element-wise mean of the per-run metric data frames; column and row
  # names are taken from the first run.
  eval_avg <- Reduce(`+`, run_metrics) / length(run_metrics)

  list(eval_avg = eval_avg)
}

#' Regression metrics from predicted and observed numeric vectors.
#'
#' Shares its name with the data-frame-based `evaluation_metrics_cont()`
#' defined earlier in this file and replaces it once this definition is run.
#'
#' @param predictions Numeric vector of predicted values.
#' @param actuals Numeric vector of observed values, aligned with
#'   `predictions`.
#' @return A one-row data frame with columns `MSE`, `RMSE` and `MAE`.
evaluation_metrics_cont <- function(predictions, actuals) {
  errors <- predictions - actuals
  mean_sq_err <- mean(errors^2)

  data.frame(
    MSE = mean_sq_err,
    RMSE = sqrt(mean_sq_err),
    MAE = mean(abs(errors))
  )
}

#' Classification accuracy from predicted and observed factors.
#'
#' Shares its name with the data-frame-based `evaluation_metrics_factor()`
#' defined earlier in this file and replaces it once this definition is run.
#'
#' @param predictions Factor of predicted classes.
#' @param actuals Factor of observed classes.
#' @return A one-row data frame with the single column `Accuracy`.
evaluation_metrics_factor <- function(predictions, actuals) {
  overall_stats <- confusionMatrix(predictions, actuals)$overall

  # Single-bracket indexing keeps the "Accuracy" name, which becomes the
  # row name of the returned data frame.
  data.frame(Accuracy = overall_stats["Accuracy"])
}

In [8]:
# Run the MLP simulation on `cpspop` with the default settings
# (nrun = 10, k_fold = 10, steps = 10); the result is a list whose
# `eval_avg` element holds the averaged metrics.
# NOTE(review): `cpspop` is presumably the object restored from
# cpspop.RData -- confirm it contains an `income` column.
cps_res <- mlp_simulation(cpspop)

In [None]:
# Run the same simulation on the adult data, whose `income` column was
# converted to a factor above, so the factor evaluation branch is used.
adult_res <- mlp_simulation(adult)