# MLP prediction function

### Packages

In [1]:
list_of_packages <- c ("synthpop", "insight", "party", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "randomForest",  "xgboost", "data.table", "RSNNS", "nnet")

# Function to check and install packages
install_if_missing <- function(p) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
  library(p, character.only = TRUE)
}

# Install and load all required packages
lapply(list_of_packages, install_if_missing)

Find out more at https://www.synthpop.org.uk/

Lade n"otiges Paket: grid

Lade n"otiges Paket: mvtnorm

Lade n"otiges Paket: modeltools

Lade n"otiges Paket: stats4

Lade n"otiges Paket: strucchange

Lade n"otiges Paket: zoo


Attache Paket: 'zoo'


Die folgenden Objekte sind maskiert von 'package:base':

    as.Date, as.Date.numeric


Lade n"otiges Paket: sandwich


Attache Paket: 'dplyr'


Das folgende Objekt ist maskiert 'package:party':

    where


Die folgenden Objekte sind maskiert von 'package:stats':

    filter, lag


Die folgenden Objekte sind maskiert von 'package:base':

    intersect, setdiff, setequal, union


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attache Paket: 'randomForest'


Das folgende Objekt ist maskiert 'package:dplyr':

    combine


Type 'citation("pROC")' for a citation.


Attache Paket: 'pROC'


Die folgenden Objekte sind maskiert von 'package:stats':

    cov, smooth, var


Lade n"otiges Paket: ggplot2


Attache Paket: 

### Data

In [2]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)

### Helper functions

In [3]:
## eval cont targets
evaluation_metrics_cont <- function(predictions, test_set){
  MAE <- mean(abs(predictions - test_set$income))
  MSE <- mean((predictions - test_set$income)^2)
  RMSE <- sqrt(MSE)

  SS_res <- sum((test_set$income - predictions)^2)
  SS_tot <- sum((test_set$income - mean(test_set$income))^2)
  R_squared <- 1 - (SS_res / SS_tot)

  MAPE <- mean(abs((test_set$income - predictions) / test_set$income)) * 100

  # Create the dataframe
  metrics_df <- data.frame(
        MAE = MAE, 
        MSE = MSE, 
        RMSE = RMSE,
        R_squared = R_squared, 
        MAPE = MAPE)

  return(metrics_df)
}

In [4]:
## eval factored targets
evaluation_metrics_factor <- function(predictions, test_set){
    # confusion matrix for the prediction on original data
    cm <- confusionMatrix(predictions, test_set$income,
                mode = "everything")

    # saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    f1 <- cm$byClass['F1']
    sens <- cm$byClass['Sensitivity']
    spec <- cm$byClass['Specificity']

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [5]:
mlp_pred <- function(data, outer_folds, size_steps, decay_steps, inner_folds){
    # adjust evaluation metric to fit both numeric and factored targets
    summaryFunctionType <- if (is.numeric(data$income)) defaultSummary else multiClassSummary
    # metric: train() uses per default RSME and Accuracy for numeric and factored targets

    #  set control args
    outer_control <- trainControl(method = "cv", number = outer_folds,
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)
        
    inner_control <- trainControl(method = "cv", number = inner_folds, 
                                  summaryFunction = summaryFunctionType,
                                  verboseIter = FALSE,
                                  allowParallel = TRUE)

    # Define the grid for hyperparameter tuning
    size_values <- seq(1, 10, length.out = size_steps)
    decay_values <- 10^seq(log10(0.0001), log10(0.01), length.out = decay_steps)

    # Create grid
    tunegrid <- expand.grid(size = size_values, decay = decay_values)

    # Initialize variables to store results
    outer_results <- list()

    outer_cv_folds = createFolds(data$income, k = outer_folds)
    
    # Outer loop: Cross-validation for model evaluation
    for (i in seq_along(outer_folds)) {
        
        # Split data into outer folds
        outer_test_index = outer_cv_folds[[i]]
        outer_testData = data[outer_test_index,]
        outer_trainData  = data[-outer_test_index,]
        
        # Hyperparameter tuning using inner CV
        # No need for inner loop because "train" does k-fold CV already
        mlp_model <- caret::train(income ~ ., 
                           data = outer_trainData, 
                           method = "nnet", 
                           tuneGrid = tunegrid, 
                           trControl = inner_control)#,
                           #metric = metricType)
            

        # Store the best hyperparameters
        best_hyperparameters <- mlp_model$bestTune
        print("best HP")
        print(mlp_model$bestTune)

        # Train the final model on the outer training set with the best hyperparameters
        final_model <- caret::train(income ~ ., 
                             data = outer_trainData, 
                             method = "nnet", 
                             trControl = outer_control, 
                             tuneGrid = best_hyperparameters)

        # Testing the final model on the outer test set
        predictions <- predict(final_model, newdata = outer_testData)
        
        if (is.numeric(data$income)) {
            eval <- postResample(predictions, outer_testData$income) # postResample is a useful caret function
        } else if (is.factor(data$income)) {
            eval <- confusionMatrix(predictions, outer_testData$income)
        } else {
            stop("The predicted target has to be numeric or factor.")
        }

        # Store the evaluation metrics for this outer fold
        outer_results[[i]] <- eval
    }

    # Average the evaluation metrics over the outer folds
    eval_avg_outer_fold <- mean(unlist(outer_results)) # Calculate the mean performance over all outer folds

    # Return the average evaluation metrics
    return(eval_avg_outer_fold)
}

In [8]:
res <- mlp_pred(data = adult, outer_folds = 5, size_steps = 10, decay_steps = 10, inner_folds = 5)

# weights:  86
initial  value 15394.690067 
iter  10 value 10822.179303
final  value 10822.173596 
converged
# weights:  171
initial  value 10998.022953 
final  value 10777.589082 
converged
# weights:  256
initial  value 23482.429932 
final  value 10831.903831 
converged
# weights:  341
initial  value 16745.274127 
final  value 10798.506302 
converged
# weights:  426
initial  value 14362.514088 
iter  10 value 10822.189281
final  value 10819.397986 
converged
# weights:  511
initial  value 16745.694159 
final  value 10777.642821 
converged
# weights:  596
initial  value 24571.985627 
iter  10 value 10690.843509
final  value 10690.667192 
converged
# weights:  681
initial  value 29136.502647 
iter  10 value 10818.011610
final  value 10818.010920 
converged
# weights:  766
initial  value 10965.759162 
final  value 10717.585007 
converged
# weights:  851
initial  value 20225.280082 
iter  10 value 10815.399515
iter  20 value 10544.490542
iter  30 value 9957.622489
iter  40 value 9904.518

In [None]:
res

In [17]:
??mlpWeightDecay

No vignettes or demos or help files found with alias or concept or
title matching 'mlpWeightDecay' using fuzzy matching.

# Just the MLP prediction

In [7]:
mlp_simulation <- function(data, nrun = 10, k_fold = 10, steps = 10){
  # Create empty list to store evaluation dataframes
  eval_list <- list()
  
  # Set initial seed
  s <- 1234
  
  for (i in 1:nrun){
    set.seed(s + i)
    
    # Define trainControl with k-fold cross-validation
    train_control <- trainControl(method = "cv", number = k_fold)
    
    # Define the grid for hyperparameter tuning
    size_values <- seq(1, 10, length.out = steps)
    decay_values <- 10^seq(log10(0.0001), log10(0.01), length.out = steps)
    
    # Create grid
    grid <- expand.grid(size = size_values, decay = decay_values)
    
    # Train the MLP model using caret with grid search
    mlp_model <- train(income ~ ., data = data, method = "nnet",
                       tuneGrid = grid, trControl = train_control, linout = TRUE, trace = FALSE)
    
    # Evaluation metrics
    predictions <- predict(mlp_model, newdata = data)
    
    if (is.numeric(data$income)) {
      eval <- as.data.frame(evaluation_metrics_cont(predictions, data$income))
    } else if (is.factor(data$income)) {
      eval <- as.data.frame(evaluation_metrics_factor(predictions, data$income))
    } else {
      stop("The predicted target has to be numeric or factor.")
    }
    
    eval_list[[i]] <- eval
    print(paste("run", i, "completed"))
  }
  
  # Average over all runs
  sum_df <- Reduce(function(x, y) Map(`+`, x, y), eval_list)
  eval_avg <- lapply(sum_df, function(col) col / length(eval_list))
  
  # Convert the list back to a dataframe
  # Store row names
  rownames <- row.names(eval_list[[1]])
  
  # Convert the list back to a dataframe
  eval_avg <- as.data.frame(eval_avg)
  
  # Set back the row names
  row.names(eval_avg) <- rownames
  
  # Returns
  results <- list(eval_avg = eval_avg)
  return(results)
}

# Example function for continuous evaluation metrics
evaluation_metrics_cont <- function(predictions, actuals) {
  mse <- mean((predictions - actuals)^2)
  rmse <- sqrt(mse)
  mae <- mean(abs(predictions - actuals))
  return(data.frame(MSE = mse, RMSE = rmse, MAE = mae))
}

# Example function for factor evaluation metrics
evaluation_metrics_factor <- function(predictions, actuals) {
  confusion <- confusionMatrix(predictions, actuals)
  accuracy <- confusion$overall['Accuracy']
  return(data.frame(Accuracy = accuracy))
}

In [8]:
cps_res <- mlp_simulation(cpspop)

In [None]:
adult_res <- mlp_simulation(adult)

### Save the results

In [None]:
# Bind results
mlp_pred_results <- list(cps_res = cps_res, adult_res = adult_res)
# File pth for output
file <- "/user/emma.foessing01/u11969/results/mlp_pred_results.RData" 
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE) # create dir if not there
# Save the results to an RData file 
save(results, file = output_file)