# Bayesian Network Prediction Model

## Libraries

In [23]:
install.packages("gRain")

installiere auch Abh"angigkeiten 'gRbase', 'RcppArmadillo'





Die heruntergeladenen Bin"arpakete sind in 
	/var/folders/kj/dkjqkk2n3wq2zfbttgdpjrj80000gn/T//Rtmpcs9zeZ/downloaded_packages


In [24]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "ranger", "bnlearn", "arulesCBA", "igraph", "gRain")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

Lade n"otiges Paket: gRbase


Attache Paket: 'gRbase'


Die folgenden Objekte sind maskiert von 'package:igraph':

    edges, is_dag, topo_sort


Die folgenden Objekte sind maskiert von 'package:bnlearn':

    ancestors, children, nodes, parents


Die folgenden Objekte sind maskiert von 'package:party':

    fit, nodes


Das folgende Objekt ist maskiert 'package:modeltools':

    fit




## Data

In [2]:
load(paste0(here(), "/cpspop.RData"))
adult <- read.csv(paste0(here(), "/adult_preprocessed.csv"))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)


adult_with_cont <- adult
cps_with_cont <- cpspop

--> remake in case it does not work with all factor levels. but make sure to already discretize before synthetization!
# abjsut amount of levels in categorical variables
cpspop$educ <- as.numeric(levels(cpspop$educ))[cpspop$educ]

educ <- rep(NA, nrow(cpspop))

# Assign new categorical values based on the original 'educ' levels
educ[cpspop$educ < 39] <- 1
educ[cpspop$educ == 39] <- 2
educ[cpspop$educ > 39 & cpspop$educ < 44] <- 3
educ[cpspop$educ >= 44] <- 4

# Convert educ to factor and assign labels
cpspop$educ <- as.factor(educ)
# levels(cpspop$educ) <- c("No high school degree", "Finished high school", "Associate or bachelor's degree", "Master's degree or higher")

# Update race levels
cpspop$race[cpspop$race == 3] <- 2
cpspop$race[cpspop$race == 4] <- 2
cpspop$race <- factor(cpspop$race)

# Update marital status levels
cpspop$marital[cpspop$marital == 2] <- 1
cpspop$marital[cpspop$marital == 3] <- 4
cpspop$marital <- as.factor(cpspop$marital)
cpspop$marital <- droplevels(cpspop$marital)
levels(cpspop$marital) <- c("1", "2", "3", "4", "5")

# Convert sex to factor
cpspop$sex <- factor(cpspop$sex)

## Helper functions

In [3]:
evaluation_metrics_factor <- function(predictions, test_set) {
    # Ensure test_set is a data frame
    test_set <- as.data.frame(test_set)
    
    # Ensure both predictions and test_set$income are factors with the same levels
    predictions <- as.factor(predictions)
    reference <- as.factor(test_set$income)
    
    # Ensure levels match between predictions and reference
    levels(predictions) <- levels(reference)
    
    # Confusion matrix for the prediction on original data
    cm <- caret::confusionMatrix(predictions, reference, mode = "everything")

    # Saving evaluation metrics
    accuracy <- cm$overall['Accuracy']
    
    if (length(levels(reference)) == 2) {
        # Binary classification
        f1 <- cm$byClass['F1']
        sens <- cm$byClass['Sensitivity']
        spec <- cm$byClass['Specificity']
    } else {
        # Multi-class classification: calculate metrics for each class and take the mean
        f1 <- mean(cm$byClass[,'F1'], na.rm = TRUE)
        sens <- mean(cm$byClass[,'Sensitivity'], na.rm = TRUE)
        spec <- mean(cm$byClass[,'Specificity'], na.rm = TRUE)
    }

    # Create the dataframe
    metrics_df <- data.frame(
        Accuracy = accuracy, 
        F1 = f1, 
        Sensitivity = sens, 
        Specificity = spec
    )
    
    return(metrics_df)
}

In [4]:
discretize_df = function(df, breaks = 5) {
  for (var in colnames(df)) {
    # Check if the variable is not a factor
    if (!is.factor(df[[var]])) {

      # Count the frequency of each unique value
      freq_table <- table(df[[var]])

      # Calculate the proportion of zeros, ensuring NA is handled
      zero_proportion <- ifelse(!is.na(freq_table[as.character(0)]), 
                                freq_table[as.character(0)] / sum(freq_table), 
                                0)

      # Determine the number of breaks based on zero proportion
      if (zero_proportion > 4/5) {
        new_breaks = 1
      } else if (zero_proportion > 1/4) {
        new_breaks = breaks - 2
      } else if (zero_proportion > 1/5) {
        new_breaks = breaks - 1
      } else {
        new_breaks = breaks
      }
      
      # Separate zeros and non-zeros
      zero_portion = (df[[var]] == 0)
      non_zero_values = df[[var]][!zero_portion]

      # Discretize non-zero values
      if (length(non_zero_values) > 0) {
        # Calculate breaks for non-zero values
        range_values = range(non_zero_values, na.rm = TRUE)
        breaks_values = seq(range_values[1], range_values[2], length.out = new_breaks + 1)
        
        # Ensure correct number of labels are created
        labels = sapply(1:(length(breaks_values)-1), function(i) 
                        paste("(", breaks_values[i], "-", breaks_values[i+1], "]", sep=""))

        # Use cut to apply these breaks and labels
        discretized_non_zeros = cut(non_zero_values, breaks = breaks_values, labels = labels, include.lowest = TRUE)
        # Combine zero and discretized non-zeros into the original dataframe
        df[[var]] <- factor(ifelse(zero_portion, "0", as.character(discretized_non_zeros)))
      } else {
        # If all values are zero or the number of breaks is zero or negative
        df[[var]] <- factor("0")
      }
    }
  }
  return(df)
}

In [5]:
# Define cpdag_to_dag function
cpdag_to_dag <- function(cpdag) {
  adj_matrix <- amat(cpdag)
  ig <- graph_from_adjacency_matrix(adj_matrix, mode = "directed")
  if (igraph::is_dag(ig)) {
    return(cpdag)
  }
  directed_arcs <- directed.arcs(cpdag)
  undirected_arcs <- undirected.arcs(cpdag)
  while (nrow(undirected_arcs) > 0) {
    arc <- undirected_arcs[1, , drop = FALSE]
    cpdag <- set.arc(cpdag, from = arc[1, 1], to = arc[1, 2])
    undirected_arcs <- undirected.arcs(cpdag)
  }
  return(cpdag)
}

train_bn <- function(data, algorithm) {
  if (any(is.na(data))) {
    stop("The data contains missing values.")
  }
  
  # Train the Bayesian network using the specified algorithm
  if (algorithm == "hc") {
    bn <- bnlearn::hc(data)  # Using BIC by default
  } else if (algorithm == "tabu") {
    bn <- bnlearn::tabu(data)  # Using BIC by default
  } else if (algorithm == "gs") {
    bn <- bnlearn::gs(data)
    bn <- bnlearn::cpdag(bn)  # Convert to CPDAG
    bn <- cpdag_to_dag(bn)     # Convert CPDAG to DAG
  } else if (algorithm == "iamb") {
    bn <- bnlearn::iamb(data)
    bn <- bnlearn::cpdag(bn)  # Convert to CPDAG
    bn <- cpdag_to_dag(bn)     # Convert CPDAG to DAG
  } else {
    stop("Unsupported algorithm.")
  }
  
  # Print the node names of the learned network
  cat("Nodes in the learned Bayesian network:\n")
  print(nodes(bn))
  
  return(bn)
}


# Define a function to evaluate the Bayesian network model
evaluate_bn <- function(testData, bn_fitted, target_var) {
  # Generate predictions for the target variable
  predictions <- predict(bn_fitted, data = testData, node = target_var)
  
  # Compare the predictions with the actual values in the test data
  actual_values <- testData[[target_var]]
  
  # Calculate the accuracy
  accuracy <- mean(predictions == actual_values, na.rm = TRUE)
  
  # You can also calculate other metrics such as:
  confusion <- table(Predicted = predictions, Actual = actual_values)
  precision <- diag(confusion) / rowSums(confusion)
  recall <- diag(confusion) / colSums(confusion)
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # Return a list of evaluation metrics
  return(list(
    accuracy = accuracy,
    precision = precision,
    recall = recall,
    f1_score = f1_score
  ))
}


In [6]:
bn_model <- list(
  # Required libraries
  library = "bnlearn",
  
  # Specify the type of model
  type = "Classification",
  
  # Model fitting function
  fit = function(x, y, wts, param, lev, last, classProbs, ...) {
    # Convert predictors (x) to a data frame, if not already
    x <- as.data.frame(x)
    
    # Combine predictors and response into a single data frame
    data <- cbind(x, income = y)
    
    # Sanitize column names to ensure they are valid node names for bnlearn
    colnames(data) <- make.names(colnames(data))
    
    # Print debug info to check column names (optional for troubleshooting)
    cat("Column names in data frame for bnlearn model fitting:", colnames(data), "\n")
    
    # Learn the structure of the Bayesian Network using bnlearn
    if (param$algorithm == "hc") {
      bn_structure <- bnlearn::hc(data)
    } else if (param$algorithm == "tabu") {
      bn_structure <- bnlearn::tabu(data)
    } else if (param$algorithm == "gs") {
      bn_structure <- bnlearn::gs(data)
    } else if (param$algorithm == "iamb") {
      bn_structure <- bnlearn::iamb(data)
    }
    
    # Fit the Bayesian Network with the structure and the data
    fitted_bn <- bnlearn::bn.fit(bn_structure, data)
    
    return(fitted_bn)
  },
  
  # Prediction function
  predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    # Ensure newdata is a data frame and sanitize column names
    newdata <- as.data.frame(newdata)
    colnames(newdata) <- make.names(colnames(newdata))
    
    # Print debug info for newdata (optional for troubleshooting)
    cat("Column names in new data frame for bnlearn prediction:", colnames(newdata), "\n")
    
    # Predict the target variable using the fitted Bayesian Network model
    predictions <- predict(modelFit, node = "income", data = newdata, method = "bayes-lw")
    return(predictions)
  },
  
  # Probability function (for classification with probabilities)
  prob = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    # Ensure newdata is a data frame and sanitize column names
    newdata <- as.data.frame(newdata)
    colnames(newdata) <- make.names(colnames(newdata))
    
    # Calculate prediction probabilities using the Bayesian Network
    prob_predictions <- predict(modelFit, node = "income", data = newdata, prob = TRUE, method = "bayes-lw")
    return(as.data.frame(prob_predictions))
  },
  
  # Define the tuning parameters
  parameters = data.frame(
    parameter = "algorithm",
    class = "character",
    label = "Algorithm"
  ),
  
  # Define the grid of possible values for tuning
  grid = function(x, y, len = NULL, search = "grid") {
    expand.grid(algorithm = c("hc", "tabu", "gs", "iamb"))
  },
  
  # Label the levels of the outcome variable (for classification)
  levels = function(x) levels(x$income),
  
  # Sort out model-specific issues, if any
  sort = function(x) x
)

In [31]:
inner_cv <- function(data, target_var, folds, algorithms) {
  # Create inner folds
  inner_folds <- createFolds(data[[target_var]], k = folds)
  
  best_model <- NULL
  best_performance <- -Inf
  best_algorithm <- NULL
  
  for (algorithm in algorithms) {
    cat("Trying algorithm:", algorithm, "\n")
    fold_results <- c()
    
    for (i in seq_along(inner_folds)) {
      inner_test_index <- inner_folds[[i]]
      inner_trainData <- data[-inner_test_index, ]
      inner_testData <- data[inner_test_index, ]
      
      # Fit Bayesian Network model using bnlearn algorithm
      bn_model <- do.call(get(algorithm, envir = asNamespace("bnlearn")), list(inner_trainData))
      
      # Fit the model to the training data
      fitted_bn_model <- bnlearn::bn.fit(bn_model, inner_trainData)
      
      # Use Bayesian Likelihood Weighting for prediction
      predictions <- predict(fitted_bn_model, node = target_var, data = inner_testData, method = "bayes-lw")
      
      # Handle missing levels in prediction
      predictions <- factor(predictions, levels = levels(inner_trainData[[target_var]]))
      
      # Calculate the accuracy
      accuracy <- mean(predictions == inner_testData[[target_var]], na.rm = TRUE)
      fold_results[i] <- accuracy
    }
    
    # Average performance for this algorithm
    avg_performance <- mean(fold_results, na.rm = TRUE)
    
    if (!is.na(avg_performance) && avg_performance > best_performance) {
      best_performance <- avg_performance
      best_model <- fitted_bn_model
      best_algorithm <- algorithm
    }
  }
  
  cat("Best algorithm selected:", best_algorithm, "with accuracy:", best_performance, "\n")
  return(best_model)
}

In [39]:
bn_pred <- function(data, outer_folds, inner_folds) {
  # Discretize the data
  data <- discretize_df(data)
  
  algorithms = c("hc", "tabu")  # You can add more algorithms here
  data$income <- factor(data$income, levels = unique(data$income))

  outer_results <- list()
  outer_cv_folds <- createFolds(data$income, k = outer_folds)
  
  for (i in seq_along(outer_cv_folds)) {
    outer_test_index <- outer_cv_folds[[i]]
    outer_testData <- data[outer_test_index, ]
    outer_trainData <- data[-outer_test_index, ]
    
    # Get the best fitted model from inner CV
    best_model <- inner_cv(outer_trainData, "income", inner_folds, algorithms)

    # Perform prediction using 'bayes-lw' method
    predictions <- predict(best_model, node = "income", data = outer_testData, method = "bayes-lw")
    
    # Evaluate the accuracy of the predictions (direct comparison without numeric threshold)
    accuracy <- mean(predictions == outer_testData$income, na.rm = TRUE)
    outer_results[[i]] <- accuracy
  }
  
  # Calculate average accuracy over all outer folds
  eval_avg_outer_folds <- mean(unlist(outer_results))
  
  return(eval_avg_outer_folds)
}

## Apply

In [40]:
cps_res <- bn_pred(cpspop, 2, 2)

Trying algorithm: hc 
Trying algorithm: tabu 
Best algorithm selected: hc with accuracy: 0.9604422 
Trying algorithm: hc 


"variable income in the data has levels that are not observed in the data."
"variable income in the data has levels that are not observed in the data."


Trying algorithm: tabu 


"variable income in the data has levels that are not observed in the data."
"variable income in the data has levels that are not observed in the data."


Best algorithm selected: hc with accuracy: 0.960374 


"dropping 2000 observations because generated samples are NAs."


In [41]:
cps_res

In [42]:
adult_res <- bn_pred(adult, 2, 2)

Trying algorithm: hc 


"variable native_country in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 500 observations because generated samples are NAs."
"dropping 2000 observations because generated samples are NAs."


Trying algorithm: tabu 


"variable native_country in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 3444 observations because generated samples are NAs."
"dropping 2000 observations because generated samples are NAs."


Best algorithm selected: tabu with accuracy: 0.8287508 


"dropping 4500 observations because generated samples are NAs."


Trying algorithm: hc 


"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 3500 observations because generated samples are NAs."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 1000 observations because generated samples are NAs."


Trying algorithm: tabu 


"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 412 observations because generated samples are NAs."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"variable fnlwgt in the data has levels that are not observed in the data."
"variable native_country in the data has levels that are not observed in the data."
"dropping 1000 observations because generated samples are NAs."


Best algorithm selected: tabu with accuracy: 0.832129 


"dropping 2000 observations because generated samples are NAs."


In [43]:
adult_res