# Synthetic Data Generator with a Multi-Layer-Perceptron Model

## Libraries

In [11]:
list_of_packages <- c ("synthpop", "insight", "party", "haven", "dplyr", "rpart", "rpart.plot", "randomForest", "pROC", "caret", "pracma", "here", "Hmisc", "purrr", "randomForest", "caret", "ranger", "xgboost", "data.table", "nnet")

install_if_missing <- function(p){
  if(!requireNamespace(p, quietly = TRUE)){
    install.packages(p)
  }
  library(p, character.only=TRUE)
}


lapply(list_of_packages, install_if_missing)

## Data

In [12]:
load(file = (paste0(here(), "/cpspop.RData")))
adult <- read.csv(file = (paste0(here(),"/adult_preprocessed.csv")))
# delete NAs
adult[adult == "?"] <- NA
adult <- na.omit(adult)

adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
adult$occupation <- as.factor(adult$occupation)

## Synthetic Data

1. Prepare Data: Split your data into features and target. Normalize the features to ensure better performance of the neural network. <br>
2. Define and Train the MLP Model: Define the architecture of the MLP, compile the model, and train it on your data. <br>
3. Generate Synthetic Data: Sample from the feature space, use the trained model to predict the target variable for these samples, and combine the predictions with the sampled features to create synthetic data.

In [13]:
synthesize_data_mlp <- function(data, size = 5, maxit = 100) {

  # Initialize synthetic data frame
  syn_data <- data.frame(matrix(ncol = ncol(data), nrow = nrow(data)))
  colnames(syn_data) <- colnames(data)
  
  # Synthesize each variable sequentially
  for (var in colnames(data)) {
    # Identify predictors (all columns except the one being synthesized and those with any NAs)
    predictors <- colnames(syn_data)[!sapply(syn_data, anyNA) & colnames(syn_data) != var]
    
    # If there are no predictors (first variable), randomly sample from the original data
    if (length(predictors) == 0) {
      syn_data[[var]] <- sample(data[[var]], nrow(data), replace = TRUE)
    } else {
      # Prepare data for nnet
      train_data <- syn_data[, predictors, drop = FALSE]
      train_data[[var]] <- data[[var]]
      
      # Fit MLP using nnet
      model <- nnet(
        as.formula(paste(var, "~ .")),
        data = train_data,
        size = size,
        maxit = maxit,
        linout = !is.factor(data[[var]]),  # Use linear output for numeric data
        trace = FALSE
      )
      
      # Predict the values for the synthetic data
      if (is.factor(data[[var]])) {
        predictions <- predict(model, newdata = syn_data[, predictors, drop = FALSE], type = "class")
        syn_data[[var]] <- factor(predictions, levels = levels(data[[var]]))
      } else {
        predictions <- predict(model, newdata = syn_data[, predictors, drop = FALSE])
        syn_data[[var]] <- predictions
      }
    }
  }
  
  return(syn_data)
}

## Apply

### CPS

In [14]:
# Generate synthetic data for all target variables
synthetic_cpspop <- synthesize_data_mlp(cpspop)

# View the synthetic dataset
head(synthetic_cpspop)

Unnamed: 0_level_0,tax,income,csp,age,educ,marital,race,sex,ss
Unnamed: 0_level_1,<int>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<fct>,<fct>,<fct>,<fct>,"<dbl[,1]>"
1,342,54105.73,142.6933,48.17255,39,1,1,1,2084.136
2,0,54105.73,142.6933,48.17255,39,1,1,1,2084.136
3,1600,54105.73,142.6933,48.17255,39,1,1,1,2084.136
4,2600,54105.73,142.6933,48.17255,39,1,1,1,2084.136
5,0,54105.73,142.6933,48.17255,39,1,1,1,2084.136
6,0,54105.73,142.6933,48.17255,39,1,1,1,2084.136


### Adult

In [15]:
# Generate synthetic data for all target variables
synthetic_adult <- synthesize_data_mlp(adult)

# View the synthetic dataset
head(synthetic_adult)

Unnamed: 0_level_0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
Unnamed: 0_level_1,<int>,<fct>,"<dbl[,1]>",<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,"<dbl[,1]>","<dbl[,1]>","<dbl[,1]>",<fct>,<fct>
1,27,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
2,33,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
3,50,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
4,39,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
5,64,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
6,40,2,189793.8,11,2,9,0,4,1,1092.008,88.3726,40.93124,38,<=50K
