### Create Encoder from training data
* Here is about how we are going to process data based on the information in train dataset. It can be encoders for the text and categorical variables or mean and standard deviation for scaling numeric variables or imputation values for the missing values. 
* The output from this step will be used to transform data into the format that the model can train or predict on.

#### Input 
* train.parquet

#### Output
* encoder_num.rds

In [1]:
suppressMessages(library(dplyr))

# params
method = "mean"

In [2]:
# file and directory info
project_dir = dirname(getwd())
interim_folder = "/data/interim/"
processed_folder = "/data/processed/"

# input
train = arrow::read_parquet(paste0(project_dir, interim_folder, "train.parquet"))
num_variables = train %>% select(-quality) %>% names(.)

# functions
Encoder_numeric = function(train, num_variables, method){
    # select numeric variables
    train_num = train %>% select(!!!num_variables)
    
    # mean and std for later normalization
    summary_mean = train_num %>% summarise_at(num_variables, mean, na.rm = TRUE)
    summary_std = train_num %>% summarise_at(num_variables, sd, na.rm = TRUE)

    # method for missing values imputation
    if (method == "mean") {
        summary_num = summary_mean
    }
    if (method == "median") {
         summary_num = train_num %>% summarise_at(num_variables, median, na.rm = TRUE)
    }
    
    # save the info to a list
    summary_num_list = list()
    for (i in 1:ncol(summary_num)){
        col_name = names(summary_num)[i]
        summary_num_list[[i]] = data.frame(mean = as.numeric(summary_mean[,col_name]), 
                                           std = as.numeric(summary_std[,col_name]), 
                                           impute = as.numeric(summary_num[,col_name]))
        
    }
    names(summary_num_list) = names(summary_num)
    summary_num_list_json = summary_num_list %>% jsonlite::toJSON(.)
    
    return(summary_num_list_json)
}

# output and save
encoder_num = Encoder_numeric(train, num_variables, method)
jsonlite::write_json(encoder_num, paste0(project_dir, processed_folder, "encoder_num.json"))

In [3]:
jsonlite::prettify(encoder_num)

{
    "fixed acidity": [
        {
            "mean": 8.3048,
            "std": 1.7407,
            "impute": 8.3048
        }
    ],
    "volatile acidity": [
        {
            "mean": 0.5251,
            "std": 0.1796,
            "impute": 0.5251
        }
    ],
    "citric acid": [
        {
            "mean": 0.2725,
            "std": 0.1979,
            "impute": 0.2725
        }
    ],
    "residual sugar": [
        {
            "mean": 2.5286,
            "std": 1.3686,
            "impute": 2.5286
        }
    ],
    "chlorides": [
        {
            "mean": 0.0872,
            "std": 0.0483,
            "impute": 0.0872
        }
    ],
    "free sulfur dioxide": [
        {
            "mean": 15.6861,
            "std": 10.3205,
            "impute": 15.6861
        }
    ],
    "total sulfur dioxide": [
        {
            "mean": 45.7189,
            "std": 32.7308,
            "impute": 45.7189
        }
    ],
    "density": [
        {
            "mea