### Prepare Data for Model 

* Convert the data into the format that the model can consume. The same preparation steps must be applied consistently to the training, test, and prediction datasets.

#### Input
* train.parquet
* test.parquet
* encoder_num.json

#### Output
* train_X.parquet
* train_Y.parquet
* test_X.parquet
* test_Y.parquet

In [1]:
suppressMessages(library(dplyr))

# params
# Run mode; it decides which folders the data and the encoder are read from.
type <- "train"
# Parquet file to prepare (swap in the commented alternative for the test split).
data_name <- "train.parquet"
# data_name <- "test.parquet"

In [2]:
# file and directory info
# The project root is taken to be the parent of the current working directory.
project_dir <- dirname(getwd())
# `type` selects the folder the data file lives in (data_path) and the folder
# encoder_num.json is read from (encoder_path). Both are relative to
# project_dir and keep their leading/trailing "/" because they are later
# concatenated with paste0().
if (type == "train") {
    interim_folder <- "/data/interim/"
    processed_folder <- "/data/processed/"

    data_path <- interim_folder
    encoder_path <- processed_folder
} else if (type == "pred") {
    tmp_folder <- "/tmp/"
    artifacts_folder <- "/artifacts/"

    data_path <- tmp_folder
    encoder_path <- artifacts_folder
} else {
    # Name the parameter that is actually checked and echo the bad value so
    # the failure can be fixed without reading the code.
    stop(
        "the value for `type` is not supported: '", type,
        "' (expected \"train\" or \"pred\")."
    )
}

# input
# Table to be prepared, read from the folder chosen for the current `type`.
data <- arrow::read_parquet(paste0(project_dir, data_path, data_name))
# The encoder file is decoded twice: the result of the first fromJSON() call is
# itself fed to fromJSON().
# NOTE(review): presumably encoder_num.json stores a JSON-encoded string -
# confirm against the code that writes it.
encoder_num <- jsonlite::fromJSON(
    jsonlite::fromJSON(paste0(project_dir, encoder_path, "encoder_num.json"))
)

# functions 
#' Impute and scale the numeric features and optionally extract the response.
#'
#' @param data Data frame or tibble with one column per name in `encoder_num`,
#'   plus the response column(s) when `var_y` is given.
#' @param encoder_num Named list with one entry per numeric variable. Each
#'   entry is read through `$impute` (replacement for missing values), `$mean`
#'   (value subtracted) and `$std` (value divided by).
#' @param var_y Name(s) of the response column(s), or `NULL` (the default) when
#'   no response should be returned.
#' @return A list with element `X`, a tibble of the imputed and scaled features
#'   in `names(encoder_num)` order, and, only when `var_y` is not `NULL`,
#'   element `Y` holding the `var_y` columns of `data`.
PrepareData <- function(data, encoder_num, var_y = NULL) {
    # numeric variables
    var_num <- names(encoder_num)

    # Fail early with a readable message instead of an opaque subsetting error.
    missing_var <- setdiff(var_num, names(data))
    if (length(missing_var) > 0) {
        stop(
            "data is missing column(s) required by encoder_num: ",
            paste(missing_var, collapse = ", ")
        )
    }

    # List-style subsetting (data[var_num]) keeps a data frame even when only
    # one column is selected; data[, var_num] would collapse a plain
    # data.frame to a vector and map2() would then walk over rows.
    X <- purrr::map2(data[var_num], encoder_num, function(.x, .y) {
        # impute missing values with known value
        filled <- ifelse(is.na(.x), .y$impute, .x)
        # normalized data with known mean and std
        (filled - .y$mean) / .y$std
    })
    result <- list(X = dplyr::as_tibble(X))

    # response variable
    if (!is.null(var_y)) {
        result$Y <- dplyr::select(data, !!!var_y)
    }
    result
}
                                 
# output and save
# Extract the response only when the data actually contains the column;
# otherwise PrepareData() is asked for the features alone.
var_y <- "quality"
if (!var_y %in% names(data)) {
    var_y <- NULL
}
feature_data <- PrepareData(data, encoder_num, var_y = var_y)
# Base name of the outputs: the part of data_name before its first ".".
name <- unlist(strsplit(data_name, "[.]"))[1]

arrow::write_parquet(feature_data$X, paste0(project_dir, data_path, name, "_X.parquet"))
# feature_data has no Y element when var_y is NULL; writing NULL would fail,
# so the response file is produced only when a response was extracted.
if (!is.null(feature_data$Y)) {
    arrow::write_parquet(feature_data$Y, paste0(project_dir, data_path, name, "_Y.parquet"))
}

In [3]:
# Show the first rows of every prepared table ...
lapply(X = feature_data, FUN = utils::head)
# ... followed by each table's dimensions (rows, columns).
lapply(X = feature_data, FUN = dim)

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0.2844833,-0.02839644,0.3410814,0.12523747,-0.004140787,0.8055714,2.3305602,0.7894737,-0.3292916,-0.28095238,-0.8819393
1.5483426,-1.47605791,2.0591208,0.01563642,-0.045548654,-0.550952,-0.6330093,0.1578947,-0.8398213,-0.28095238,1.246114
-0.7495835,1.55846325,-1.3769581,-0.02089727,-0.397515528,-0.8416356,-0.7857706,0.1052632,1.074665,-0.57857143,-0.7894152
1.2036537,0.63975501,-0.1642244,0.19830484,0.368530021,1.2900441,0.2224541,1.6315789,-0.4569241,0.07619048,-0.5118431
-0.4048946,0.13864143,-0.3158161,-0.24009937,-0.335403727,-0.8416356,-0.5413525,-0.1578947,-0.2016592,-0.63809524,-0.6968912
2.1228242,0.58407572,1.1495705,-0.82463832,-0.335403727,-0.9385301,-0.602457,0.6368421,-1.5417996,-0.34047619,-0.0492228

quality
5
5
5
5
5
4
