### Feature

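* Convert the train and test data into the format the model consumes: impute missing numeric values, standardize the numeric columns with the statistics stored in the encoder, and split the features (X) from the target (Y, the `quality` column).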

#### Input
* train.parquet
* test.parquet
* encoder_num.rds

#### Output
* train_X.parquet
* train_Y.parquet
* test_X.parquet
* test_Y.parquet

In [1]:
suppressMessages(library(dplyr))

In [2]:
# ---- Locations ----
# Project root, plus the two data sub-folders expressed relative to it.
project_dir <- "/Users/chou/Desktop/mlflow_dvc_cookiecutter/DSProjectTemplate"
interim_folder <- "/data/interim/"
processed_folder <- "/data/processed/"

# ---- Inputs ----
# The train/test tables are read from the interim folder; the numeric encoder
# (an RDS object) is read from the processed folder.
train <- arrow::read_parquet(
    paste0(project_dir, interim_folder, "train.parquet")
)
test <- arrow::read_parquet(
    paste0(project_dir, interim_folder, "test.parquet")
)
encoder_num <- readRDS(
    paste0(project_dir, processed_folder, "encoder_num.rds")
)

# functions 
Featurize_train <- function(train, encoder_num){
    # Build the model inputs for the training split.
    #
    # Args:
    #   train: table holding every column named in `encoder_num` plus `quality`.
    #   encoder_num: named list, one entry per numeric column; each entry is
    #     read for its `impute`, `mean` and `std` elements.
    #
    # Returns:
    #   list(X = imputed and scaled numeric columns as a tibble,
    #        Y = the `quality` column of `train`).

    # The encoder's names decide which columns are treated as numeric features.
    num_cols <- names(encoder_num)

    # Per-column steps; `enc` is the encoder entry paired with that column.
    fill_missing <- function(values, enc) ifelse(is.na(values), enc$impute, values)
    standardize <- function(values, enc) (values - enc$mean)/enc$std

    imputed <- as_tibble(purrr::map2(train[, num_cols], encoder_num, fill_missing))
    scaled <- as_tibble(purrr::map2(imputed[, num_cols], encoder_num, standardize))

    list(X = scaled, Y = select(train, quality))
}

Featurize_test = function(test, encoder_num){
    # Build the model inputs for the test split.
    #
    # Args:
    #   test: table holding every column named in `encoder_num` plus `quality`.
    #   encoder_num: named list, one entry per numeric column; each entry is
    #     read for its `impute`, `mean` and `std` elements.
    #     NOTE(review): presumably these statistics were fitted on the training
    #     split only -- confirm against the notebook that writes encoder_num.rds.
    #
    # Returns:
    #   list(X = imputed and scaled numeric columns as a tibble,
    #        Y = the `quality` column of `test`).

    # numeric
    var_num = names(encoder_num)

    # Look the fill value up by name (`$impute`), the same way Featurize_train
    # does. A positional lookup (`[[1]]`) would silently use whichever element
    # happens to be stored first in the encoder entry, so train and test could
    # end up imputed with different statistics.
    test_var_impute = purrr::map2(test[, var_num], encoder_num, function(.x, .y) ifelse(is.na(.x), .y$impute, .x)) %>% as_tibble(.)

    # Standardize with the encoder's stored mean and std for each column.
    test_var_scale = purrr::map2(test_var_impute[, var_num], encoder_num, function(.x, .y) (.x - .y$mean)/.y$std ) %>% as_tibble(.)

    X = test_var_scale
    Y = test %>% select(quality)

    return(list(X = X, Y = Y))
}

# ---- Featurize both splits ----
feature_train <- Featurize_train(train, encoder_num)
feature_test <- Featurize_test(test, encoder_num)

# ---- Persist X and Y for each split as parquet ----
# NOTE(review): the files are written to the interim folder, while the encoder
# is read from the processed folder -- confirm this destination is intended.
arrow::write_parquet(
    feature_train$X,
    paste0(project_dir, interim_folder, "train_X.parquet")
)
arrow::write_parquet(
    feature_train$Y,
    paste0(project_dir, interim_folder, "train_Y.parquet")
)
arrow::write_parquet(
    feature_test$X,
    paste0(project_dir, interim_folder, "test_X.parquet")
)
arrow::write_parquet(
    feature_test$Y,
    paste0(project_dir, interim_folder, "test_Y.parquet")
)

In [3]:
# Preview the first rows of X and Y for each split.
lapply(X = feature_train, FUN = head)
lapply(X = feature_test, FUN = head)

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1.7379041,-0.28473142,-0.03220029,-0.7562286,0.33212002,-1.3475303,-1.5664838,0.01658686,-1.967553,-0.09639526,-0.8277115
-0.7697374,0.01087568,0.95527527,0.2348019,-0.03159054,0.6197474,1.5283594,0.58147614,0.4631375,-0.26942032,-1.0703769
1.0214351,0.01087568,2.02504045,1.14655,0.01387328,1.4298029,0.9141157,1.44050894,-0.7193606,-0.26942032,-1.3130422
-1.1279719,0.40501848,0.46153749,-0.5976637,3.10541303,-0.653197,0.0399997,-0.1973357,-0.6536662,-0.35593285,-1.0703769
-1.3667949,-0.18619572,-0.77280696,2.2763247,0.60490294,-0.7110581,-0.3379964,1.24998416,0.7259149,0.07662981,-0.3423809
-0.1726799,-0.97448132,-0.44364844,0.7699584,-0.30437346,-0.3638914,-0.5506192,0.23385197,-0.5222776,-0.35593285,0.3047267

quality
5
6
5
6
5
7


fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1.49908111,0.01087568,0.5438271,0.1158782,0.19572856,-0.3060303,-0.975864859,0.3708961,0.4631375,-0.44244538,-0.3423809
0.18555461,-0.08766002,0.2146686,2.8511224,-0.03159054,0.5618863,0.0,2.3429948,0.0,-0.35593285,-1.3939307
-0.05326839,-0.18619572,0.7084064,-0.9147935,0.15026474,0.3304419,-0.385245929,-0.3310373,1.8427187,-0.09639526,0.0
0.90202361,3.853768,-1.5957033,-0.9544347,1.28686023,-0.5953359,0.701492903,-0.0970595,-0.9164436,0.16314234,0.0
0.18555461,0.50355418,-0.1144899,-1.0138966,0.33212002,0.1568585,-0.007249813,-1.1332469,-0.3908889,-1.82664586,0.547392
-0.76973739,1.78451829,0.0,-0.379637,0.92314968,1.5455251,1.622858434,-0.0302087,0.3974432,0.24965487,-0.5850462

quality
6
6
8
5
6
5


In [4]:
# Report the (rows, columns) of X and Y for each split.
lapply(X = feature_train, FUN = dim)
lapply(X = feature_test, FUN = dim)