### Split Data into Train and Test Datasets

#### Input 
* wine-quality.csv

#### Output
* train.parquet
* test.parquet

In [1]:
suppressMessages(library(dplyr))

# Parameters
# random_seed:   seed handed to the splitting step so the partition is reproducible
# percent_train: fraction of rows assigned to the training set
random_seed <- 123
percent_train <- 0.8

In [2]:
# Project layout: the raw CSV lives under <project_dir>/data/raw/ and the
# split outputs are written under <project_dir>/data/interim/.
# NOTE(review): project_dir is an absolute, machine-specific path; the notebook
# only runs as-is on a machine where this directory exists.
file_name <- "wine-quality.csv"
project_dir <- "/Users/chou/Desktop/mlflow_dvc_cookiecutter/DSProjectTemplate"
raw_folder <- "/data/raw/"
interim_folder <- "/data/interim/"

# Load the raw dataset. The folder strings already carry their leading and
# trailing slashes, so plain concatenation yields the full path.
raw_data <- data.table::fread(
  paste0(project_dir, raw_folder, file_name),
  fill = TRUE
)

# functions
#' Randomly split a dataset into train and test partitions.
#'
#' Seeds R's global random number generator with `random_seed`, draws row
#' indices without replacement for the train partition, and assigns every
#' remaining row to the test partition.
#'
#' @param raw_data A row-indexable, two-dimensional object such as a
#'   data.frame or data.table.
#' @param random_seed Value passed to `set.seed()` so the split is reproducible.
#' @param percent_train Single number in [0, 1]: the fraction of rows placed in
#'   the train partition. A fractional row count is truncated by the sampler.
#' @return A named list with two elements, `train` and `test`, each holding the
#'   corresponding rows of `raw_data`.
SplitData <- function(raw_data, random_seed, percent_train) {
  stopifnot(
    is.numeric(percent_train),
    length(percent_train) == 1L,
    !is.na(percent_train),
    percent_train >= 0,
    percent_train <= 1
  )

  set.seed(random_seed)

  n_rows <- nrow(raw_data)

  # seq_len() instead of 1:n_rows: for zero-row input 1:0 evaluates to c(1, 0),
  # which would make the test partition reference a non-existent row 1.
  all_index <- seq_len(n_rows)

  # sample.int(n, size) is the same draw that sample(1:n, size) performs for
  # n >= 1, so splits produced with an existing seed are unchanged.
  train_index <- sample.int(n_rows, size = percent_train * n_rows)
  test_index <- setdiff(all_index, train_index)

  list(
    train = raw_data[train_index, ],
    test = raw_data[test_index, ]
  )
}

# Output: split the raw data, then save each partition as a Parquet file in
# the interim data folder.
train_test <- SplitData(raw_data, random_seed, percent_train)
arrow::write_parquet(
  train_test[["train"]],
  paste0(project_dir, interim_folder, "train.parquet")
)
arrow::write_parquet(
  train_test[["test"]],
  paste0(project_dir, interim_folder, "test.parquet")
)

In [3]:
# Preview the first six rows of the raw dataset
head(raw_data, n = 6L)

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
