### Split Data into Train and Test Datasets

#### Input 
* winequality-red.csv

#### Output
* train.parquet
* test.parquet

In [1]:
# params
percent_train = 0.8  # fraction of rows that goes to the training set
random_seed = 123    # seed handed to set.seed() so the split is reproducible

In [2]:
suppressMessages(library(dplyr))

# file and directory info
# The project root is taken to be the parent of the current working directory;
# the folder strings carry their own leading/trailing slashes so they can be
# concatenated directly with paste0().
file_name = 'winequality-red.csv'
project_dir = dirname(getwd())
raw_folder = "/data/raw/"
interim_folder = "/data/interim/"

# input
# Read the raw CSV with data.table::fread and convert the result to a tibble.
raw_path = paste0(project_dir, raw_folder, file_name)
raw_data = dplyr::as_tibble(data.table::fread(raw_path, fill = TRUE))

# functions
# Randomly partition the rows of `raw_data` into a train and a test set.
#
# Args:
#   raw_data:      data frame (or tibble) whose rows are split.
#   random_seed:   value passed to set.seed() so the split is reproducible.
#   percent_train: single number between 0 and 1; the train set receives
#                  floor(percent_train * nrow(raw_data)) rows.
#
# Returns:
#   A list with elements `train` and `test`. Every row of `raw_data` ends up
#   in exactly one of the two; `test` keeps the original row order.
SplitData = function(raw_data, random_seed, percent_train){
    stopifnot(
        is.numeric(percent_train),
        length(percent_train) == 1L,
        !is.na(percent_train),
        percent_train >= 0,
        percent_train <= 1
    )

    set.seed(random_seed)

    n_rows = nrow(raw_data)

    # seq_len() gives an empty index for a 0-row input; 1:0 would be c(1, 0)
    # and end up selecting a row that does not exist.
    all_index = seq_len(n_rows)

    # sample.int(n, k) draws the same indices as sample(1:n, k) for a given
    # seed. floor() makes explicit the truncation of a fractional size that
    # sample() otherwise performs silently.
    train_index = sample.int(n_rows, floor(percent_train * n_rows))
    test_index = setdiff(all_index, train_index)

    list(
        train = raw_data[train_index, ],
        test = raw_data[test_index, ]
    )
}

# output and save
# Split the raw data, then write each part as a parquet file into the
# interim data folder.
train_test = SplitData(raw_data, random_seed, percent_train)
interim_dir = paste0(project_dir, interim_folder)
train_path = paste0(interim_dir, "train.parquet")
test_path = paste0(interim_dir, "test.parquet")
arrow::write_parquet(train_test$train, train_path)
arrow::write_parquet(train_test$test, test_path)

In [3]:
# Sanity check on the input: its dimensions, then its first six rows.
dim(raw_data)
head(raw_data, n = 6L)

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
7.4,0.66,0.0,1.8,0.075,13,40,0.9978,3.51,0.56,9.4,5


In [4]:
# Sanity check on the split: dimensions of each part, then the first six rows
# of each part.
lapply(train_test, dim)
lapply(train_test, head, n = 6L)

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
8.8,0.52,0.34,2.7,0.087,24,122,0.9982,3.26,0.61,9.5,5
11.0,0.26,0.68,2.55,0.085,10,25,0.997,3.18,0.61,11.8,5
7.0,0.805,0.0,2.5,0.068,7,20,0.9969,3.48,0.56,9.6,5
10.4,0.64,0.24,2.8,0.105,29,53,0.9998,3.24,0.67,9.9,5
7.6,0.55,0.21,2.2,0.071,7,28,0.9964,3.28,0.55,9.7,5
12.0,0.63,0.5,1.4,0.071,6,26,0.99791,3.07,0.6,10.4,4

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
7.9,0.6,0.06,1.6,0.069,15,59,0.9964,3.3,0.46,9.4,5
8.9,0.62,0.18,3.8,0.176,52,145,0.9986,3.16,0.88,9.2,5
7.6,0.39,0.31,2.3,0.082,23,71,0.9982,3.52,0.65,9.7,5
7.9,0.43,0.21,1.6,0.106,10,37,0.9966,3.17,0.91,9.5,5
7.6,0.41,0.24,1.8,0.08,4,11,0.9962,3.28,0.59,9.5,5
