### Prepare Data for Model 

* Convert the data into a format that the model can consume. The same preparation must be applied consistently to the train, test, and prediction datasets.

#### Input
* train.parquet
* test.parquet
* preprocessor_X.pkl
* preprocessor_Y.pkl

#### Output
* train_X.parquet
* train_Y.parquet
* test_X.parquet
* test_Y.parquet

In [1]:
import os
import pandas as pd
import pyarrow 
import cloudpickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --- Configuration ----------------------------------------------------------
# The project root is taken to be the parent of the current working directory.
project_dir = os.path.dirname(os.getcwd())
interim_folder = "/data/interim/"
processed_folder = "/data/processed/"
var_y = 'quality'  # name of the target column

# Folder strings start with "/" and are appended to the project root by plain
# concatenation (os.path.join would discard project_dir for such components).
interim_dir = project_dir + interim_folder
processed_dir = project_dir + processed_folder


def _load_pickled(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): unpickling can execute arbitrary code. Only load
    preprocessor files that were produced by this project's own pipeline.
    """
    with open(path, 'rb') as f:
        return cloudpickle.load(f)


def _transform_to_frame(preprocessor, frame):
    """Run ``preprocessor.transform(frame)`` and wrap the result in a DataFrame.

    The returned frame reuses the column labels of ``frame``.
    Assumes the transformer yields one output column per input column
    (TODO confirm for the fitted preprocessors).
    NOTE(review): unless the transformer returns a pandas object, the rows
    get a fresh positional index rather than the index of ``frame`` - confirm
    that downstream steps do not rely on the original index.
    """
    return pd.DataFrame(preprocessor.transform(frame), columns=frame.columns)


# --- Load inputs ------------------------------------------------------------
train = pd.read_parquet(interim_dir + 'train.parquet')
test = pd.read_parquet(interim_dir + 'test.parquet')

# The preprocessors are only applied here (transform), never re-fitted.
preprocessor_X = _load_pickled(processed_dir + 'preprocessor_X.pkl')
preprocessor_Y = _load_pickled(processed_dir + 'preprocessor_Y.pkl')

# --- Split features / target ------------------------------------------------
train_X_untransformed = train.drop([var_y], axis=1)
train_Y_untransformed = train[var_y].to_frame()
test_X_untransformed = test.drop([var_y], axis=1)
test_Y_untransformed = test[var_y].to_frame()

# --- Apply the preprocessors ------------------------------------------------
# Train and test go through exactly the same transform path.
train_X = _transform_to_frame(preprocessor_X, train_X_untransformed)
train_Y = _transform_to_frame(preprocessor_Y, train_Y_untransformed)
test_X = _transform_to_frame(preprocessor_X, test_X_untransformed)
test_Y = _transform_to_frame(preprocessor_Y, test_Y_untransformed)

# --- Persist outputs --------------------------------------------------------
# Outputs are written next to the inputs, in the interim folder.
train_X.to_parquet(interim_dir + 'train_X.parquet')
train_Y.to_parquet(interim_dir + 'train_Y.parquet')
test_X.to_parquet(interim_dir + 'test_X.parquet')
test_Y.to_parquet(interim_dir + 'test_Y.parquet')

In [2]:
# Quick look at the transformed training data: feature-matrix shape, then the
# first five rows of the features and of the target, one item per line.
print(train_X.shape, train_X.head(), train_Y.head(), sep="\n")

(319, 11)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.998629         -0.641701     0.642453       -0.052631  -0.375638   
1      -0.972326         -0.306215    -0.194047       -0.531471  -0.174621   
2      -0.803387          0.588416    -0.507735       -0.651181  -0.509649   
3      -1.084952         -0.977188    -0.037203        0.126934  -0.397973   
4       0.153934         -1.144931     0.956141       -0.711036  -0.576655   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             2.480130              0.297186  0.378143  0.556825   1.065540   
1             0.227783             -0.368677 -0.984600  0.624908  -0.384672   
2            -0.849427              0.164013 -0.835394 -1.485650  -0.384672   
3            -0.751499             -0.035746 -1.462057 -0.600577  -0.607781   
4            -0.457714             -0.668316 -0.089367  0.420660   0.507766   

    alcohol  
0  0.084631  
1  0.177994  
2 -1

In [3]:
# Quick look at the transformed test data: feature-matrix shape, then the
# first five rows of the features and of the target, one item per line.
print(test_X.shape, test_X.head(), test_Y.head(), sep="\n")

(1280, 11)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.886003         -0.306215     0.851578        0.186789  -0.442643   
1       0.604438          0.308844    -0.037203       -0.172341  -0.062945   
2      -0.634448          2.209934    -0.925985       -0.471616  -0.241626   
3      -0.127631         -1.592246     0.747016       -0.411761  -0.353303   
4      -0.296570         -0.809444    -0.246328       -0.471616  -0.174621   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0            -1.143211             -1.234299 -0.019738  0.216413  -0.273117   
1             0.717423              3.293569  0.517401 -0.192082  -0.663558   
2             0.423639              0.363772 -0.029685  0.352578  -0.607781   
3            -0.653571             -0.934661 -1.501845 -0.600577  -0.719336   
4             0.717423              0.130720 -0.238573 -0.124000   0.005770   

    alcohol  
0  1.671796  
1 -0.942358  
2 -