### Create Encoder from training data
* This step defines how the data will be processed, based on information learned from the training dataset. That can include encoders for text and categorical variables, the mean and standard deviation used to scale numeric variables, or imputation values for missing values.
* The output of this step is used to transform data into a format that the model can train on or predict from.

#### Input 
* train.parquet

#### Output
* preprocessor_X.pkl
* preprocessor_Y.pkl

In [1]:
# Imputation strategy for missing feature values; it is passed to
# SimpleImputer(strategy=...) when the feature pipeline is built.
method = "mean" # or "median" 

In [2]:
import os 
import pandas as pd
import cloudpickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

# Name of the target column; every other column is treated as a feature.
var_y = "quality"

# Data folders, expressed relative to the project root. The root is taken to
# be the parent of the current working directory.
interim_folder = "/data/interim/"
processed_folder = "/data/processed/"
project_dir = os.path.dirname(os.getcwd())

# Load the training split that the preprocessors will be fitted on.
train = pd.read_parquet(f"{project_dir}{interim_folder}train.parquet")

# Feature names: all columns of the training frame except the target.
var_numeric_x = train.columns.tolist()
var_numeric_x.remove(var_y)

In [3]:
# Feature pipeline: fill missing values using the statistic selected by
# `method`, then standardize each column with StandardScaler.
preprocessor_X = Pipeline(steps=[
    ('x_imputer', SimpleImputer(strategy=method)),
    ('x_scaler', StandardScaler()),
])

# Target pipeline: rescale the target with MinMaxScaler (default settings).
preprocessor_Y = Pipeline(steps=[
    ('y_scaler', MinMaxScaler()),
])

# Split the training frame into features and target. The target is kept as a
# one-column DataFrame (not a Series) before being handed to the scaler.
train_X_untransformed = train.drop([var_y], axis=1)
train_Y_untransformed = train[var_y].to_frame()

# Learn the imputation/scaling statistics from the training data only.
preprocessor_X.fit(train_X_untransformed)
preprocessor_Y.fit(train_Y_untransformed)

# Make sure the output folder exists; otherwise open(..., 'wb') would raise
# FileNotFoundError after the fitting work has already been done.
os.makedirs(project_dir + processed_folder, exist_ok=True)

# Persist the fitted pipelines so later steps can apply the same transforms.
with open(project_dir + processed_folder + 'preprocessor_X.pkl', 'wb') as f:
    cloudpickle.dump(preprocessor_X, f)
with open(project_dir + processed_folder + 'preprocessor_Y.pkl', 'wb') as f:
    cloudpickle.dump(preprocessor_Y, f)

In [4]:
# Show the fitted feature pipeline and its parameters.
preprocessor_X

Pipeline(memory=None,
         steps=[('x_imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('x_scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True))],
         verbose=False)

In [5]:
# Show the fitted target pipeline and its parameters.
preprocessor_Y

Pipeline(memory=None,
         steps=[('y_scaler', MinMaxScaler(copy=True, feature_range=(0, 1)))],
         verbose=False)