# Preprocessing Encoder, Pipeline, SequentialEncoder and FeatureUnion example

The easyflow.preprocessing module provides functionality similar to sklearn's Pipeline, FeatureUnion and ColumnTransformer.

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import Normalization, CategoryEncoding, StringLookup

In [5]:
# local imports
from easyflow.data.mapper import TensorflowDataMapper
from easyflow.preprocessing.preprocessor import Encoder, Pipeline, SequentialEncoder, FeatureUnion
from easyflow.preprocessing.custom import IdentityPreprocessingLayer

## Read in data and map as tf.data.Dataset

Use the TensorflowDataMapper class to map a pandas DataFrame to a tf.data.Dataset.

In [6]:
# Download the heart-disease CSV and separate the label column from the features.
file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
dataframe = pd.read_csv(file_url)
# pop() removes "target" from the frame in place, so `dataframe` holds only
# the feature columns from here on.
labels = dataframe.pop("target")

batch_size = 32
# Map features and labels to a tf.data.Dataset, split it, then batch both parts.
dataset_mapper = TensorflowDataMapper()
dataset = dataset_mapper.map(dataframe, labels)
train_data_set, val_data_set = dataset_mapper.split_data_set(dataset)
train_data_set = train_data_set.batch(batch_size)
val_data_set = val_data_set.batch(batch_size)

## Set constants

In [7]:
# Column names grouped by the preprocessing each group receives.
NUMERICAL_FEATURES = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
    "slope",
]
CATEGORICAL_FEATURES = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "ca",
]
# thal holds string values, so it is listed separately
STRING_CATEGORICAL_FEATURES = ["thal"]

## Setup Preprocessing layer using FeatureUnion

Use Encoder and SequentialEncoder to preprocess features by putting everything in a FeatureUnion object

In [16]:
# One encoder per feature group, each given as (name, layer class, feature names).
feature_encoder_list = [
    Encoder([("numeric_encoder", Normalization, NUMERICAL_FEATURES)]),
    Encoder([("categorical_encoder", CategoryEncoding, CATEGORICAL_FEATURES)]),
    # thal is a string column: StringLookup runs first, then CategoryEncoding
    SequentialEncoder(
        [
            ("string_encoder", StringLookup, STRING_CATEGORICAL_FEATURES),
            ("categorical_encoder", CategoryEncoding, STRING_CATEGORICAL_FEATURES),
        ]
    ),
]


In [17]:
# FeatureUnion concatenates the outputs of all encoders in the list.
encoder = FeatureUnion(feature_encoder_list)
# encode() returns the model inputs together with the preprocessed output
# that the network below is built on.
all_feature_inputs, preprocessing_layer = encoder.encode(dataset)

In [18]:
# Show the symbolic output tensor produced by the FeatureUnion encoder
print(preprocessing_layer)

Tensor("concatenate_1/concat:0", shape=(None, 31), dtype=float32)


In [21]:
# setup simple network
x = tf.keras.layers.Dense(128, activation="relu")(preprocessing_layer)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=all_feature_inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'), tf.keras.metrics.AUC(name='auc')])
history=model.fit(train_data_set, validation_data=val_data_set, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## More flexibility with Pipeline

FeatureUnion subclasses Pipeline and concatenates (i.e. unions) the layers. When you need more flexibility, for example to build a wide and deep neural network, use the Pipeline class directly.

In [28]:
# Same encoders as in the FeatureUnion example, this time fed to a Pipeline.
feature_encoder_list = [
    Encoder([("numeric_encoder", Normalization, NUMERICAL_FEATURES)]),
    Encoder([("categorical_encoder", CategoryEncoding, CATEGORICAL_FEATURES)]),
    # thal is a string column: StringLookup runs first, then CategoryEncoding
    SequentialEncoder(
        [
            ("string_encoder", StringLookup, STRING_CATEGORICAL_FEATURES),
            ("categorical_encoder", CategoryEncoding, STRING_CATEGORICAL_FEATURES),
        ]
    ),
]

encoder = Pipeline(feature_encoder_list)
# The "1" suffix keeps the FeatureUnion results from the earlier cell intact
all_feature_inputs1, preprocessing_layer1 = encoder.encode(dataset)

In [24]:
# Print the Pipeline result (preprocessing_layer1), not the FeatureUnion
# tensor that is still bound to preprocessing_layer from the earlier section.
print(preprocessing_layer1)

[<tf.Tensor 'normalization_43/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'normalization_44/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'normalization_45/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'normalization_46/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'normalization_47/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'normalization_48/truediv:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'category_encoding_31/bincount/DenseBincount:0' shape=(None, 2) dtype=float32>, <tf.Tensor 'category_encoding_32/bincount/DenseBincount:0' shape=(None, 5) dtype=float32>, <tf.Tensor 'category_encoding_33/bincount/DenseBincount:0' shape=(None, 2) dtype=float32>, <tf.Tensor 'category_encoding_34/bincount/DenseBincount:0' shape=(None, 3) dtype=float32>, <tf.Tensor 'category_encoding_35/bincount/DenseBincount:0' shape=(None, 2) dtype=float32>, <tf.Tensor 'category_encoding_36/bincount/DenseBincount:0' shape=(None, 4) dtype=float32>, <tf.Tensor 'category_