<a href="https://colab.research.google.com/github/gnovack/tf-one-hot-encoder/blob/master/OneHotEncoderLayer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow.keras import layers, models

## One Hot Encoding using Sci-kit Learn's OneHotEncoder Class

In [2]:
# Toy data set: a single categorical 'color' column with four rows.
colors_df = pd.DataFrame(data=[['red'],['blue'],['green'],['blue']], columns=['color'])
print('Before One Hot Encoding:')
display(colors_df)

# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword
# was removed in 1.4); switch the keyword when running on a newer release.
one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(colors_df)

# `categories_` holds one array per fitted input column. Only 'color' was
# fitted, so take element 0 to get flat column labels; passing the whole list
# would make pandas build a one-level MultiIndex instead.
colors_df_encoded = pd.DataFrame(
    data=one_hot_encoder.transform(colors_df),
    columns=one_hot_encoder.categories_[0],
)
print('\n\nAfter One Hot Encoding:')
display(colors_df_encoded)

Before One Hot Encoding:


Unnamed: 0,color
0,red
1,blue
2,green
3,blue




After One Hot Encoding:


Unnamed: 0,blue,green,red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0


## One Hot Encoding in a Sci-kit Learn Pipeline
A simple Pipeline that bundles One Hot Encoding logic with a Linear Regression model.

In [3]:
# Regression targets, one per row of colors_df.
labels = [10,4,6,4]

linear_regression = LinearRegression()

# Bundle the encoder and the regressor so raw color strings can be fed
# straight into fit() / predict().
pipeline_steps = [
    ('one_hot_encoder', one_hot_encoder),
    ('linear_regression', linear_regression),
]
pipeline = Pipeline(steps=pipeline_steps)
pipeline.fit(colors_df, labels)

# Predict for a couple of individual colors; each input is a 1x1 batch.
for caption, color in (('Red Prediction:', 'red'), ('Blue Prediction:', 'blue')):
  print(caption, pipeline.predict([[color]])[0])

Red Prediction: 10.0
Blue Prediction: 3.9999999999999987


## One Hot Encoding using the tf.one_hot Operation
The tf.one_hot operation converts integer indices into One Hot Encoded features

In [4]:
# Integer category ids to encode, and the number of distinct categories
# (which becomes the width of each encoded row).
category_indices = [0, 1, 2, 2, 1, 0]
unique_category_count = 3

inputs = tf.one_hot(indices=category_indices, depth=unique_category_count)
print(inputs.numpy())
# Expected output (row i encodes category_indices[i]):
# [[1. 0. 0.]   <- category '0'
#  [0. 1. 0.]   <- category '1'
#  [0. 0. 1.]   <- category '2'
#  [0. 0. 1.]      ...
#  [0. 1. 0.]      ...
#  [1. 0. 0.]]     ...

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


## TextVectorization Layer
The TextVectorization Class can be used to convert string categories into integer indices

In [5]:
# output_sequence_length=1 yields exactly one integer index per input string.
text_vectorization = layers.experimental.preprocessing.TextVectorization(output_sequence_length=1)
text_vectorization.adapt(colors_df.values)

# Invoke the layer object itself (i.e. `__call__`) on tensor inputs instead of
# calling its `call()` method directly; `__call__` is the supported Keras
# entry point and performs the layer's input handling before delegating.
print('Red index:', text_vectorization(tf.constant([['red']])))
print('Blue index:', text_vectorization(tf.constant([['blue']])))
print('Green index:', text_vectorization(tf.constant([['green']])))

print(text_vectorization.get_vocabulary()) # prints [b'blue', b'red', b'green']

Red index: tf.Tensor([[3]], shape=(1, 1), dtype=int64)
Blue index: tf.Tensor([[2]], shape=(1, 1), dtype=int64)
Green index: tf.Tensor([[4]], shape=(1, 1), dtype=int64)
[b'blue', b'red', b'green']


## The OneHotEncodingLayer Class
The layer can be used as part of a model to One Hot Encode categorical input features.

*   `adapt()` fits the layer to the categorical inputs
*   `call()` invokes the layer and One Hot Encodes the inputs based on the vocabulary learned during the call to `adapt()`
* `get_config()` returns a configuration dict that represents the state of the layer. This includes the vocabulary (e.g. `['red','green','blue']`), the depth (or the number of unique categories), and the minimum index.



In [0]:
class OneHotEncodingLayer(layers.experimental.preprocessing.PreprocessingLayer):
  """Preprocessing layer that One Hot Encodes string categories.

  An internal `TextVectorization` layer maps each string to an integer index.
  The smallest index produced for the vocabulary (`minimum`) is subtracted so
  that indices start at zero, and the result is expanded with `tf.one_hot`
  into `depth` columns.

  Args:
    vocabulary: Optional sequence of category strings to load into the
      internal `TextVectorization` layer.
    depth: Optional number of columns in the encoded output.
    minimum: Optional smallest index the vectorizer emits for the vocabulary.
    **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to the
      base class.
  """

  def __init__(self, vocabulary=None, depth=None, minimum=None, **kwargs):
    # Forward base-layer arguments so configs produced by `get_config()`
    # (which include them) can be passed straight back to the constructor.
    super().__init__(**kwargs)
    self.vectorization = layers.experimental.preprocessing.TextVectorization(output_sequence_length=1)

    # Explicit None/length check: plain truthiness raises on NumPy arrays.
    if vocabulary is not None and len(vocabulary) > 0:
      self.vectorization.set_vocabulary(vocabulary)
    self.depth = depth
    self.minimum = minimum

  def adapt(self, data):
    """Learns the vocabulary from `data` and derives `depth` and `minimum`."""
    self.vectorization.adapt(data)
    vocab = self.vectorization.get_vocabulary()
    self.depth = len(vocab)
    # Vectorize every vocabulary entry to find the lowest index in use; the
    # output has one index per row because output_sequence_length is 1.
    indices = self.vectorization([[v] for v in vocab]).numpy()
    # Store a plain Python int so the value stays simple to serialize.
    self.minimum = int(indices[:, 0].min())

  def call(self, inputs):
    """One Hot Encodes `inputs`, returning a tensor with `depth` columns.

    Raises:
      ValueError: If `depth` or `minimum` has not been set, either through
        `adapt()` or the constructor.
    """
    if self.depth is None or self.minimum is None:
      raise ValueError(
          'OneHotEncodingLayer has no depth/minimum; call adapt() or pass '
          'depth and minimum to the constructor before using the layer.')
    # Invoke the sub-layer through `__call__`, the supported Keras entry point.
    vectorized = self.vectorization(inputs)
    # Shift indices so the lowest vocabulary index maps to column 0.
    subtracted = tf.subtract(vectorized, tf.constant([self.minimum], dtype=tf.int64))
    encoded = tf.one_hot(subtracted, self.depth)
    # Collapse the length-1 sequence axis with a plain op rather than
    # instantiating a new Keras `Reshape` layer on every call.
    return tf.reshape(encoded, (-1, self.depth))

  def get_config(self):
    """Returns the base layer config extended with this layer's state."""
    config = super().get_config()
    config.update({
        'vocabulary': self.vectorization.get_vocabulary(),
        'depth': self.depth,
        'minimum': self.minimum,
    })
    return config

## Simple Neural Network
A simple network that takes in a single categorical input (*color*), One Hot Encodes it, then concatenates the result with a single numeric input (*id*). 

In [0]:
# Training frame: a numeric 'id' column and a categorical 'color' column.
colors_df = pd.DataFrame({
    'id': [5, 1, 2, 3, 4, 7],
    'color': ['yellow', 'red', 'blue', 'green', 'blue', 'purple'],
})

# Categorical branch: string input -> adapted One Hot Encoding layer.
categorical_input = layers.Input(shape=(1,), dtype=tf.string)
one_hot_layer = OneHotEncodingLayer()
color_values = colors_df['color'].values
one_hot_layer.adapt(color_values)
encoded = one_hot_layer(categorical_input)

# Numeric branch: the id is passed through unchanged.
numeric_input = layers.Input(shape=(1,), dtype=tf.float32)

# Final feature vector: [id, one-hot color columns...].
concat = layers.concatenate([numeric_input, encoded])

# Executing the Model
Since this model has not trainable weights, we don't need to call `fit()` before executing it on some inputs. In a real Model, you would likely have some additional trainable layers after the concatenation that attempt to predict some target value.

In [8]:
# Wire both inputs to the concatenated feature vector.
model = models.Model(
    inputs=[numeric_input, categorical_input],
    outputs=[concat],
)
model.compile()

# Feed the columns in the same order as the model's inputs: id, then color.
model_inputs = [colors_df['id'], colors_df['color']]
predicted = model.predict(model_inputs)
print(predicted)
# Expected output (first column is the id, the rest is the encoded color):
# [[5. 0. 1. 0. 0. 0.]
#  [1. 0. 0. 1. 0. 0.]
#  [2. 1. 0. 0. 0. 0.]
#  [3. 0. 0. 0. 0. 1.]
#  [4. 1. 0. 0. 0. 0.]
#  [7. 0. 0. 0. 1. 0.]]

[[5. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0.]
 [2. 1. 0. 0. 0. 0.]
 [3. 0. 0. 0. 0. 1.]
 [4. 1. 0. 0. 0. 0.]
 [7. 0. 0. 0. 1. 0.]]


## Saving the Model configuration as a JSON object
Since the Custom One Hot Encoding layer class defined a `get_config()` method, we can save it along with the entire model as a JSON object. This model can then be reloaded from the JSON config and used to perform inference.

In [9]:
# Snapshot the model architecture, including the custom layer's state.
config = model.get_config()

# The custom layer class must be registered while the config is deserialized.
custom_layers = {'OneHotEncodingLayer': OneHotEncodingLayer}
with tf.keras.utils.custom_object_scope(custom_layers):
  loaded_model = models.Model.from_config(config)

# The rebuilt model should reproduce the original model's output.
reload_inputs = [colors_df['id'], colors_df['color']]
predicted = loaded_model.predict(reload_inputs)
print(predicted)
# Expected output (identical to the original model):
# [[5. 0. 1. 0. 0. 0.]
#  [1. 0. 0. 1. 0. 0.]
#  [2. 1. 0. 0. 0. 0.]
#  [3. 0. 0. 0. 0. 1.]
#  [4. 1. 0. 0. 0. 0.]
#  [7. 0. 0. 0. 1. 0.]]

[[5. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0.]
 [2. 1. 0. 0. 0. 0.]
 [3. 0. 0. 0. 0. 1.]
 [4. 1. 0. 0. 0. 0.]
 [7. 0. 0. 0. 1. 0.]]
