# **Categorical features preprocessing**

In [28]:
import tensorflow as tf

**tf.keras.layers.CategoryEncoding**: This layer converts integer-encoded categorical data (ids in the range `0` to `num_tokens - 1`) into a numerical format, such as one-hot vectors, that can be used in machine learning models.

In [29]:
# Six integer category ids; the distinct values are 0, 1 and 2.
categories = tf.constant([0, 1, 2, 0, 1, 2])

In [30]:
encoding_layer = tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode="one_hot")
# num_tokens=3: the layer supports three category ids (0, 1 and 2).
# output_mode="one_hot": each id is converted to a binary vector that has a
# single 1 at the position of the id and 0s everywhere else.

In [31]:
# Apply the layer: one output row is produced per input id.
encoded_data = encoding_layer(categories)

In [32]:
# Convert the tensor to a NumPy array so the notebook displays it.
encoded_data.numpy()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [56]:
categories2 = tf.constant([1,2,4,1,4,2,1,3,4,3])

In [63]:
encoding_layer2= tf.keras.layers.CategoryEncoding(num_tokens=4 , output_mode="one_hot")

In [64]:
encoded_data2 = encoding_layer2(categories2)

In [65]:
encoded_data2.numpy()

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

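**tf.keras.layers.Hashing**: Hashing maps categorical data to an integer bin within a fixed range (`0` to `num_bins - 1`) using a hash function. No vocabulary has to be learned, but distinct categories can collide, i.e. end up in the same bin.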

In [33]:
# Rebinds `categories` (previously an integer tensor) to six string labels
# with three distinct values.
categories = tf.constant(["apple", "banana", "cherry", "apple", "banana", "cherry"])

In [34]:
# num_bins=3: every input is hashed into one of three bins.
hashing_layer = tf.keras.layers.Hashing(num_bins=3)

In [35]:
# Hash each string to its bin index.
hashed_data = hashing_layer(categories)

In [36]:
# In the recorded output, "apple" and "cherry" both land in bin 1 (a hash
# collision) while "banana" lands in bin 0.
hashed_data.numpy()

array([1, 0, 1, 1, 0, 1])

In [66]:
# Seven string labels with three distinct values: 'A', 'B' and 'C'.
categories3 = tf.constant(['A', 'B', 'C', 'B', 'B', 'A', 'C'])

In [71]:
# Four bins for three distinct labels; collisions are still possible.
hashing_layer2= tf.keras.layers.Hashing(num_bins= 4)

In [72]:
# Hash each label to its bin index.
hashed_data2 = hashing_layer2(categories3)

In [73]:
# In the recorded output, 'A' and 'B' collide in bin 0 and 'C' lands in bin 3.
hashed_data2.numpy()

array([0, 0, 3, 0, 0, 0, 3])

**tf.keras.layers.StringLookup** : StringLookup is used to map a set of string values to integer indices, which is essential for converting string categorical data into a numerical format that models can understand.

In [37]:
# Created without a vocabulary; the vocabulary is learned by adapt() below.
lookup_layer = tf.keras.layers.StringLookup()

In [38]:
# NOTE: `categories` is the string tensor ("apple"/"banana"/"cherry") defined
# in the Hashing section, not the integer tensor from the first section.
# adapt() builds the vocabulary from this data.
lookup_layer.adapt(categories)

In [39]:
# Rebinds `encoded_data` (previously the one-hot result) to the looked-up
# integer indices.
encoded_data = lookup_layer(categories)

In [40]:
# Recorded output: cherry -> 1, banana -> 2, apple -> 3. Index 0 does not
# appear; presumably it is reserved for out-of-vocabulary strings (the
# StringLookup default) -- confirm against the TensorFlow documentation.
encoded_data.numpy()

array([3, 2, 1, 3, 2, 1])

In [74]:
# Same values as the `categories3` tensor defined in the Hashing section;
# redefined here so this example does not depend on that earlier cell.
categories3 = tf.constant(['A', 'B', 'C', 'B', 'B', 'A', 'C'])

In [75]:
# Created without a vocabulary; the vocabulary is learned by adapt() below.
lookup_layer2 = tf.keras.layers.StringLookup()

In [79]:
# Build the vocabulary from the labels.
lookup_layer2.adapt(categories3)

In [80]:
# Map each label to its vocabulary index.
encoded_data3 = lookup_layer2(categories3)

In [84]:
# Recorded output: 'B' (3 occurrences) -> 1, 'C' (2) -> 2, 'A' (2) -> 3.
encoded_data3.numpy()

array([3, 1, 2, 1, 1, 3, 2])

In [41]:
# adapt() builds the layer's vocabulary from the input data, so it is called before the layer is applied.

**tf.keras.layers.IntegerLookup**: IntegerLookup is similar to StringLookup, but it works with integer categorical data. It maps integers to a contiguous range of indices, which is useful when your integer categories aren't sequential or don't start from zero.

In [42]:
# Rebinds `categories` once more, now to integer ids that do not start at
# zero: 100 (3 occurrences), 102 (2) and 101 (1).
categories = tf.constant([100, 100, 102, 100, 101, 102])

In [43]:
# Created without a vocabulary; the vocabulary is learned by adapt() below.
integer_lookup_layer = tf.keras.layers.IntegerLookup()

In [44]:
# Build the vocabulary from the ids.
integer_lookup_layer.adapt(categories)

In [45]:
# Rebinds `encoded_data` to the looked-up indices for this example.
encoded_data = integer_lookup_layer(categories)

In [46]:
# Recorded output: 100 -> 1, 102 -> 2, 101 -> 3, i.e. more frequent ids
# received lower indices.
encoded_data.numpy()

array([1, 1, 2, 1, 3, 2])

In [85]:
# Eight non-sequential integer ids; 55, 33 and 88 occur twice, 44 and 22 once.
categories4 = tf.constant([55,33,88,44,22,33,55,88])

In [86]:
# Created without a vocabulary; the vocabulary is learned by adapt() below.
integer_lookup_layer2 = tf.keras.layers.IntegerLookup()

In [87]:
# Build the vocabulary from the ids.
integer_lookup_layer2.adapt(categories4)

In [88]:
# Map each id to its vocabulary index.
encoded_data4 = integer_lookup_layer2(categories4)

In [89]:
# Recorded output: 88 -> 1, 55 -> 2, 33 -> 3, 44 -> 4, 22 -> 5.
encoded_data4.numpy()

array([2, 3, 1, 4, 5, 3, 2, 1])