# How to the TextVectorization Layer in TensorFLow

This video walks you through how to use the TextVectorization layer in TensorFlow

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [2]:
# Instanting
text_vectorization = TextVectorization()

2024-05-11 13:04:15.575246: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
data = [
    "Bugun gunlerden cumartesi",
    "Erdem, Ali, Ahmet ve Mehmet Parka gidecek",
    "Aksam geri gelecekler"
    "Yatip uyuyacaklar"
]

In [4]:
# Creating the vocabulary with the adapt method.
text_vectorization.adapt(data)

In [5]:
# Let's take a look at the vocabulary.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 've',
 'uyuyacaklar',
 'parka',
 'mehmet',
 'gunlerden',
 'gidecek',
 'geri',
 'gelecekleryatip',
 'erdem',
 'cumartesi',
 'bugun',
 'ali',
 'aksam',
 'ahmet']

In [6]:
# Data preprocessing with the layer
vectorized_text = text_vectorization(data)
vectorized_text

<tf.Tensor: shape=(3, 7), dtype=int64, numpy=
array([[12,  6, 11,  0,  0,  0,  0],
       [10, 13, 15,  2,  5,  4,  7],
       [14,  8,  9,  3,  0,  0,  0]])>

# Using the custom functions TextVectorization

In [7]:
import re
import string

In [8]:
def standardization_fn(string_tensor):
  lowercase=tf.strings.lower(string_tensor)
  return tf.strings.regex_replace(
      lowercase, f"[{re.escape(string.punctuation)}]", ""
  )

In [9]:
def split_fn(string_tensor):
  return tf.strings.split(string_tensor)

In [10]:
text_vectorization = TextVectorization(
    standardize=standardization_fn,
    split = split_fn
)

In [11]:
text_vectorization.adapt(data)

In [12]:
# Testing our layer with a text
text = "bugun parka gidecek"
text_vectorization(text)

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([12,  4,  7])>

# Using TextVectorization in a model

In [13]:
# Creating a Dataset object
text_dataset = tf.data.Dataset.from_tensor_slices([
    "trabzonspor", "real_madrid", "alabama"
])

In [14]:
# Creating the TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=5000,
    output_sequence_length=4
)

In [15]:
# Creating the vocabulary
vectorize_layer.adapt(text_dataset.batch(64))

In [16]:
vectorize_layer.get_vocabulary()

['', '[UNK]', 'trabzonspor', 'realmadrid', 'alabama']

In [17]:
# Building the model
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer
])

In [18]:
# Getting a data for testing
input_data=[["sampiyon trabzonspor"], ["winner real_madrid"], ["Sweet home alabama"]]

In [19]:
model.predict(input_data)



array([[1, 2, 0, 0],
       [1, 3, 0, 0],
       [1, 1, 4, 0]])