In [3]:
import tensorflow as tf
import string
import re

# Configuration constants for the text-vectorization demo in this cell.
# NOTE(review): `max_features` is not referenced anywhere in the visible code;
# the TextVectorization layer is built with a literal max_tokens=100 instead.
# Confirm which value is intended.
max_features = 96
# NOTE(review): `embedding_dim` is not referenced in the visible code (no
# embedding layer is added in this cell).
embedding_dim = 16
# Passed as `output_sequence_length` to the TextVectorization layer.
sequence_length = 100
def custom_standardization(input_data):
    """Normalize raw text before it is split into tokens.

    The steps run in this order:

    1. lowercase the text;
    2. replace each literal ``<br />`` tag with a single space;
    3. replace every digit and every hyphen with a single space;
    4. delete every remaining character found in ``string.punctuation``.

    Args:
        input_data: The text to normalize. It is handed directly to
            ``tf.strings.lower``, so it must be something that op accepts.

    Returns:
        The output of the last ``tf.strings.regex_replace`` call, i.e. the
        standardized text.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # Raw string on purpose: in a normal literal "\d" is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+). The regex itself is unchanged.
    # Hyphens are handled here (-> space) before the punctuation pass below,
    # which would otherwise delete them without leaving a separator.
    stripped_num = tf.strings.regex_replace(stripped_html, r"[\d-]", " ")
    # Character class matching any single ASCII punctuation character.
    punctuation_class = "[%s]" % re.escape(string.punctuation)
    stripped_punc = tf.strings.regex_replace(stripped_num, punctuation_class, "")
    return stripped_punc
    
def char_split(input_data):
    """Split text into its individual characters.

    Thin wrapper around ``tf.strings.unicode_split`` that treats the input
    as UTF-8 encoded text, so each token is one character.

    Args:
        input_data: The text to split; forwarded unchanged to
            ``tf.strings.unicode_split``.

    Returns:
        Whatever ``tf.strings.unicode_split`` returns for the given input.
    """
    characters = tf.strings.unicode_split(input_data, input_encoding='UTF-8')
    return characters
    
# Character-level text vectorizer: text is cleaned by `custom_standardization`,
# split into single characters by `char_split`, and each character is mapped
# to an integer vocabulary index.
# NOTE(review): max_tokens is a literal 100 here, while `max_features` (96) is
# defined earlier in this cell and never used -- confirm which is intended.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=100,
    standardize=custom_standardization,
    split=char_split,
    output_mode="int",
    # Every output row is padded or truncated to this many indices.
    output_sequence_length=sequence_length,
)

# Small in-memory corpus, one string per element, used to build the vocabulary.
text_dataset = tf.data.Dataset.from_tensor_slices(
    ["Daniel Perez Efremova", "bar", "baz"]
)

# The layer exists but has no vocabulary yet: `adapt` builds it from the
# text-only dataset. Batching is optional; for large datasets it avoids
# keeping spare copies of the data around.
vectorize_layer.adapt(
    text_dataset.batch(64)
)

# Wrap the adapted vectorizer in a Sequential model so raw strings can be fed
# straight to `predict`.
model = tf.keras.models.Sequential([
    # Explicit input: shape (1,) means exactly one string per example, and the
    # dtype must be string so the text reaches the vectorizer untouched.
    tf.keras.Input(shape=(1,), dtype=tf.string),
    # Turns each string into a row of vocabulary indices, giving a tensor of
    # shape (batch_size, sequence_length).
    vectorize_layer,
])

# At this point the model only maps strings to integer indices; an embedding
# layer could be stacked on top to turn those indices into learned vectors,
# but none is added in this cell.
input_data = [["Daniel Perez Efremova"], ["qux baz"]]
model.predict(input_data)




[1;35marray[0m[1m([0m[1m[[0m[1m[[0m[1;36m16[0m,  [1;36m3[0m, [1;36m11[0m, [1;36m14[0m,  [1;36m2[0m, [1;36m13[0m,  [1;36m7[0m,  [1;36m9[0m,  [1;36m2[0m,  [1;36m4[0m,  [1;36m2[0m,  [1;36m5[0m,  [1;36m7[0m,  [1;36m2[0m, [1;36m15[0m,  [1;36m4[0m,
         [1;36m2[0m, [1;36m12[0m, [1;36m10[0m,  [1;36m8[0m,  [1;36m3[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,
         [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,
         [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  [1;36m0[0m,  