In [27]:
!pip install -q tensorflow-datasets tensorflow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [28]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

# Ask TensorFlow to use the "cuda_malloc_async" GPU allocator.
# NOTE(review): this is set after `import tensorflow`; presumably the variable
# is only read when the GPU is first initialized — confirm, or move this line
# above the TensorFlow import.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"


In [None]:
# whole_text_dataset = tf.data.Dataset.from_tensor_slices(
#     [
#         "oggi vado al cinema bello",
#         "domani vado in piscina olimpica",
#         "e tardi vado e torno",
#     ]
# )

In [None]:
# text = ""
# with open("./notebooks/il-piccolo-principe.txt", "r") as file:
#     text = file.read()
# whole_text_dataset = tf.data.Dataset.from_tensor_slices(text.split("\n"))

In [None]:
# Active corpus: the "text" field of the first 100 examples of the TFDS
# "wikipedia" train split. The cells below require `whole_text_dataset` to be
# defined, so one of the corpus cells has to stay uncommented for the notebook
# to survive Restart & Run All.
whole_text_dataset = (
    tfds.load("wikipedia", split="train").map(lambda x: x["text"]).take(100)
)

In [29]:

# Number of most recent tokens the model receives as context.
SEQUENCE_WINDOW_SIZE = 64

# Word-level tokenizer: lowercases, strips punctuation and maps every word to
# an integer id, with the vocabulary capped at 80000 entries.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=80000,  # number of english head words
    standardize="lower_and_strip_punctuation",
    output_mode="int",
)
text_vectorizer.adapt(whole_text_dataset)

# The id right after the vectorizer's own vocabulary is used as the
# end-of-sequence marker, so the model's vocabulary is one entry larger.
END_OF_SEQUENCE_TOKEN_INDEX = text_vectorizer.vocabulary_size()
vocabulary_size = END_OF_SEQUENCE_TOKEN_INDEX + 1

# Every text of the corpus mapped to its token ids.
whole_text_tokenized_dataset = whole_text_dataset.map(text_vectorizer)

def truncate_and_pad_sequence(sequence):
    """Fit `sequence` to exactly SEQUENCE_WINDOW_SIZE entries.

    Longer sequences keep only their last SEQUENCE_WINDOW_SIZE entries.
    Shorter ones are padded at the front (tf.pad's default zero fill) so the
    original entries stay at the end. A sequence that already has the right
    length is returned unchanged.
    """
    missing = SEQUENCE_WINDOW_SIZE - len(sequence)
    if missing < 0:
        # Too long: keep the most recent window.
        return sequence[-SEQUENCE_WINDOW_SIZE:]
    if missing > 0:
        # Too short: pad on the left only.
        return tf.pad(sequence, [[missing, 0]])
    return sequence


def generate_training_samples():
    """Yield (context, next_token) pairs for next-token prediction.

    For every tokenized text, each token becomes a target whose context is the
    tokens that precede it; one final sample pairs the whole text with
    END_OF_SEQUENCE_TOKEN_INDEX. Every context is passed through
    truncate_and_pad_sequence before being yielded.
    """
    for token_indices in whole_text_tokenized_dataset:
        token_count = len(token_indices)
        for position in range(token_count + 1):
            if position < token_count:
                target = token_indices[position]
            else:
                # Past the last token: the target is the end-of-sequence marker.
                target = END_OF_SEQUENCE_TOKEN_INDEX
            yield truncate_and_pad_sequence(token_indices[:position]), target


# Expose the Python generator's (context, next_token) pairs as a tf.data pipeline.
training_dataset = tf.data.Dataset.from_generator(
    generate_training_samples,
    output_signature=(
        tf.TensorSpec((SEQUENCE_WINDOW_SIZE,), tf.int64),  # context window
        tf.TensorSpec((), tf.int64),  # token to predict
    ),
)

# Model input: one window of token ids per example.
input_layer = tf.keras.Input(dtype=tf.int64, shape=(SEQUENCE_WINDOW_SIZE,))

# 31 learned channels per token; append_index concatenates one more channel on
# the last axis afterwards (32 in total).
word_embedding_layer = tf.keras.layers.Embedding(
    output_dim=31,  # smallest common word embedding dimensionality
    input_dim=vocabulary_size,
)


def append_index(inputs):
    """Concatenate a normalized position channel onto the last axis of `inputs`.

    The channel holds position / SEQUENCE_WINDOW_SIZE for each of the
    SEQUENCE_WINDOW_SIZE positions and is repeated for every batch entry.
    """
    # Positions 0 .. SEQUENCE_WINDOW_SIZE - 1 scaled into [0, 1).
    normalized_positions = (
        tf.range(SEQUENCE_WINDOW_SIZE, dtype=tf.float32) / SEQUENCE_WINDOW_SIZE
    )
    # Add leading and trailing axes, then tile along axis 0 by the dynamic
    # batch size.
    position_channel = tf.tile(
        normalized_positions[tf.newaxis, :, tf.newaxis],
        [tf.shape(inputs)[0], 1, 1],
    )
    return tf.concat([inputs, position_channel], axis=-1)


def attention_layer(input):
    """Apply tf.keras.layers.Attention to three projections of `input`.

    Query, key and value each come from their own processor_layer call on the
    same input, so each one gets its own freshly created Dense layers.
    """
    query, key, value = (processor_layer(input) for _ in range(3))
    return tf.keras.layers.Attention()([query, key, value])


def processor_layer(inputs, units=16, levels=8):
    """Stack `levels` ReLU Dense layers, concatenating each output onto its input.

    Every level creates a new Dense layer with `units` outputs, applies it to
    everything accumulated so far and appends the result on the last axis, so
    the last dimension grows by `units` per level.
    """
    features = inputs
    for _ in range(levels):
        dense_output = tf.keras.layers.Dense(units, activation=tf.nn.relu)(features)
        features = tf.concat([features, dense_output], axis=-1)
    return features


# Output head: one softmax probability per vocabulary entry (the extra
# end-of-sequence id included, since vocabulary_size already counts it).
token_selector_layer = tf.keras.layers.Dense(vocabulary_size, activation=tf.nn.softmax)

# Assemble the network step by step on top of the token-id input.
layer = input_layer
layer = word_embedding_layer(layer)
layer = append_index(layer)
layer = attention_layer(layer)
layer = attention_layer(layer)
layer = layer[..., -1, :]  # keep only the last position of the window
layer = processor_layer(layer)
layer = processor_layer(layer)
layer = token_selector_layer(layer)

output_layer = layer


model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Sparse loss/metric: targets are integer token ids, and the model already
# outputs probabilities (softmax), so the loss default of non-logit inputs fits.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

model.summary()

history = model.fit(
    # Batch first and prefetch last, so complete batches (not single samples)
    # are prepared in the background while the previous batch is training.
    training_dataset.batch(64).prefetch(tf.data.AUTOTUNE),
    epochs=10,
)

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 embedding_8 (Embedding)     (None, 64, 31)               369892    ['input_9[0][0]']             
                                                                                                  
 tf.compat.v1.shape_8 (TFOp  (3,)                         0         ['embedding_8[0][0]']         
 Lambda)                                                                                          
                                                                                                  
 tf.__operators__.getitem_1  ()                           0         ['tf.compat.v1.shape_8[0

2024-02-21 16:38:38.276643: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
def text_completion(input_text, max_inferred_tokens=10):
    """Greedily extend `input_text` with tokens predicted by `model`.

    Args:
        input_text: Prompt string; it is tokenized with `text_vectorizer`.
        max_inferred_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple `(input_text, output_text, generated_token_count)` where
        `output_text` is the generated tokens joined with single spaces.
        Generation stops early as soon as the most probable next token is
        END_OF_SEQUENCE_TOKEN_INDEX.
    """
    input_token_indices = text_vectorizer(input_text)
    # Fetch the vocabulary once instead of rebuilding it for every token.
    vocabulary = text_vectorizer.get_vocabulary()
    generated_token_indices = []
    while len(generated_token_indices) < max_inferred_tokens:
        # Context = prompt tokens followed by everything generated so far,
        # fitted to the model's fixed window size.
        model_input = truncate_and_pad_sequence(
            tf.concat(
                [
                    input_token_indices,
                    tf.constant(generated_token_indices, dtype=tf.int64),
                ],
                axis=0,
            )
        )
        # Call the model directly rather than model.predict(): for a single
        # small batch inside a loop this avoids predict()'s per-call overhead
        # and its progress output.
        token_indices_probabilities = model(
            model_input[tf.newaxis, :], training=False
        )[0]
        # Greedy decoding: always take the most probable token.
        next_token_index = int(tf.argmax(token_indices_probabilities))
        if next_token_index == END_OF_SEQUENCE_TOKEN_INDEX:
            break
        generated_token_indices.append(next_token_index)
    output_text = " ".join(
        vocabulary[token_index] for token_index in generated_token_indices
    )
    return (input_text, output_text, len(generated_token_indices))

In [31]:
# Smoke test: complete a one-word prompt with the trained model and print the
# (prompt, completion, generated_token_count) tuple.
print(text_completion("american"))

('american', 'the the the the the the the the the the', 10)
