In [15]:
# Goal: predict the next word from the words that precede it.

In [16]:
!pip install -q tensorflow-datasets tensorflow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [17]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [18]:
# Toy corpus: a single string of ten Italian ordinals, useful for quick smoke tests.
small_dataset = tf.data.Dataset.from_tensor_slices(
    [
        "primo secondo terzo quarto quinto sesto settimo ottavo nono decimo",
    ]
)

# Show the first element to confirm the dataset yields scalar string tensors.
for element in small_dataset.take(1):
    print(element)

tf.Tensor(b'primo secondo terzo quarto quinto sesto settimo ottavo nono decimo', shape=(), dtype=string)


In [19]:
import re

# Decode explicitly as UTF-8: the book contains accented Italian letters, and
# the platform default encoding (used when `encoding` is omitted) is not
# guaranteed to be UTF-8.
with open("./notebooks/il-piccolo-principe.txt", "r", encoding="utf-8") as file:
    file_text_content = file.read()

# Replace every character that is not an ASCII letter or one of the listed
# accented letters with a space (digits and punctuation are dropped).
file_text_content = re.sub(r'[^a-zA-ZàéòùìèÈ]', ' ', file_text_content)

# The whole book becomes a single dataset element.
medium_dataset = tf.data.Dataset.from_tensor_slices([file_text_content])

for element in medium_dataset.take(1):
    print(element)

tf.Tensor(b'Il piccolo principe   Antoine de Saint Exup\xc3\xa9ry    Traduttore  Franco Perini       Pubblicato        Categoria e   Narrativa  Narrativa per ragazzi  Fantastico  Fonte  Wikisource      Riguardo a de Saint Exup\xc3\xa9ry   Antoine Jean Baptiste Marie Roger de Saint Exup\xc3\xa9ry  meglio conosciuto come Antoine de Saint Exup\xc3\xa9ry  \xc3\xa8 stato uno scrittore e aviatore francese  \xc3\x88 conosciuto nel mondo per essere stato l autore del famoso romanzo Il piccolo principe  tradotto in     lingue  ma anche per i suoi racconti sul mondo dei primi voli aerei  tra i quali Volo di notte  Terra degli uomini e L aviatore  Scrittore riconosciuto  vinse vari premi letterari durante la sua vita  in Francia come all estero  Durante la seconda guerra mondiale si arruol\xc3\xb2 nell aeronautica militare francese e dopo l armistizio nelle Forces a\xc3\xa9riennes fran aises libres  dalla parte degli Alleati  La sua morte in volo  avvenuta sul finire della guerra  rest\xc3\xb2 pe

In [20]:
# Keep only the `text` field of each record of the TFDS `wikipedia` train
# split, limited to the first 200 records.
large_dataset = (
    tfds.load("wikipedia", split="train")
    .map(lambda record: record["text"])
    .take(200)
)

# Peek at the first article to confirm elements are scalar string tensors.
for element in large_dataset.take(1):
    print(element)

tf.Tensor(b'Joseph Harold Greenberg (May 28, 1915 \xe2\x80\x93 May 7, 2001) was an American linguist, known mainly for his work concerning linguistic typology and the genetic classification of languages.\n\nLife\n\nEarly life and education\nJoseph Greenberg was born on May 28, 1915, to Jewish parents in Brooklyn, New York. His first great interest was music. At the age of 14, he gave a piano concert in Steinway Hall. He continued to play the piano frequently throughout his life.\n\nAfter graduating from James Madison High School, he decided to pursue a scholarly career rather than a musical one. He enrolled at Columbia College in New York in 1932. During his senior year, he attended a class taught by Franz Boas concerning American Indian languages. He graduated in 1936 with a bachelor\'s degree. With references from Boas and Ruth Benedict, he was accepted as a graduate student by Melville J. Herskovits at Northwestern University in Chicago and graduated in 1940 with a doctorate degree.

In [21]:
# Corpus used by the rest of the notebook (vocabulary adaptation and training
# sample generation). Point this at another dataset to switch corpora.
dataset = large_dataset

In [22]:
# maybe use subword tokenizer https://www.tensorflow.org/text/guide/subwords_tokenizer

# Word-level tokenizer: lowercases, strips punctuation and maps each word to
# an integer index learned from `dataset`.
text_vectorizer = tf.keras.layers.TextVectorization(
    output_mode="int",
    standardize="lower_and_strip_punctuation",
)

text_vectorizer.adapt(dataset)

# Append an explicit end-of-sequence token. Its index equals the vocabulary
# size *before* appending, because it becomes the last entry.
END_OF_SEQUENCE_TOKEN_INDEX = text_vectorizer.vocabulary_size()
text_vectorizer.set_vocabulary(text_vectorizer.get_vocabulary() + ["[END_OF_SEQUENCE]"])

# vocabulary_size() is read after set_vocabulary(), so it already counts the
# end-of-sequence token. Adding 1 on top would create an extra class that maps
# to no vocabulary entry and cannot be decoded back to a word.
vocabulary_size = text_vectorizer.vocabulary_size()

print(text_vectorizer.get_vocabulary())

In [None]:
# Fixed number of token indices fed to the model per sample: longer contexts
# keep only their last SEQUENCE_WINDOW_SIZE tokens, shorter ones are padded.
SEQUENCE_WINDOW_SIZE = 32

In [None]:
def truncate_and_pad_sequence(sequence):
    """Fit a 1-D token sequence into exactly SEQUENCE_WINDOW_SIZE entries.

    Only the trailing SEQUENCE_WINDOW_SIZE tokens are kept; if fewer remain,
    the front of the sequence is padded (tf.pad's default fill value) so the
    most recent token always sits in the last position.

    Args:
        sequence: Rank-1 tensor of token indices (rank is asserted).

    Returns:
        Rank-1 tensor of length SEQUENCE_WINDOW_SIZE.
    """
    assert len(sequence.shape) == 1
    window = sequence[-SEQUENCE_WINDOW_SIZE:]
    missing_count = max(0, SEQUENCE_WINDOW_SIZE - window.shape[0])
    # Paddings are [[before, after]]: pad only at the front.
    return tf.pad(window, [[missing_count, 0]])

In [None]:
def generate_training_samples():
    """Yield (context window, next token index) pairs for every text in `dataset`.

    For each tokenized text, one sample is produced per token: the context is
    everything before that token (fitted to the window by
    `truncate_and_pad_sequence`) and the target is the token itself. A final
    sample per text maps the full text to END_OF_SEQUENCE_TOKEN_INDEX.
    """
    for token_indices in dataset.map(text_vectorizer):
        token_count = len(token_indices)
        for position in range(token_count):
            context_window = truncate_and_pad_sequence(token_indices[:position])
            target_token = token_indices[position]
            yield (context_window, target_token)

        # After the last real token, the expected prediction is "stop".
        closing_window = truncate_and_pad_sequence(token_indices)
        yield (closing_window, END_OF_SEQUENCE_TOKEN_INDEX)


# Wrap the Python generator as a tf.data pipeline. Each element is a pair:
# a window of SEQUENCE_WINDOW_SIZE int64 token indices and a scalar int64
# target index. cache() stores the generated elements for later passes.
training_dataset = tf.data.Dataset.from_generator(
    generator=generate_training_samples,
    output_signature=(
        tf.TensorSpec((SEQUENCE_WINDOW_SIZE,), tf.int64),
        tf.TensorSpec((), tf.int64),
    ),
).cache()

# Sanity check: decode the first 50 samples back into words.
# The vocabulary list is fetched once up front; get_vocabulary() exports the
# whole table on every call, so calling it per token is needlessly slow.
# The loop variables also avoid shadowing the builtin `input`.
vocabulary_tokens = text_vectorizer.get_vocabulary()
for context_indices, target_index in training_dataset.take(50):
    print(([vocabulary_tokens[token_index] for token_index in context_indices], vocabulary_tokens[target_index]))

In [None]:
# One sample is a fixed-length window of token indices.
# `shape` must be a tuple: `(SEQUENCE_WINDOW_SIZE)` without the trailing comma
# is just a parenthesised int, which not every Keras version accepts.
input_layer = tf.keras.Input(shape=(SEQUENCE_WINDOW_SIZE,), dtype=tf.int64)

# Learned vector per vocabulary index.
# NOTE(review): 31 dims + the single position channel concatenated by
# positional_encoding_layer gives 32 features per token - presumably the
# reason for 31; confirm.
word_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocabulary_size,
    output_dim=31,  # smallest common word embedding dimensionality
)

def positional_encoding_layer(inputs):
    """Append a normalized position channel to every timestep of `inputs`.

    The position of each of the SEQUENCE_WINDOW_SIZE timesteps is encoded as
    `index / SEQUENCE_WINDOW_SIZE` (values in [0, 1)) and concatenated as one
    extra feature on the last axis.

    Args:
        inputs: Float tensor whose axis 0 is the batch and axis 1 has
            SEQUENCE_WINDOW_SIZE entries (rank 3, as required by the tile
            multiples and the concat).

    Returns:
        Tensor like `inputs` with the last axis grown by one.
    """
    positions = tf.range(SEQUENCE_WINDOW_SIZE, dtype=tf.float32) / SEQUENCE_WINDOW_SIZE
    # Shape (1, window, 1) so it can be tiled across the batch.
    positions = positions[tf.newaxis, :, tf.newaxis]
    # Take the batch size from the argument itself. The previous version read
    # the notebook-global `layer`, which only worked because that global
    # happened to be the same tensor at call time.
    batch_size = tf.shape(inputs)[0]
    return tf.concat(
        [inputs, tf.tile(positions, multiples=[batch_size, 1, 1])],
        axis=-1,
    )

def processing_layer(layer, levels = 4):
    """Stack `levels` densely-connected ReLU blocks on top of `layer`.

    Each level applies a fresh 32-unit Dense layer to everything accumulated
    so far and concatenates its output onto the last axis, so the feature
    width grows by 32 per level.

    Args:
        layer: Input tensor.
        levels: Number of Dense blocks to add.

    Returns:
        The input concatenated with the outputs of all added blocks.
    """
    features = layer
    for _ in range(levels):
        dense_output = tf.keras.layers.Dense(32, activation=tf.nn.relu)(features)
        features = tf.concat([features, dense_output], axis=-1)
    return features

def attention_layer(layer):
    """Run `processing_layer` on `layer`, then average over the second-to-last axis.

    Despite the name, no attention weights are computed: the reduction is a
    plain mean over axis -2.
    """
    processed = processing_layer(layer)
    return tf.reduce_mean(processed, axis=-2)

# Output head: one softmax probability per vocabulary index.
token_selector_layer = tf.keras.layers.Dense(vocabulary_size, activation=tf.nn.softmax)

# Wire the network. `layer` is rebound at every step and each step consumes
# the previous value, so the statement order below matters.
layer = input_layer
layer = word_embedding_layer(layer)
# Embedding of the final window position (the most recent token), kept aside
# so it can be concatenated back in after the sequence has been averaged.
last = layer[..., -1, :]
layer = positional_encoding_layer(layer)
layer = attention_layer(layer)
layer = tf.concat([layer, last], axis=-1)
layer = processing_layer(layer)
layer = token_selector_layer(layer)

output_layer = layer

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Targets are integer token indices and the model outputs probabilities, so
# the sparse categorical loss and accuracy are used.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

model.summary()

# Batch first, prefetch last: prefetching whole batches lets the input
# pipeline assemble the next batch while the current one is being trained on.
# (Prefetching single elements before batching left the batching step on the
# critical path.) The batches produced are identical either way.
history = model.fit(
    training_dataset.batch(64).prefetch(tf.data.AUTOTUNE),
    epochs=10,
)

In [None]:
def text_completion(input_text, max_inferred_tokens=50):
    """Greedily extend `input_text` one token at a time using `model`.

    At each step the prompt plus everything generated so far is fitted to the
    model window, the most probable next token is selected, and generation
    stops at the end-of-sequence token or after `max_inferred_tokens` tokens.

    Args:
        input_text: Prompt string; tokenized with `text_vectorizer`.
        max_inferred_tokens: Upper bound on the number of generated tokens.

    Returns:
        Tuple `(input_text, output_text, generated_token_count)` where
        `output_text` is the generated tokens joined by single spaces.
    """
    # Fetch the vocabulary once instead of re-exporting it for every token.
    vocabulary = text_vectorizer.get_vocabulary()
    input_token_indices = text_vectorizer(input_text)
    output_token_indices = tf.constant([], dtype=tf.int64)
    while len(output_token_indices) < max_inferred_tokens:
        model_input = truncate_and_pad_sequence(
            tf.concat([input_token_indices, output_token_indices], axis=0)
        )
        # Call the model directly: for a single-sample batch inside a loop
        # this avoids the per-call overhead (and progress bar) of predict().
        token_indices_probabilities = model(model_input[tf.newaxis, :], training=False)[0]
        next_token_index = tf.argmax(token_indices_probabilities)
        if next_token_index == END_OF_SEQUENCE_TOKEN_INDEX:
            break
        output_token_indices = tf.concat(
            [output_token_indices, [next_token_index]], axis=0
        )
    output_text = " ".join(vocabulary[token_index] for token_index in output_token_indices)
    return (input_text, output_text, len(output_token_indices))

In [None]:
# Smoke test: complete a short English prompt and show the
# (prompt, completion, generated token count) tuple.
print(text_completion(input_text="she believed"))