In [1]:
import tensorflow as tf

import numpy as np
import os
import time

# Importing stories
path_to_file = 'stories.txt'
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

# Printing Outputs
print(f'Length of text: {len(text)} characters')
print(text[:250])
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Length of text: 94504 characters
The Four Creations
The world at first was endless space in which existed only the Creator, Taiowa. This world had no time, no shape, and no life, except in the mind of the Creator. Eventually the infinite creator created the finite in Sotuknang, who
73 unique characters


In [4]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
chars = chars_from_ids(ids)
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [5]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [6]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(94504,), dtype=int64, numpy=array([37, 49, 46, ...,  1,  2,  1], dtype=int64)>

In [7]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [8]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

T
h
e
 
F
o
u
r
 
C


In [9]:
seq_length = 100

In [10]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'T' b'h' b'e' b' ' b'F' b'o' b'u' b'r' b' ' b'C' b'r' b'e' b'a' b't'
 b'i' b'o' b'n' b's' b'\r' b'\n' b'T' b'h' b'e' b' ' b'w' b'o' b'r' b'l'
 b'd' b' ' b'a' b't' b' ' b'f' b'i' b'r' b's' b't' b' ' b'w' b'a' b's'
 b' ' b'e' b'n' b'd' b'l' b'e' b's' b's' b' ' b's' b'p' b'a' b'c' b'e'
 b' ' b'i' b'n' b' ' b'w' b'h' b'i' b'c' b'h' b' ' b'e' b'x' b'i' b's'
 b't' b'e' b'd' b' ' b'o' b'n' b'l' b'y' b' ' b't' b'h' b'e' b' ' b'C'
 b'r' b'e' b'a' b't' b'o' b'r' b',' b' ' b'T' b'a' b'i' b'o' b'w' b'a'
 b'.' b' ' b'T'], shape=(101,), dtype=string)


In [11]:
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'The Four Creations\r\nThe world at first was endless space in which existed only the Creator, Taiowa. T'
b'his world had no time, no shape, and no life, except in the mind of the Creator. Eventually the infin'
b'ite creator created the finite in Sotuknang, whom he called his nephew and whom he created as his age'
b'nt to establish nine universes. Sotuknang gathered together matter from the endless space to make the'
b' nine solid worlds. Then the Creator instructed him to gather together the waters from the endless sp'


In [12]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [13]:
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [14]:
dataset = sequences.map(split_input_target)

In [15]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'The Four Creations\r\nThe world at first was endless space in which existed only the Creator, Taiowa. '
Target: b'he Four Creations\r\nThe world at first was endless space in which existed only the Creator, Taiowa. T'


In [16]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [33]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 2048


class CustomTraining(MyModel):
  @tf.function
  def train_step(self, inputs):
      inputs, labels = inputs
      with tf.GradientTape() as tape:
          predictions = self(inputs, training=True)
          loss = self.loss(labels, predictions)
      grads = tape.gradient(loss, model.trainable_variables)
      self.optimizer.apply_gradients(zip(grads, model.trainable_variables))

      return {'loss': loss}

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    x, states = self.gru(x, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

model = CustomTraining(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [34]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")



(64, 100, 74) # (batch_size, sequence_length, vocab_size)


In [35]:
model.summary()

In [36]:
model.compile(optimizer = tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

In [41]:
model.fit(dataset, epochs=100)

Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - loss: 1.6848
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.6236
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - loss: 1.5673
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - loss: 1.5051
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - loss: 1.4481
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.3865
Epoch 7/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.3308
Epoch 8/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.2649
Epoch 9/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.2063
Epoch 10/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 2s/step - loss: 1.1498

<keras.src.callbacks.history.History at 0x1cb3f982ad0>

In [64]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.2):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [65]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [66]:
start = time.time()
states = None
next_char = tf.constant(['Before man...'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Before man...
Malepid. Fort, bro antorngrsclats Maizeoo weveaneatheaw de cavevigrowevend ldy tizenigre d st wee taproug anghe fes. sowhoo; adEpothororonapunarks bougr ts thetchey ste r ingame the amad therapupathothin s chtho asema am d s apio anam
wan theiors h, booro taulond top GN
no Cre, sK3-mus bone revean he, ngrathaditisece ly. an-coorta Fremave k llaigetor cowh by suler soohefug ss Spe, the f y gad Sto ongeto Dratind her Mamedrogran t an TOpla.
Izeand. n seve banofemorghon his s g ais Decedere t istheinda he ganthe wende ap tsid. tizeses hlld ndreistheve f nd d, he, Hatid alotraveapAghe tothinot herak-pant tsif tithess. fftolaw bey nd, ged rspld
waizeiranwem
Af tosped Tatr ano s by. t icaverscisounith to, h ans Scepanirs beizladrecepy thighe hil thif Thilaplan d If t Mordrucla hthow. Marsuntsmathe g angumanissinof tist alatopire che talamat tidigrdrd ksoumioun t e, gendave t, thepr shesmad.
 he cis owa.Sptuf he swead tooubow andle scowed.
 Thestizy wevethentheche y. angreishema