<a href="https://colab.research.google.com/github/dominiksakic/generative_ai/blob/main/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download data
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2025-06-01 12:09:56--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-06-01 12:10:09 (6.54 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization

# Prepare data
def _strip_line_breaks(text_batch):
  """Replace every "<br />" tag in a batch of strings with a single space."""
  return tf.strings.regex_replace(text_batch, "<br />", " ")


# Tokenizer settings: every text is cut or zero-padded to `sequence_length`
# token ids, drawn from a vocabulary of at most `vocab_size` entries.
sequence_length = 100
vocab_size = 15000

# Unlabeled batches of raw text read from every file found under ./aclImdb.
dataset = keras.utils.text_dataset_from_directory(
    directory="aclImdb",
    label_mode=None,
    batch_size=256,
)
dataset = dataset.map(_strip_line_breaks)

text_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the cleaned corpus.
text_vectorization.adapt(dataset)

Found 100006 files.


In [None]:
# Set up language dataset
def prepare_lm_dataset(text_batch):
  """Turn a batch of raw strings into (input, target) token-id sequences.

  The batch is tokenized with the module-level `text_vectorization` layer.
  The targets are the same sequences shifted one step to the left, so each
  input position is paired with the token that follows it.
  """
  token_ids = text_vectorization(text_batch)
  inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
  return inputs, targets

lm_dataset = dataset.map(map_func=prepare_lm_dataset, num_parallel_calls=4)

In [None]:
from tensorflow.keras import layers

# Transformer
class TransformerDecoder(layers.Layer):
  """Decoder block: causal self-attention, a second attention over
  `encoder_outputs`, then a two-layer dense projection. Each stage is wrapped
  in a residual connection followed by layer normalization.
  """

  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.supports_masking = True

    # Kept on the instance so get_config() can report them.
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads

    # Both attention layers share the same hyperparameters.
    attention_kwargs = dict(num_heads=num_heads, key_dim=embed_dim)
    self.attention_1 = layers.MultiHeadAttention(**attention_kwargs)
    self.attention_2 = layers.MultiHeadAttention(**attention_kwargs)

    # Position-wise feed-forward part: widen to dense_dim, project back.
    feed_forward_layers = [
        layers.Dense(dense_dim, activation="relu"),
        layers.Dense(embed_dim),
    ]
    self.dense_proj = keras.Sequential(feed_forward_layers)

    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    return {
        **super().get_config(),
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
        "dense_dim": self.dense_dim,
    }

  def get_causal_attention_mask(self, inputs):
    """Build an int32 lower-triangular mask, repeated along the first axis.

    Entry [b, q, k] is 1 when k <= q and 0 otherwise, where both q and k run
    over the second axis of `inputs`.
    """
    dims = tf.shape(inputs)
    num_sequences, num_steps = dims[0], dims[1]
    steps = tf.range(num_steps)
    # Comparing a column of positions against a row yields the triangle.
    allowed = tf.cast(steps[:, tf.newaxis] >= steps, dtype="int32")
    allowed = tf.reshape(allowed, (1, num_steps, num_steps))
    # Repeat the single mask once per element of the first axis.
    repeats = tf.stack([num_sequences, 1, 1])
    return tf.tile(allowed, repeats)

  def call(self, inputs, encoder_outputs, mask=None):
    """Apply the block to `inputs`, attending to `encoder_outputs`.

    `mask`, when given, is combined with the causal mask and used for the
    second attention; the first attention always uses the causal mask alone.
    """
    causal_mask = self.get_causal_attention_mask(inputs)

    # Without an incoming mask the second attention runs unmasked.
    padding_mask = None
    if mask is not None:
      key_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
      padding_mask = tf.minimum(key_mask, causal_mask)

    self_attended = self.attention_1(
        query=inputs, value=inputs, key=inputs, attention_mask=causal_mask)
    self_attended = self.layernorm_1(inputs + self_attended)

    cross_attended = self.attention_2(
        query=self_attended,
        value=encoder_outputs,
        key=encoder_outputs,
        attention_mask=padding_mask)
    cross_attended = self.layernorm_2(self_attended + cross_attended)

    projected = self.dense_proj(cross_attended)
    return self.layernorm_3(cross_attended + projected)

class PositionalEmbedding(layers.Layer):
  """Embeds token ids and adds an embedding of each token's position.

  Token id 0 is treated as padding: the token embedding is created with
  `mask_zero=True` and this layer forwards the resulting mask.
  """

  def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
    super().__init__(**kwargs)
    self.supports_masking = True

    # Kept on the instance so get_config() can report them.
    self.sequence_length = sequence_length
    self.input_dim = input_dim
    self.output_dim = output_dim

    # One table indexed by token id, one indexed by position.
    self.token_embeddings = layers.Embedding(
        input_dim=input_dim, output_dim=output_dim, mask_zero=True)
    self.position_embeddings = layers.Embedding(
        input_dim=sequence_length, output_dim=output_dim)

  def compute_mask(self, inputs, mask=None):
    """Delegate mask computation to the token embedding; `mask` is ignored."""
    return self.token_embeddings.compute_mask(inputs)

  def call(self, inputs):
    """Return token embeddings plus the embeddings of positions 0..length-1."""
    num_positions = tf.shape(inputs)[-1]
    position_ids = tf.range(start=0, limit=num_positions, delta=1)
    # The position embeddings are added to the token embeddings via
    # broadcasting.
    return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    return {
        **super().get_config(),
        "output_dim": self.output_dim,
        "sequence_length": self.sequence_length,
        "input_dim": self.input_dim,
    }

In [None]:
# Transformer approach
embed_dim = 256    # width of the token/position embeddings
latent_dim = 2048  # width of the decoder's inner dense layer
num_heads = 2      # attention heads per attention layer

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int64")

embedding_layer = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
x = embedding_layer(inputs)

# The decoder receives the embedded sequence as both of its inputs.
decoder_layer = TransformerDecoder(embed_dim, latent_dim, num_heads)
x = decoder_layer(x, x)

# One softmax over the vocabulary per sequence position.
output_layer = layers.Dense(vocab_size, activation="softmax")
outputs = output_layer(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

In [None]:
import numpy as np
from tensorflow import keras

# Text generation callback
"""
Callback that generates text at a range of temperatures after every epoch.
Goal: Observe how the generated text evolves as the model begins to converge.

Method: Seed the generation with the prompt "This movie".
"""
# Lookup table from integer token id to its vocabulary string.
tokens_index = {
    token_id: token
    for token_id, token in enumerate(text_vectorization.get_vocabulary())
}


def sample_next(predictions, temperature=1.0):
  """
  Implements variable-temperature sampling from a probability distribution.

  Args:
    predictions: array-like of probabilities, one per candidate index (the
      caller in this notebook passes one vocabulary-sized row of model output).
    temperature: divisor applied to the log-probabilities before they are
      renormalized. Values below 1 sharpen the distribution, values above 1
      flatten it, and 0 returns the most probable index without sampling.

  Returns:
    The selected index, as returned by `np.argmax`.
  """
  predictions = np.asarray(predictions).astype("float64")
  if temperature == 0:
    # Limit of temperature -> 0: greedy choice instead of a division by zero.
    return np.argmax(predictions)
  # A probability of exactly 0 maps to a -inf logit and back to 0 below, so
  # the divide-by-zero warning from log(0) carries no information here.
  with np.errstate(divide="ignore"):
    logits = np.log(predictions) / temperature
  # Shift by the maximum before exponentiating. The resulting softmax is
  # mathematically unchanged, but at low temperatures the unshifted form
  # underflows every term to 0, giving 0/0 = NaN and a multinomial() error.
  exp_preds = np.exp(logits - np.max(logits))
  probabilities = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, probabilities, 1)
  return np.argmax(probas)

class TextGenerator(keras.callbacks.Callback):
  """Callback that prints text sampled from the model at the end of epochs.

  For each configured temperature, generation starts from `prompt` and appends
  `generate_length` sampled words, re-tokenizing the growing sentence with the
  module-level `text_vectorization` layer before every model call.

  Args:
    prompt: text used to seed every generated sample.
    generate_length: number of words to append to the prompt.
    model_input_length: stored on the instance; not used by this class.
    temperatures: sampling temperatures; one sample is printed per entry.
    print_freq: generate only on epochs whose 1-based number is a multiple of
      this value.
  """

  def __init__(
      self,
      prompt, # Prompt to seed
      generate_length, # How many words to generate
      model_input_length,
      temperatures=(1.,), # Range of temp to use for sampling
      print_freq=1):
    # Let the base callback set up its own state before adding ours.
    super().__init__()
    self.prompt = prompt
    self.generate_length = generate_length
    self.model_input_length = model_input_length
    self.temperatures = temperatures
    self.print_freq = print_freq
    # text_vectorization([prompt]) returns one row of token ids per input
    # string; [0] selects the row for the single prompt. Padding uses id 0, so
    # the number of non-zero ids is the prompt's length in tokens. Counting
    # (rather than looking up the first 0) also works when the prompt fills
    # the whole row and no padding is present.
    vectorized_prompt = text_vectorization([prompt])[0].numpy()
    self.prompt_length = int(np.count_nonzero(vectorized_prompt))

  def on_epoch_end(self, epoch, logs=None):
    """Print one generated sample per temperature on every `print_freq`-th epoch."""
    if (epoch + 1) % self.print_freq != 0:
      return
    for temperature in self.temperatures:
      print("== Generating with temperature", temperature)
      # Start from prompt
      sentence = self.prompt
      for _ in range(self.generate_length):
        tokenized_sentence = text_vectorization([sentence])
        # Feed current sequence into model
        predictions = self.model(tokenized_sentence)
        # Read the prediction at the last real (non-padding) token. Deriving
        # the position from the current token count keeps it correct even if
        # an appended word did not add a token (e.g. the empty padding token
        # was sampled), and keeps it in range once the text fills the row.
        num_tokens = int(np.count_nonzero(tokenized_sentence[0].numpy()))
        last_position = max(num_tokens - 1, 0)
        # Pass the loop's temperature; otherwise every sample uses 1.0.
        next_token = sample_next(
            predictions[0, last_position, :], temperature
        )
        sampled_token = tokens_index[next_token]
        # Append the new word to the current sequence
        sentence += " " + sampled_token
      print(sentence)

# Seed text and sampling temperatures for the samples printed during training.
prompt = "This movie"
text_gen_callback = TextGenerator(
    prompt=prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1., 1.5),
)

In [None]:
# Fit the model; the callback prints generated samples as training progresses.
model.fit(
    x=lm_dataset,
    epochs=200,
    callbacks=[text_gen_callback],
)

Epoch 1/200
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387ms/step - loss: 6.3879== Generating with temperature 0.2
This movie is pretty weak plot nicely from the rip off theres a has cute movie right sniper way to possibly an interesting to figure towards the actor as money a band foolish life day is panther not sure the same [UNK] in the elevator and his parents theirs and all i
== Generating with temperature 0.5
This movie is the cinema kills a series character was mostly entirely around him this is one hayek tragic and his [UNK] they are good style they might have blame her mark in pamela loose but this list with those chanting aside from the devil series having to be wonder what properly
== Generating with temperature 0.7
This movie has to save this however is one conventions rash very candidate which pearls tragic film wasnt quaint rubbish the story however at all great reason why didnt werent like the bad theaters i suspect something else on cable and this 