# Text generator

<a href="https://colab.research.google.com/github/fmcooper/text-generator/blob/master/TextGenerator.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

---
Based on https://www.tensorflow.org/tutorials/sequences/text_generation

In [0]:
# !pip install tensorflow-gpu==2.0.0-alpha0
# !pip install --upgrade numpy
# !pip install --upgrade matplotlib

In [0]:
import sys
import os
import math
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds

print("\n---------- versions ----------\n")
print("python version: " + sys.version)
print("numpy version: " + np.__version__)
print("matplotlib version: " + mpl.__version__)
print("tensorflow version: " + tf.__version__)
print()

tf.enable_eager_execution()

from google.colab import drive
drive.mount('/content/gdrive')

NUM_TESTING = 10000
TESTING = False
EMBEDDING_DIMS = 256
RNN_UNITS = 1024
NUM_EPOCHS = 3
BATCH_SIZE = 64
BUFFER_SIZE = 10000
CHECKPOINT_DIR = F'/content/gdrive/My Drive/Colab/text-generator/checkpoints/' + "epochs" + str(NUM_EPOCHS) + "_batchsize" + str(BATCH_SIZE) + "_embeddingdims" + str(EMBEDDING_DIMS) + "_rnnunits" + str(RNN_UNITS) + "/"     # directory checkpoint weights of model are saved
ENTIRE_SAVE_PATH = CHECKPOINT_DIR + 'trained_entire_model.h5'   # file where entire model is saved


---------- versions ----------

python version: 3.6.7 (default, Oct 22 2018, 11:32:17) 
[GCC 8.2.0]
numpy version: 1.14.6
matplotlib version: 3.0.3
tensorflow version: 1.13.1

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Downloading the data

In [0]:
print("\n---------- downloading data using keras ----------\n")
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')


---------- downloading data using keras ----------



### Exploring the data

In [0]:
print("\n---------- exploring data ----------\n")
# decoding the data
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
if TESTING:
    text = text[0:NUM_TESTING]
# number of characters in the text
print('Total number characters: ' + str(len(text)))
print('First 100 characters: ')
print(text[:100])
vocab = sorted(set(text))
print('Number of unique characters in text: ' + str(len(vocab)))
print('Unique characters in text: ' + str(vocab))
vocab_size = len(vocab)


---------- exploring data ----------

Total number characters: 1115394
First 100 characters: 
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Number of unique characters in text: 65
Unique characters in text: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


### Preparing data

Creating maps between integers and text.

In [0]:
print("\n---------- preparing data ----------\n")
# mappings between indices and unique characters
indicesToChars = np.array(vocab)
charsToIndices = {}
for i, char in enumerate(vocab):
    charsToIndices[char] = i
print("character at index 20: " + str(indicesToChars[20]))
print("character H has index: " + str(charsToIndices['H']))

# save the text as integers
textAsIntegers = np.array([charsToIndices[char] for char in text])
print("First 13 characters: " + text[:13])
print("First 13 characters as integers: " + str(textAsIntegers[:13]))


---------- preparing data ----------

character at index 20: H
character H has index: 20
First 13 characters: First Citizen
First 13 characters as integers: [18 47 56 57 58  1 15 47 58 47 64 43 52]


Creating training examples and targets.

In [0]:
print("\n---------- creating training examples and targets ----------\n")
# The input to the model will be a sequence of characters, and we train the model to predict 
# the output—the following character at each time step.

# We are going to break the text into chunks of length x (input) and then predict the next character (output). 
# Our output will be the same length as the input, so as output we also include the last x characters 
# of the input.
# E.g. for x=8, the input sequence is "Harry Po" and the output sequence is "arry Pot".

# changing the data to a stream of characters
char_dataset = tf.data.Dataset.from_tensor_slices(textAsIntegers)
print("First 5 characters after slicing the data: ")
for i in char_dataset.take(5):
  print(indicesToChars[i.numpy()])

# creating the input
def int_seq_to_string(item):
    seq = ''
    for char in item:
        seq = seq + indicesToChars[char.numpy()]
    return seq

seq_length = 100
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)   # length 101 for 100 input and 1 output
print("First 5 sequences of input: ")
for item in sequences.take(5):
    print(int_seq_to_string(item).encode(encoding='utf-8'))

# creating the targets
def createTarget(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

# inputs and outputs
dataset = sequences.map(createTarget)
for input_example, target_example in  dataset.take(1):
    print("Example input and corresponding target: ")
    print ('Input data: ' + str(int_seq_to_string(input_example).encode(encoding='utf-8')))
    print ('Target data: ' + str(int_seq_to_string(target_example).encode(encoding='utf-8')))

    # Each index of these vectors are processed as one time step. For the input at time step 0, 
    # the model receives the index for "F" and trys to predict the index for "i" as the next character. 
    # At the next timestep, it does the same thing but the RNN considers the previous step context in 
    # addition to the current input character.
    for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
        print("Step {:4d}".format(i))
        print("  input: {} ({:s})".format(input_idx, repr(indicesToChars[input_idx])))
        print("  expected output: {} ({:s})".format(target_idx, repr(indicesToChars[target_idx])))


---------- creating training examples and targets ----------

First 5 characters after slicing the data: 
F
i
r
s
t
First 5 sequences of input: 
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'
Example input and corresponding target: 
Input data: b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
Step    0
  

Creating training batches.

In [0]:
# creating training batches
examples_per_epoch = len(text)
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences, 
# so it doesn't attempt to shuffle the entire sequence in memory. Instead, 
# it maintains a buffer in which it shuffles elements).
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print("dataset: " + str(dataset))
print("BATCH_SIZE: " + str(BATCH_SIZE))

dataset: <DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>
BATCH_SIZE: 64


### Building the model

In [0]:
print("\n---------- building the model ----------\n")
if tf.test.is_gpu_available():
  rnn = tf.keras.layers.CuDNNGRU
else:
  import functools
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

# building the model
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    rnn(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

model = build_model(
  vocab_size = len(vocab), 
  embedding_dim=EMBEDDING_DIMS, 
  rnn_units=RNN_UNITS,
  batch_size=BATCH_SIZE)

for input_example_batch, target_example_batch in dataset.take(1): 
  example_batch_predictions = model(input_example_batch)
  print("example_batch_predictions.shape: " + str(example_batch_predictions.shape) + "# (batch_size, sequence_length, vocab_size)")

  model.summary()


print("\n---------- test the model ----------\n")
# To get actual predictions from the model we need to sample from the output distribution, 
# to get actual character indices. This distribution is defined by the logits over the character vocabulary.

# Note: It is important to sample from this distribution as taking the argmax of the distribution 
# can easily get the model stuck in a loop.

sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1)
print(sampled_indices)
print("Input: " + str(int_seq_to_string(input_example_batch[0]).encode(encoding='utf-8')))
print("Next Char Predictions: " + str(int_seq_to_string(sampled_indices).encode(encoding='utf-8')))


print("\n---------- optimiser and loss function ----------\n")
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)") 
print("scalar_loss:      ", example_batch_loss.numpy().mean())


print("\n---------- compile the model ----------\n")
model.compile(
    optimizer='adam',
    loss = loss)


---------- building the model ----------

example_batch_predictions.shape: (64, 100, 65)# (batch_size, sequence_length, vocab_size)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (64, None, 1024)          3938304   
_________________________________________________________________
dense_2 (Dense)              (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________

---------- test the model ----------

tf.Tensor(
[ 8 51 33 27 14  7  9 20  6 34 39 22 47 55 54 35 44 29 23  0 52  3 17 30
 58 46 18 59 34 36 15 23 53 51 41 29 25 11 29 27  6 20 56  1 14 51 40  0
 40 43 42 42 17 46 26 28 17 38  9 49 39 25 35 53 36 57 59 59 42 25 

### Training the model

In [28]:
print("\n---------- training the model ----------\n")

# checkpoints
checkpoint_prefix = os.path.join(CHECKPOINT_DIR, "ckpt_{epoch}")
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)


history = model.fit(dataset.repeat(), epochs=NUM_EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])




---------- training the model ----------

Epoch 1/3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



Consider using a TensorFlow optimizer from `tf.train`.


W0325 18:12:35.400642 140143457597312 network.py:1430] This model was compiled with a Keras optimizer (<tensorflow.python.keras.optimizers.Adam object at 0x7f751ffc4710>) but is being saved in TensorFlow format with `save_weights`. The model's weights will be saved, but unlike with TensorFlow optimizers in the TensorFlow format the optimizer's state will not be saved.

Consider using a TensorFlow optimizer from `tf.train`.


Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.


W0325 18:12:36.537815 140143457597312 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/network.py:1436: update_checkpoint_state (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.


Epoch 2/3

Consider using a TensorFlow optimizer from `tf.train`.


W0325 18:54:26.509651 140143457597312 network.py:1430] This model was compiled with a Keras optimizer (<tensorflow.python.keras.optimizers.Adam object at 0x7f751ffc4710>) but is being saved in TensorFlow format with `save_weights`. The model's weights will be saved, but unlike with TensorFlow optimizers in the TensorFlow format the optimizer's state will not be saved.

Consider using a TensorFlow optimizer from `tf.train`.


Epoch 3/3

Consider using a TensorFlow optimizer from `tf.train`.


W0325 19:36:15.066997 140143457597312 network.py:1430] This model was compiled with a Keras optimizer (<tensorflow.python.keras.optimizers.Adam object at 0x7f751ffc4710>) but is being saved in TensorFlow format with `save_weights`. The model's weights will be saved, but unlike with TensorFlow optimizers in the TensorFlow format the optimizer's state will not be saved.

Consider using a TensorFlow optimizer from `tf.train`.




### Restoring from the last checkpoint

In [29]:
print("\n---------- restoring the model ----------\n")
print("Last checkpoint: " + str(tf.train.latest_checkpoint(CHECKPOINT_DIR)))

# rebuild the model
model = build_model(vocab_size, EMBEDDING_DIMS, RNN_UNITS, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(CHECKPOINT_DIR))
model.build(tf.TensorShape([1, None]))


---------- restoring the model ----------

Last checkpoint: /content/gdrive/My Drive/Colab/text-generator/checkpoints/epochs3_batchsize64_embeddingdims256_rnnunits1024/ckpt_3


In [30]:
print("\n---------- generating text ----------\n")
def generate_text(model, start_string):
  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing) 
  input_eval = [charsToIndices[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a multinomial distribution to predict the word returned by the model
      predictions = predictions / temperature
      predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
      
      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)
      
      text_generated.append(indicesToChars[predicted_id])

  return (start_string + ''.join(text_generated))


print(generate_text(model, start_string=u"ROMEO: "))

# The easiest thing you can do to improve the results it to train it for longer (try EPOCHS=30).
# You can also experiment with a different start string, or try adding another RNN layer to improve the model's accuracy, or adjusting the temperature parameter to generate more or less random predictions.
# more stuff on the 


---------- generating text ----------

Instructions for updating:
Use tf.random.categorical instead.


W0325 19:36:15.652512 140143457597312 deprecation.py:323] From <ipython-input-30-5faa2ae923d2>:27: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.random.categorical instead.


ROMEO: Hoo! a cold contents to know,
Which to death blood wrong is:
Work you are rottends on him. But, thup
I cannot speak oof do, who is't
With with their virtue to this be disitur:
And, Greatful unkness; vell noisent to que, not long,
May hereag onto conget ds.
And in the regain to Firds will a listed,
Thinks he whose tonguel and mine heart:
And weary to Norfolks, his cast
For he what I stay undert ill heaven,
So never brideg-sir.

PORLAGH:
Wear officer:
Madam, you shall have mercy: if, white,
And many ore, hast the exam'd:
How sweet man, before my cripless. Which boy of this night
We was are my place,
And respaish; And that I shows my dames,--afish five:
Now Lomy
That there was my side me so your loves:
Not not flesh and thing; whose blue of him to fold this wife and my husendershio's wife,
To an in coman, our sean to Frents?

Whereof be a ragery too:
The heavens manner on thirgh, disconturness from many forth one the trwick's,
Now, Blook their assigg'd: the hours of the subfence
To