In [33]:
# code written following the TensorFlow text generation tutorial: https://www.tensorflow.org/tutorials/text/text_generation

import tensorflow as tf

In [2]:
import keras

Using TensorFlow backend.


In [3]:
import pandas

In [4]:
import sklearn

In [5]:
import matplotlib

In [6]:
print ("TensorFlow version: " + tf.__version__)

TensorFlow version: 2.0.0


In [7]:
import numpy as np
import os
import time

In [8]:
# Location of the training corpus.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to edit it.
path = "/Users/Charlie/Desktop/input.txt"
# Read raw bytes and decode explicitly as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(path, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(text)))

Length of text: 234825 characters


In [9]:
print(text[:250])

A hundred years ago

there were one and a half billion people on Earth.

Now, over six billion crowd our fragile planet.

But even so, there are still places barely touched by humanity.

This series will take to the last wildernesses

and show you th


In [10]:
# The vocabulary is the sorted list of distinct characters found in the corpus.
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

77 unique characters


In [11]:
# Two-way lookup between characters and integer ids: a dict for char -> id,
# and an array (indexable by id) for id -> char. Ids follow the sorted order
# of `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as one array of character ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [12]:
# Preview the first 20 character -> id pairs of the mapping.
# The keys are sliced directly rather than zipped against range(20) with a
# throwaway `_` target: assigning `_` at notebook scope shadows the variable
# IPython uses for the most recent cell output.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '%' :   4,
  "'" :   5,
  '(' :   6,
  ')' :   7,
  ',' :   8,
  '-' :   9,
  '.' :  10,
  '0' :  11,
  '1' :  12,
  '2' :  13,
  '3' :  14,
  '4' :  15,
  '5' :  16,
  '6' :  17,
  '7' :  18,
  '8' :  19,
  ...
}


In [13]:
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'A hundred yea' ---- characters mapped to int ---- > [23  1 55 68 61 51 65 52 51  1 72 52 48]


In [14]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Chunks are seq_length+1 characters long (see the batching cell below), so
# this is the number of whole chunks the corpus yields.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
# Start from a dataset that emits the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five elements.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

A
 
h
u
n


In [15]:
# Group the character stream into fixed-size chunks of seq_length+1 ids.
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Sanity check: decode the first five chunks back to text.
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'A hundred years ago\n\nthere were one and a half billion people on Earth.\n\nNow, over six billion crowd '
'our fragile planet.\n\nBut even so, there are still places barely touched by humanity.\n\nThis series wil'
'l take to the last wildernesses\n\nand show you the planet and its wildlife\n\nas you have never seen the'
'm before.\n\nImagine our world without sun.\n\nMale Emperor Penguins are facing the nearest that exists o'
"n planet Earth -\n\nwinter in Antarctica.\n\nIt's continuously dark\n\nand temperatures drop to minus seven"


In [16]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[k] is the element that follows
    input[k] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [17]:
# Show the first (input, target) pair decoded back to text. The loop variables
# remain bound afterwards and are read again by the following cell.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'A hundred years ago\n\nthere were one and a half billion people on Earth.\n\nNow, over six billion crowd'
Target data: ' hundred years ago\n\nthere were one and a half billion people on Earth.\n\nNow, over six billion crowd '


In [18]:
# Walk through the first five timesteps of the example pair from the previous
# cell: at each step the model receives `input_idx` and is expected to predict
# `target_idx`, i.e. the next character.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 23 ('A')
  expected output: 1 (' ')
Step    1
  input: 1 (' ')
  expected output: 55 ('h')
Step    2
  input: 55 ('h')
  expected output: 68 ('u')
Step    3
  input: 68 ('u')
  expected output: 61 ('n')
Step    4
  input: 61 ('n')
  expected output: 51 ('d')


In [19]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Size of the buffer that shuffle() draws from.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-wrapping
# `dataset` in place, so that re-running this cell does not batch data that
# has already been batched. drop_remainder=True keeps every batch at exactly
# BATCH_SIZE sequences.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [20]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [21]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct character ids; used as the embedding input
      size and as the width of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch dimension of the model input; the sequence
      dimension is left as None.

  Returns:
    A tf.keras.Sequential model. The GRU is stateful and returns an output
    for every timestep; the Dense layer has no activation.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [22]:
# Training-time model, with the batch dimension fixed to BATCH_SIZE.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [23]:
# Smoke test: push one batch through the untrained model and check the output
# shape. The loop variables and `example_batch_predictions` stay bound and are
# reused by the sampling and loss cells further down.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 77) # (batch_size, sequence_length, vocab_size)


In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           19712     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 77)            78925     
Total params: 4,036,941
Trainable params: 4,036,941
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Draw one character id per timestep for the first sequence in the batch by
# sampling from the model's output, rather than taking the highest-scoring id.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array of ids.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [26]:
sampled_indices

array([ 5, 64, 72, 18, 34, 73, 73, 48,  6, 71, 38, 76, 37, 26, 60,  3,  7,
       56, 10, 34, 20,  4, 19, 53, 14, 30, 72, 37, 54, 68,  8, 57, 24, 45,
       54, 35, 67, 11, 31, 31, 13,  8, 73, 26, 37, 69, 14, 28,  2, 57, 34,
       13, 38, 36, 46, 68, 69, 56, 50, 29, 64, 50, 48, 46, 74, 12,  0, 25,
       18,  1, 63, 55, 22, 68, 37, 63, 41,  7, 34, 30, 57, 33, 47, 51, 69,
        4, 11, 62, 58, 15, 76, 24, 21, 59, 11, 42, 27, 54, 14, 55])

In [27]:
# Decode the first input sequence and the ids sampled for it back to text.
# The model is still untrained at this point, so the sampled text is noise.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 "musk oxen,\n\nwhose entourage grows throughout the day.\n\nThis odd assembly of vegetarians\ndoesn't go u"

Next Char Predictions: 
 '\'qy7Lzza(xP—ODm")i.L9%8f3HyOgu,jBWgMt0II2,zDOv3F!jL2PNYuvicGqcaY{1\nC7 ph?uOpS)LHjKZdv%0ok4—B:l0TEg3h'


In [28]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  `from_logits=True` is required because the model's final Dense layer has no
  activation, so its outputs are unnormalised scores rather than
  probabilities.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch before any training. The result is
# reduced with mean() to a single number for display; for an untrained model
# it should sit close to ln(vocab_size), i.e. a near-uniform guess.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 77)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.3446827


In [29]:
model.compile(optimizer='adam', loss=loss)

In [30]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# "{epoch}" is a placeholder the callback fills in with the epoch number,
# giving names such as ckpt_1, ckpt_2, ...
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) during training; the
# weights are later loaded into a freshly built model for generation.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [31]:
EPOCHS=45

In [32]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45


In [34]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_45'

In [35]:
# Rebuild the same architecture with batch_size=1 for generation: the model's
# batch dimension is fixed when it is built, and training used BATCH_SIZE.
# NOTE: this rebinds `model`, replacing the training model in the namespace.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Build with a batch of one and an unspecified sequence length.
model.build(tf.TensorShape([1, None]))

In [36]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            19712     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 77)             78925     
Total params: 4,036,941
Trainable params: 4,036,941
Non-trainable params: 0
_________________________________________________________________


In [37]:

def generate_text(model, start_string, num_generate=3000, temperature=0.6):
  """Generate text with the trained model, one character at a time.

  The seed string is fed to the model first; after that, each sampled
  character is fed back as the next input while the model's recurrent state
  carries the context forward.

  Args:
    model: Model built with a batch size of 1 (see the batch dimension added
      to the input below) that exposes `reset_states()`.
    start_string: Non-empty seed text. Every character must be a key of the
      notebook-level `char2idx` mapping.
    num_generate: Number of characters to generate after the seed.
    temperature: Positive divisor applied to the model outputs before
      sampling. Low temperatures result in more predictable text, higher
      temperatures in more surprising text.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Fail early with a clear message instead of an opaque indexing or
  # division error from inside the sampling loop.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # Converting our start string to numbers (vectorizing), then adding the
  # batch dimension the model expects.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined once at the end.
  text_generated = []

  # Here batch size == 1
  # Clear any recurrent state left over from a previous call.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model;
    # only the sample for the last timestep is kept
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [38]:
print(generate_text(model, start_string=u"Here "))

Here is now hidden blizzards,

these extraordinary forests
spring up throughout the winter bats,

as this thermal image shows.

To the snakes, the bats are
apparently glowing ...

and that's enough to see sheer continent
with blizzard.

It's a remarkable skill and one
we still do not fully understand.

The flock stay in the the richest places on Earth.

It's far are home
to the sea ice.

The decade us the limits of the longest.

Unable to go the distance,

his swim,
the biggest conditions on Earth.

To reach their over a mile deep.

The eeriat has such powerful
eroded reason so many...

...from far and wide.

They've come to make the most of the brief Antarctic summer.

But one creature is just arriving.

Every winter, emperor penguins
leave the sea and emerging from the sea floor,
are taller than Earth.

These are the cubs are forced
to leave the safety of the thorns.

But the hawks have a tactic
to flush their prey into the open.

And these spires are permanent residents,
but they ha