In [1]:
!pip install -q tf-nightly

In [2]:
import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time

# Importing the Dataset

In [3]:
def import_text(data_path):
    text = open(data_path).readlines()
    # Convert our text to all lowercase
    # text = text.lower()
    
    #print("Our text has {} characters:".format(len(text)))
    # Show first 100 characters
    
    
    text = ["{} \n".format(line[:-1]) for line in text]
    text = " ".join(text)
    print(text[:10] + "...")
    
    return text

In [4]:
# Get the filepath and load the text in
training_data_path = './data2/training_data.txt'
validation_data_path = './data2/validation_data.txt'

training_text = import_text(training_data_path)
validation_text = import_text(validation_data_path)

Roses are ...
I love you...


In [5]:
training_text_split = training_text.split(' ')
validation_text_split = validation_text.split(' ')

In [6]:
# Get all unique characters from our file
vocab = sorted(set(training_text_split + validation_text_split))
print('We have {} unique words in our dataset:'.format(len(vocab)))
print(vocab)

We have 1703 unique words in our dataset:


In [7]:
# Make a dictionary between each unique characters and an 
# index (e.g. 'a' maps to 1, 'b' maps to 2, etc)
word2idx = {word:index for index, word in enumerate(vocab)}
idx2word = np.array(vocab)

training_text_as_idx = np.array([word2idx[w] for w in training_text_split])
validation_text_as_idx = np.array([word2idx[w] for w in validation_text_split])

In [8]:
# The maximum length sentence (number of characters) we want 
# for a single input of data in our model.
# The bigger it is, the more evidence our model has
seq_length = 16

# Number of batches before we finish 1 epoch (training on all data once)
examples_per_epoch = len(training_text) // seq_length

print(examples_per_epoch)

2518


In [9]:
# Each input is 128 characters (as defined in seq_length). But we can train in batches of multiple inputs. 
# BATCH_SIZE = how many inputs to train on at once

BATCH_SIZE = 16
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences, 
# so it doesn't attempt to shuffle the entire sequence in memory. Instead, 
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 100

In [10]:
# Machine learning models predict things by training on data.
# In our case, given a bunch of characters, we try to predict the 
# next letter to match our training data.
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text


In [13]:
def gen_dataset(text_as_idx):
    # Make a tensorflow dataset
    word_dataset = tf.data.Dataset.from_tensor_slices(text_as_idx)

    # Split the data into batches
    sequences = word_dataset.batch(seq_length+1, drop_remainder=True)
    for item in sequences.take(1):
        print(repr(' '.join(idx2word[item.numpy()])))


    # We split the dataset into evidence (characters we know) and 
    # targets (the next character to predict) using the function defined above
    dataset = sequences.map(split_input_target)
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

    return dataset

In [14]:
training_dataset = gen_dataset(training_text_as_idx)
validation_dataset = gen_dataset(validation_text_as_idx)

'Roses are red \n Violets are blue \n I hope you love me \n Cause I love'
"I love you like no other \n And even if you're not here to see \n Across"


# Modelling

In [15]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension 
embedding_dim = 1024

# Number of RNN units
rnn_units = 2048

DROPOUT_PROB=.2

In [16]:
# Don't worry about it. Basically, if we're using a graphics card, we can apply special optimizations.

In [17]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                            return_sequences=True,
                            recurrent_initializer='glorot_uniform',
                            activation='relu',
                            stateful=True, 
                            recurrent_activation='hard_sigmoid',
                            use_bias=True,
                            dropout=DROPOUT_PROB,
                            recurrent_dropout=DROPOUT_PROB
                           ),
        tf.keras.layers.Dense(128, activation='relu', use_bias=True),
        tf.keras.layers.Dropout(DROPOUT_PROB),
        tf.keras.layers.Dense(vocab_size, use_bias=True)
    ])
    
    return model

In [18]:
model = build_model(
    vocab_size = len(vocab),
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [19]:
for input_example_batch, target_example_batch in validation_dataset.take(1): 
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(16, 16, 1703) # (batch_size, sequence_length, vocab_size)


In [20]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (16, None, 1024)          1743872   
_________________________________________________________________
gru (GRU)                    (16, None, 2048)          18880512  
_________________________________________________________________
dense (Dense)                (16, None, 128)           262272    
_________________________________________________________________
dropout (Dropout)            (16, None, 128)           0         
_________________________________________________________________
dense_1 (Dense)              (16, None, 1703)          219687    
Total params: 21,106,343
Trainable params: 21,106,343
Non-trainable params: 0
_________________________________________________________________


In [21]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [22]:
example_batch_loss  = loss(target_example_batch, example_batch_predictions)

In [23]:
example_batch_loss.shape

TensorShape([Dimension(16), Dimension(16)])

In [24]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [25]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

tensorboard_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=20,
                                                   batch_size=BATCH_SIZE, write_graph=True, 
                                                   write_grads=True, write_images=False, 
                                                   embeddings_freq=20, update_freq='batch')

In [26]:
steps_per_epoch

157

In [None]:
EPOCHS = 30
history = model.fit(training_dataset.repeat(),
                    epochs=EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=[checkpoint_callback, tensorboard_callback],
                    validation_data=validation_dataset.repeat(),
                    validation_steps=10)

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

# Making Poems

In [None]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

model.summary()

In [None]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)
    # Number of characters to generate
    num_generate = 1000

    # You can change the start string to experiment
    start_string = 'roses'
  
    # Converting our start string to numbers (vectorizing) 
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
  
    # Empty string to store our results
    text_generated = []
  
    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0
  
    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)
  
        # using a multinomial distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()
        
        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)
        
        text_generated.append(idx2char[predicted_id])
  
    return (start_string + ''.join(text_generated))

In [None]:
print(generate_text(model, start_string="roses"))