<a href="https://colab.research.google.com/github/dfridland/NLP/blob/HW9/NLP_DF_HW9_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os
import time

In [None]:
# `tf.test.is_gpu_available` is deprecated; query the physical device list instead.
# `is_built_with_cuda()` keeps the CUDA-only meaning of the old `cuda_only=True` flag.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(tf.config.list_physical_devices('GPU'))
is_cuda_gpu_available

True

In [None]:
# GPU devices TensorFlow can see on this machine.
gpus = tf.config.list_physical_devices(device_type='GPU')
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Read the corpus as raw bytes and decode it explicitly, so line endings are kept
# exactly as stored in the file; the `with` block closes the file handle.
with open('idiot.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(text[:400])

The Idiot

by Fyodor Dostoyevsky


PART I

I.

Towards the end of November, during a thaw, at nine o’clock one
morning, a train on the Warsaw and Petersburg railway was approaching
the latter city at full speed. The morning was so damp and misty that
it was only with great difficulty that the day succeeded in breaking;
and it was impossible to distinguish anything more than a few yards


In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
len(vocab)

94

In [None]:
# Lookup tables: character -> integer id, and integer id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
len(text_as_int)

1393505

In [None]:
# Length of one training input, in characters.
len_seq = 150
# Every example consumes len_seq + 1 characters (the input plus its shifted target).
examples_per_epoch = len(text) // (len_seq + 1)

# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Peek at the first five characters of the stream.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

T
h
e
 
I


In [None]:
# Cut the character stream into chunks of len_seq + 1 ids; an incomplete final chunk is dropped.
sequences = char_dataset.batch(batch_size=len_seq + 1, drop_remainder=True)

# Decode the first five chunks back to text to check the chunking.
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))

'The Idiot\r\n\r\nby Fyodor Dostoyevsky\r\n\r\n\nPART I\r\n\r\nI.\r\n\r\nTowards the end of November, during a thaw, at nine o’clock one\r\nmorning, a train on the Warsaw '
'and Petersburg railway was approaching\r\nthe latter city at full speed. The morning was so damp and misty that\r\nit was only with great difficulty that t'
'he day succeeded in breaking;\r\nand it was impossible to distinguish anything more than a few yards\r\naway from the carriage windows.\r\n\r\nSome of the pass'
'engers by this particular train were returning from\r\nabroad; but the third-class carriages were the best filled, chiefly\r\nwith insignificant persons of'
' various occupations and degrees, picked\r\nup at the different stations nearer town. All of them seemed weary, and\r\nmost of them had sleepy eyes and a s'


In [None]:
def split_input_target(chunk):
    """Split one chunk into an ``(input, target)`` pair offset by one position.

    The input is the chunk without its last element and the target is the chunk
    without its first element, so ``target[i]`` is the element that follows
    ``input[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Show the first training pair decoded back to text.
for input_example, target_example in dataset.take(1):
    input_chars = ''.join(idx2char[input_example.numpy()])
    target_chars = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_chars))
    print('Target data:', repr(target_chars))

Input data:  'The Idiot\r\n\r\nby Fyodor Dostoyevsky\r\n\r\n\nPART I\r\n\r\nI.\r\n\r\nTowards the end of November, during a thaw, at nine o’clock one\r\nmorning, a train on the Warsaw'
Target data: 'he Idiot\r\n\r\nby Fyodor Dostoyevsky\r\n\r\n\nPART I\r\n\r\nI.\r\n\r\nTowards the end of November, during a thaw, at nine o’clock one\r\nmorning, a train on the Warsaw '


In [None]:
# Number of sequences per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer.
# (tf.data is designed to work with possibly infinite sequences, so it does not
# shuffle the whole dataset in memory; it shuffles within a buffer of this size.)
BUFFER_SIZE = 10000

# Build the batched dataset from `sequences` instead of re-assigning `dataset` in
# terms of itself, so re-running this cell does not batch already-batched data.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 150), dtype=tf.int64, name=None), TensorSpec(shape=(64, 150), dtype=tf.int64, name=None))>

In [None]:
# Model hyper-parameters shared by the model definitions in this notebook.
vocab_size = len(vocab)
embedding_dim = 128
rnn_units = 1024

# Baseline: embedding -> single LSTM -> per-character logits.
# `model` is re-assigned by later cells.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.LSTM(rnn_units, return_sequences=True))
model.add(tf.keras.layers.Dense(vocab_size))

In [None]:
class RNNgenerator(tf.keras.Model):
    """Character-level generator: embedding -> GRU layers -> dense logits.

    Args:
        vocab_size: number of distinct characters; sizes the embedding table
            and the output layer.
        embedding_dim: width of the character embedding.
        batch_size: accepted so existing call sites keep working; the model
            does not use it.
        units: width of every GRU layer. When omitted, the notebook-level
            ``rnn_units`` is used, so three-argument calls behave as before.
    """

    def __init__(self, vocab_size, embedding_dim, batch_size, units=None):
        super().__init__()

        # Make the layer width an explicit argument instead of a hidden
        # dependency on a global; fall back to the global for compatibility.
        if units is None:
            units = rnn_units

        self.emb = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru1 = tf.keras.layers.GRU(units, return_sequences=True, recurrent_initializer='glorot_uniform')
        self.gru2 = tf.keras.layers.GRU(units, return_sequences=True, recurrent_initializer='glorot_uniform')
        # NOTE(review): `gru3` is created but never used in `call` - confirm
        # whether it was meant to be part of the stack.
        self.gru3 = tf.keras.layers.GRU(units, return_sequences=True, recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Map a batch of character ids to per-position logits over the vocabulary."""
        emb_x = self.emb(x)
        x1 = self.gru1(emb_x)
        x = x1
        # NOTE(review): the same `gru2` layer (shared weights) is applied three
        # times in a row - confirm this is intended rather than gru2 -> gru3.
        for _ in range(3):
            x = self.gru2(x)
        # Average the deep path with the first GRU's output.
        x = (x + x1) / 2
        return self.fc(x)


model = RNNgenerator(vocab_size, embedding_dim, BATCH_SIZE)

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, num_layers=5):
    """Build the stateful character-level LSTM language model.

    The network is an embedding, followed by ``num_layers`` identical stateful
    LSTM layers that return full sequences, followed by a dense layer that
    emits one logit per vocabulary entry at every position.

    Args:
        vocab_size: number of distinct characters (embedding rows and output logits).
        embedding_dim: width of the character embedding.
        rnn_units: number of units in every LSTM layer.
        batch_size: fixed batch size baked into the input shape; the LSTM
            layers are stateful, so the model only accepts batches of this size.
        num_layers: number of stacked LSTM layers. The default of 5 reproduces
            the original hand-written stack.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
    ]
    # One comprehension instead of copy-pasted layer definitions keeps every
    # LSTM layer configured identically.
    layers += [
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform')
        for _ in range(num_layers)
    ]
    layers.append(tf.keras.layers.Dense(vocab_size))
    return tf.keras.Sequential(layers)

In [None]:
# Training model: the batch size is fixed to BATCH_SIZE by the model's input shape.
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [None]:
# Sanity check: run one batch through the untrained model and look at the output shape.
for input_example_batch, target_example_batch in dataset.take(count=1):
    example_batch_predictions = model(input_example_batch)
    logits_shape = example_batch_predictions.shape
    print(logits_shape, "# (batch_size, sequence_length, vocab_size)")

(64, 150, 94) # (batch_size, sequence_length, vocab_size)


In [None]:
# Layer-by-layer overview of the training model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (64, None, 128)           12032     
                                                                 
 lstm_7 (LSTM)               (64, None, 1024)          4722688   
                                                                 
 lstm_8 (LSTM)               (64, None, 1024)          8392704   
                                                                 
 lstm_9 (LSTM)               (64, None, 1024)          8392704   
                                                                 
 lstm_10 (LSTM)              (64, None, 1024)          8392704   
                                                                 
 lstm_11 (LSTM)              (64, None, 1024)          8392704   
                                                                 
 dense_5 (Dense)             (64, None, 94)           

In [None]:
# Model output for the first sequence of the example batch.
example_batch_predictions[0]

<tf.Tensor: shape=(150, 94), dtype=float32, numpy=
array([[-5.2970708e-09, -2.3660582e-06, -3.7212228e-06, ...,
        -8.8760299e-07,  4.0124456e-07, -1.4392950e-06],
       [ 8.8843086e-07, -8.4582762e-06, -1.0565443e-05, ...,
        -4.0042592e-06,  2.4585047e-06, -5.9430004e-06],
       [ 2.4757992e-06, -2.0555362e-05, -1.7189132e-05, ...,
        -1.1777334e-05,  6.6926914e-06, -1.2729175e-05],
       ...,
       [ 5.7078735e-04,  1.0774263e-04,  7.2236895e-04, ...,
        -1.4427755e-03, -1.0840442e-03,  1.0496326e-03],
       [ 6.4854865e-04,  1.4464912e-04,  7.7500474e-04, ...,
        -1.4363336e-03, -1.0501007e-03,  1.0179465e-03],
       [ 7.2517141e-04,  1.8528110e-04,  8.3676202e-04, ...,
        -1.4271148e-03, -1.0120168e-03,  9.7194582e-04]], dtype=float32)>

In [None]:
# Sample one character id per position from the model output for the batch's
# sequence at index 3, then drop the trailing sample axis.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[3], num_samples=1),
    axis=-1,
).numpy()

In [None]:
# Compare an input sequence with the characters sampled from the untrained model.
example_input_text = "".join(idx2char[input_example_batch[3]])
sampled_text = "".join(idx2char[sampled_indices ])
print("Input: \n", repr(example_input_text))
print()
print("Next Char Predictions: \n", repr(sampled_text))

Input: 
 ' of life, but as a whole they are accursed.\r\nThe whole tendency of our latest centuries, in its scientific and\r\nmaterialistic aspect, is most probably'

Next Char Predictions: 
 'ULêçVr[dèvppAPY2RZ\n7‘2Hv$r\'Kn\'MIf‘èvàP3gMMvrD9W*Xg\'7o*CT9ênKa"fo[7 ]3é-TD8ca)!J‘)\n6"w,L_9;k\ns]?on’7T“F’àUkd%’8e?\'z"5D“è/X2‘—s\'éà8NéW)’-n]]c_T/7wbzBzER'


In [None]:
## Train the model

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalized scores,
    not probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)

# Loss of the untrained model on the example batch. A uniform guess over the
# vocabulary would score ln(vocab_size), which is a useful reference value.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_example_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_example_loss)

Prediction shape:  (64, 150, 94)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.5431714


In [None]:
# Train with the Adam optimizer and the logits-based loss function `loss`.
model.compile(loss=loss, optimizer='adam')

In [None]:
### Configure checkpoints

In [None]:
# Remove the checkpoint directory and everything in it.
!rm -rf ./training_checkpoints

In [None]:
# List the checkpoint directory; `ls` reports an error if it does not exist.
!ls ./training_checkpoints

ls: ./training_checkpoints: No such file or directory


In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Checkpoint file name; Keras substitutes the current epoch number for {epoch}
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save at the end of every epoch. A batch-count `save_freq` that does not line
# up with the epoch length writes checkpoints mid-epoch, and the last one then
# predates the end of training, so the "latest" checkpoint would not hold the
# final weights.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_freq='epoch',
    save_weights_only=True)

In [None]:
### Execute the training

In [None]:
# Restrict TensorFlow to the first GPU when one is present. On a machine with
# no GPU the list is empty, so leave the device configuration untouched
# instead of failing on `physical_devices[0]`.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.set_visible_devices(physical_devices[0], 'GPU')

In [None]:
%%time
EPOCHS = 50

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 5min 48s, sys: 1min 34s, total: 7min 23s
Wall time: 1h 22min 4s


In [None]:
## Generate text

In [None]:
# Path of the latest checkpoint found in the checkpoint directory.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_50'

In [None]:
# Rebuild the same architecture with batch size 1 (the batch size is fixed in
# the model's input shape) and restore the weights from the latest checkpoint.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint_path)
model.build(tf.TensorShape([1, None]))

In [None]:
# Layer-by-layer overview of the model rebuilt with batch size 1.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (1, None, 128)            12032     
                                                                 
 lstm_12 (LSTM)              (1, None, 1024)           4722688   
                                                                 
 lstm_13 (LSTM)              (1, None, 1024)           8392704   
                                                                 
 lstm_14 (LSTM)              (1, None, 1024)           8392704   
                                                                 
 lstm_15 (LSTM)              (1, None, 1024)           8392704   
                                                                 
 lstm_16 (LSTM)              (1, None, 1024)           8392704   
                                                                 
 dense_6 (Dense)             (1, None, 94)            

In [None]:
def generate_text(model, start_string, num_generate=500, temperature=0.5):
    """Generate text by sampling one character at a time from the model.

    Args:
        model: stateful character-level model built for batch size 1.
        start_string: non-empty seed text; every character must be a key of
            ``char2idx``.
        num_generate: number of characters to sample after the seed.
        temperature: positive divisor applied to the logits before sampling.
            Low values give more predictable text, high values more
            surprising text.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Vectorize the seed and add a leading batch axis of size 1.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    text_generated = []

    # Clear the recurrent state so earlier calls do not influence this one.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis: one row of logits per input position.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample from the categorical distribution and keep the draw for the
        # last position, i.e. the character following the current input.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in; the stateful layers carry
        # the context of everything seen so far.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Generate a continuation of a sentence from the opening of the corpus and print it.
text_ = generate_text(model=model, start_string=u"The morning was so damp and misty that ")
print(text_)

The morning was so damp and misty that the worst of it
all, to the presence of your friends, I think I ought to explain,
gentlemen, that I only did so to assert our rights, though she trembled in
all her limbs. And when the subject of this murder of the Pope of Rome is,
he will never speak to you again. She did not come here to marry Rogojin. I
dreamt of the story.

“As to the rest for an instant and then paid a very difficult and malice.

“That is proved by my peeper” my thought, enough!” he cried, suddenly. “I see I have b


In [None]:
# Length of the returned text (seed plus generated characters).
len(text_)