In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

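For this example, we only need a single text file as training data. We could also write our own poem or play and pass it to the
network for training if we like. Here, however, we will use the Shakespeare text file downloaded in the next cell (as the preview printed below shows, it opens with a 'First Citizen' scene rather than with 'Romeo and Juliet')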

In [2]:
# Download the corpus with Keras' get_file helper; the value it returns is
# the local path that the next cell opens.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [3]:
# Read the raw bytes, then decode them as UTF-8. The `with` block closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# len(text) counts decoded characters, not bytes
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
print(text[:250]) # preview: the first 250 characters of the loaded text

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



Encoding
- We encode each unique character as a different integer

In [5]:
# Sorted list of the distinct characters in the text; each character's position in this list is used as its integer id below
vocab = sorted(set(text))

In [6]:
# Inspect the vocabulary: its container type, its contents, and how many distinct characters it holds
print(type(vocab))
print(vocab)
print(len(vocab))

<class 'list'>
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [7]:
#Create a mapping from unique characters to indices
char2idx = {u:i for i,u in enumerate(vocab)}
#Reverse lookup: idx2char[i] is the character whose index is i (a NumPy array so it can be indexed with whole arrays of ids)
idx2char = np.array(vocab)

def text_to_int(text):
    """Encode a string as a NumPy array holding the ``char2idx`` index of each character."""
    indices = []
    for ch in text:
        # A character missing from char2idx raises KeyError here
        indices.append(char2idx[ch])
    return np.array(indices)

# Encode the full text once; the character dataset further down is built from this array
text_as_int = text_to_int(text)

In [8]:
# Sanity check: show the first 13 characters next to their integer encoding
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [9]:
#function to convert numeric values to text
def int_to_text(ints):
    """Decode integer character ids back into a string.

    Args:
        ints: Indices into ``idx2char``. Either a value NumPy can use directly
            as an index (e.g. an integer array) or an object exposing a
            ``.numpy()`` method that returns such a value.

    Returns:
        The characters looked up in ``idx2char``, joined into one ``str``.
    """
    # Convert only when the object actually offers .numpy(). The previous bare
    # `except: pass` also hid genuine conversion failures (and even
    # KeyboardInterrupt), which then resurfaced as a confusing indexing error.
    to_numpy = getattr(ints, 'numpy', None)
    if callable(to_numpy):
        ints = to_numpy()
    return ''.join(idx2char[ints])
print(int_to_text(text_as_int[:13])) # round-trip check: decoding the first 13 ids should give back the first 13 characters of the text

First Citizen


Creating Training Examples
- Our task is to feed the model a sequence and have it return to us the next character
- This means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples
- We will use a sequence as input and another sequence as output, where this o/p sequence is the original sequence shifted one letter to the right
- e.g., i/p: Hell and o/p: ello

First step is to create a stream of characters from our text data

In [10]:
seq_length = 100 # number of characters in the input (and in the target) of one training example
examples_per_epoch = len(text)//(seq_length + 1) # each training example consumes (seq_length + 1) characters: input and target overlap, shifted by one. NOTE(review): not referenced again in the visible cells

#Creating training examples/targets
# Build a dataset from the encoded text, one character id per element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Now, we use the batch method to turn this stream of characters into batches of desired length

In [11]:
sequences = char_dataset.batch(seq_length+1, drop_remainder = True)
#drop_remainder drops the trailing characters at the end that cannot fill a complete batch of size 101 (seq_length + 1)

Use these sequences of length 101 and split them into input and output

In [12]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, e.g. 'hello' -> ('hell', 'ello').
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target) # map applies split_input_target to every sequence, so each element becomes an (input, target) pair

In [13]:
type(dataset) # inspect the class of the dataset object returned by map()

tensorflow.python.data.ops.map_op._MapDataset

In [14]:
# Decode and print the first two (input, target) pairs to confirm that the target is the input shifted by one character
for x,y in dataset.take(2):
    print("\n\nEXAMPLE")
    print('INPUT: ',int_to_text(x))
    print('\nOUTPUT: ',int_to_text(y))



EXAMPLE
INPUT:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT:  irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE
INPUT:  are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT:  re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Finally, we need to make training batches

In [15]:
BATCH_SIZE = 64 #number of training sequences grouped into each batch
VOCAB_SIZE = len(vocab) #number of unique characters
EMBEDDING_DIM = 256 #length of the vector the Embedding layer uses to represent each character
RNN_UNITS = 1024 #passed to build_model as the number of units in the LSTM layer

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences
# so it doesn't attempt to shuffle the entire sequence in memory
# it maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into batches of BATCH_SIZE; drop_remainder = True discards a final incomplete batch
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

Building the Model
- We will use embedding, LSTM and a dense layer
- Dense layer contains a node for each unique character in our training data.
- The dense layer outputs one unnormalised score (logit) per node; applying softmax to these logits gives a prob. distr. over all characters

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: Size of the embedding vocabulary and number of Dense outputs.
        embedding_dim: Length of each embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension of the model input.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # The sequence dimension is None because the length of the sequences
    # passed in when making predictions is not known in advance.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )

    # return_sequences=True yields an output at every time step; with False
    # only the final output would be returned. glorot_uniform is a good
    # default starting point for the recurrent weights.
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )

    # One output per vocabulary entry; no activation is set on this layer.
    scores = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, scores])

# Build the training model with the batch dimension fixed to BATCH_SIZE, then print its layer/parameter summary
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


Creating A Loss Function
- Our model will output a (64, sequence_length, 65) shaped tensor of logits (unnormalised scores that define the prob. distr. over the next character), one set of 65 scores at each
timestep, for every sequence in the batch

In [17]:
# Run the (still untrained) model on a single batch to check the shape of its output
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch) #ask our model for a prediction on our first batch of training data
    print(example_batch_predictions.shape, '#(batch_size, sequence_length, vocab_length)')

(64, 100, 65) #(batch_size, sequence_length, vocab_length)


In [18]:
#we can see that the prediction is a collection of 64 arrays of shape (100,65), i.e., one for each sequence in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 9.50125419e-03  1.19565462e-03 -1.13774498e-03 ... -2.88806623e-03
    4.05510468e-03  1.68394786e-03]
  [ 7.04359263e-03  6.35429146e-03 -1.21229037e-03 ... -3.40874493e-03
    1.07180160e-02  5.95949823e-04]
  [ 9.65206511e-03  1.21970885e-02  5.27461991e-03 ... -3.05358844e-04
    9.05109383e-03  5.96746569e-03]
  ...
  [-7.02340435e-03  1.39329629e-03 -2.84144143e-03 ...  6.72004605e-03
    2.62930267e-03  4.81669605e-03]
  [-1.03976931e-02  6.58749836e-04 -1.00462250e-02 ...  6.62025157e-03
   -1.18605653e-03  1.47616351e-03]
  [-3.17057944e-03  8.26543663e-03 -3.56819015e-03 ...  5.23980195e-03
    2.66896561e-04  7.35626929e-03]]

 [[-1.95488939e-03 -1.15381565e-03 -1.83486857e-03 ...  2.60391715e-03
    2.72769085e-03 -6.66618580e-04]
  [-2.65020411e-03 -3.17220110e-06 -1.75666250e-03 ... -2.02989695e-03
    7.07071926e-03 -3.23510240e-03]
  [-8.36339965e-03 -2.30313832e-04 -1.26838195e-03 ...  4.15644376e-03
    4.79529332e-03 -2.00824626e-03]
  ...
  [-7.630

In [19]:
#examine the prediction for the first sequence in the batch (one 2d array: one row per time step, one column per character)
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[ 0.00950125  0.00119565 -0.00113774 ... -0.00288807  0.0040551
   0.00168395]
 [ 0.00704359  0.00635429 -0.00121229 ... -0.00340874  0.01071802
   0.00059595]
 [ 0.00965207  0.01219709  0.00527462 ... -0.00030536  0.00905109
   0.00596747]
 ...
 [-0.0070234   0.0013933  -0.00284144 ...  0.00672005  0.0026293
   0.0048167 ]
 [-0.01039769  0.00065875 -0.01004623 ...  0.00662025 -0.00118606
   0.00147616]
 [-0.00317058  0.00826544 -0.00356819 ...  0.0052398   0.0002669
   0.00735627]], shape=(100, 65), dtype=float32)


In [20]:
#prediction at the first step
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
#its 65 values are logits (unnormalised scores), one per character: the Dense layer has no activation and the loss uses from_logits = True, so they are not probabilities until softmax is applied

65
tf.Tensor(
[ 9.5012542e-03  1.1956546e-03 -1.1377450e-03  4.7183610e-03
  5.5644722e-03  3.4959055e-03 -3.2487097e-03 -8.9778390e-04
  1.0870774e-03 -2.2877681e-03 -2.0703176e-04  1.1756544e-03
 -3.7854617e-03 -1.4933019e-03  5.2501149e-03 -2.3365486e-05
  1.7615437e-04 -3.8217758e-03 -5.4644244e-03  2.3330003e-04
 -1.6296285e-03 -9.1430591e-04  5.7554460e-04 -4.2708311e-04
 -2.6888244e-03  5.7822571e-04  1.0356912e-03 -3.4779133e-03
 -5.2930564e-03  4.5891386e-03  2.7047569e-04  9.3190465e-05
  2.0134468e-03 -3.9827549e-03 -2.1932835e-03  1.4454343e-03
 -3.3242698e-04 -4.2910138e-03 -1.9990115e-03  2.8264048e-03
 -3.2877347e-03 -3.5806887e-03 -3.9789202e-03 -6.0600070e-03
  1.4311330e-03  1.0862820e-03 -4.5173243e-03 -1.3049300e-03
  2.4285729e-03  1.3694715e-03 -2.4794717e-03 -3.7528961e-03
 -6.8914741e-03 -1.4019492e-03  1.1776872e-03 -6.4224540e-04
  7.7219796e-03 -1.9571960e-03  2.5359516e-03  7.3621408e-03
 -1.7208904e-03  7.7625655e-04 -2.8880662e-03  4.0551047e-03
  1.683947

In [21]:
#if we want to determine the predicted character, we need to sample the output distribution (pick a value based on prob. distr.)
sampled_indices = tf.random.categorical(pred, num_samples = 1) #draws 1 sample per time step from the categorical distribution over the 65 characters defined by the logits
#here characters are picked not based on highest prob. but uses a prob. distr. to pick it
print(sampled_indices)

#now we can reshape that array and convert all the integers to characters to see the actual text
sampled_indices = np.reshape(sampled_indices, (1,-1))[0]
print(sampled_indices)

predicted_chars = int_to_text(sampled_indices)

predicted_chars #this is what the (still untrained) model predicted for training sequence 1

tf.Tensor(
[[55]
 [48]
 [29]
 [34]
 [ 1]
 [63]
 [ 2]
 [12]
 [47]
 [16]
 [ 0]
 [26]
 [35]
 [14]
 [57]
 [ 7]
 [42]
 [51]
 [ 0]
 [16]
 [62]
 [ 3]
 [23]
 [39]
 [57]
 [ 2]
 [11]
 [14]
 [56]
 [25]
 [57]
 [55]
 [24]
 [29]
 [61]
 [ 2]
 [48]
 [44]
 [62]
 [18]
 [24]
 [ 3]
 [ 4]
 [10]
 [28]
 [52]
 [37]
 [64]
 [50]
 [10]
 [24]
 [23]
 [40]
 [58]
 [22]
 [47]
 [ 6]
 [64]
 [25]
 [26]
 [43]
 [19]
 [31]
 [ 0]
 [ 8]
 [31]
 [40]
 [ 9]
 [57]
 [ 1]
 [17]
 [ 6]
 [ 1]
 [ 8]
 [ 3]
 [18]
 [ 5]
 [45]
 [54]
 [57]
 [54]
 [51]
 [41]
 [31]
 [38]
 [16]
 [18]
 [ 3]
 [10]
 [16]
 [24]
 [64]
 [ 4]
 [15]
 [22]
 [21]
 [49]
 [42]
 [ 8]
 [57]], shape=(100, 1), dtype=int64)
[55 48 29 34  1 63  2 12 47 16  0 26 35 14 57  7 42 51  0 16 62  3 23 39
 57  2 11 14 56 25 57 55 24 29 61  2 48 44 62 18 24  3  4 10 28 52 37 64
 50 10 24 23 40 58 22 47  6 64 25 26 43 19 31  0  8 31 40  9 57  1 17  6
  1  8  3 18  5 45 54 57 54 51 41 31 38 16 18  3 10 16 24 64  4 15 22 21
 49 42  8 57]


"qjQV y!?iD\nNWBs-dm\nDx$Kas!;BrMsqLQw!jfxFL$&:PnYzl:LKbtJi,zMNeGS\n.Sb3s E, .$F'gpspmcSZDF$:DLz&CJIkd.s"

- Now we need to create a loss function that can compare that output to the expected output and give us some numeric value representing how close the two were

In [22]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    # from_logits = True: the second argument is treated as logits rather than probabilities
    return crossentropy(labels, logits, from_logits = True)

Compiling The Model

In [23]:
model.compile(optimizer = 'adam', loss = loss) # Adam optimizer together with the custom `loss` function

- We are going to set up and configure our model to save checkpoints as it trains. This will allow us to load our model from a checkpoint and continue training it

In [24]:
#Directory where checkpoints will be saved
checkpoint_dir = './training_checkpoints'
#Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}') #'ckpt_{epoch}' is the file-name prefix; the {epoch} placeholder gives every epoch its own checkpoint

# save_weights_only = True: the callback stores only the model's weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_prefix, save_weights_only = True)

Training the model

In [25]:
# Keras documents `callbacks` as a list of Callback instances, so the single
# checkpoint callback is wrapped in a list.
# More epochs usually lower the training loss, but training for too long can overfit.
history = model.fit(data, epochs=40, callbacks = [checkpoint_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


We'll rebuild the model from a checkpoint using a batch size of 1 so that we can feed one piece of text to the model and make it predict

In [26]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size = 1) # same architecture, but with the batch dimension fixed to 1 for text generation

Once the model is finished training, we can find the latest checkpoint that stores the model weights using the following line

In [27]:
# Load the weights from the most recent checkpoint found in checkpoint_dir
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1,None])) #[1,None] means a batch size of 1; the 2nd dimension (sequence length) is left unspecified

We can use the function defined below (it follows the text-generation example from the TensorFlow tutorials) to generate some text using any starting string we like

In [28]:
def generate_text(model, start_string, num_generate=800, temperature=1.0):
    """Generate text by repeatedly sampling the next character from ``model``.

    Args:
        model: The trained model rebuilt with a batch size of 1. Its recurrent
            state is reset before generation starts.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to generate. Defaults to 800, the
            value that used to be hard-coded.
        temperature: Positive number the logits are divided by before
            sampling. Low values give more predictable text, high values more
            surprising text. Defaults to 1.0 (logits used unchanged).

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    # Fail early with a clear message: an empty seed gives the model a
    # zero-length sequence and only breaks later, deep inside the sampling step.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Same exception type as the plain dictionary lookup would raise, but the
    # message lists every offending character at once.
    unknown = sorted(set(start_string) - set(char2idx))
    if unknown:
        raise KeyError(
            "start_string contains characters that are not in the vocabulary: {}".format(unknown)
        )

    # Converting our start string to numbers (vectorizing) and adding the
    # batch dimension, e.g. [1 2 3] -> [[1 2 3]]
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # List collecting the generated characters
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension: [[...]] -> [...]
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample from the resulting categorical
        # distribution; only the sample for the last time step is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the stateful LSTM
        # carries the previous hidden state across calls.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [29]:
# Ask for a seed string interactively, then print the seed followed by the generated continuation
inp = input("Type a starting string: ")
print(generate_text(model, inp))

Type a starting string: adventure
adventure's death,
He slily-but us all, that will deceive we free,
We have a power in't igan; for whom I profess, I will
pass'd the entious worm, you were in lawless horse!
Thy of that store, whose children he was crows;
All hour as this hath granted to the Tower,
Only to time; for 'tis the city of a
strong-prights and a brain:
Ay, if he would please you, this must fold,
Our word pooress,
And most assisting on the lineary in his master.

PETRUCHIO:
Ay, this is some of that safety?

First Murderer:
Take that, and tell those grave me Duke of my master's fair.

HENRY BOLINGBROKE:
My gracious uncle, let me know my state,
Ey with purgo set in her most unevent! They have ranced with no man shall
good one Marcius is our case as to stuff a to
come, thou didst usures it to my son's and blunt his lands and l
