In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

For this example, we only need one piece of training data. We could also write our own poem or play and pass it to the 
network for training if we like. Here, however, we use TensorFlow's `shakespeare.txt`, a text file containing excerpts from Shakespeare's plays.

In [2]:
# Download the text file with the Keras file utility; it returns the local path of the downloaded copy
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [3]:
# Read the raw bytes, then decode them as UTF-8.
# The context manager closes the file handle as soon as the read is done.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# The length of the text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [4]:
print(text[:250]) # preview: the first 250 characters of the text

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



Encoding
- We encode each unique character as a different integer

In [5]:
# The vocabulary is every distinct character of the text, in sorted order
vocab = list(set(text))
vocab.sort()

In [6]:
# Show the container type, the characters themselves and the vocabulary size (one per line)
print(type(vocab), vocab, len(vocab), sep='\n')

<class 'list'>
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [7]:
# Two lookup tables: character -> index (dict) and index -> character (NumPy array)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
    """Encode a string as a NumPy array holding the vocabulary index of each character.

    Raises KeyError if a character is not a key of ``char2idx``.
    """
    indices = [char2idx[symbol] for symbol in text]
    return np.array(indices)

# Integer-encoded version of the whole text
text_as_int = text_to_int(text)

In [8]:
# Encode the first 13 characters of the text to see the character -> index mapping in action
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [9]:
#function to convert numeric values to text
def int_to_text(ints):
    """Decode a sequence of vocabulary indices back into a string.

    ``ints`` may be a NumPy integer array, or an object exposing ``.numpy()``
    (such as the tensors yielded by the tf.data pipeline), which is converted
    to a NumPy array first. The indices are looked up in ``idx2char`` and the
    resulting characters are joined into one string.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # No .numpy() method: the value is used directly as an index array.
        # Only AttributeError is caught so that unrelated failures (and
        # KeyboardInterrupt) are not silently swallowed.
        pass
    return ''.join(idx2char[ints])
print(int_to_text(text_as_int[:13]))

First Citizen


Creating Training Examples
- Our task is to feed the model a sequence and have it return to us the next character
- This means we need to split our text data from above into many shorter sequences that we can pass to the model as training examples
- We will use a sequence as input and another sequence as output, where the output sequence is the input sequence shifted one character to the right
- e.g., i/p: Hell and o/p: ello

First step is to create a stream of characters from our text data

In [10]:
seq_length = 100 # number of input characters in each training example
examples_per_epoch = len(text)//(seq_length + 1) #each example consumes (seq_length + 1) characters: the input and the target overlap, shifted by one character

#Turn the integer-encoded text into a dataset that streams it element by element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

Now, we use the batch method to turn this stream of characters into batches of desired length

In [11]:
# Group the character stream into chunks of seq_length + 1 characters (101 here);
# drop_remainder=True discards the trailing characters that cannot fill a complete chunk
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

Use these sequences of length 101 and split them into input and output

In [12]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. 'hello' -> ('hell', 'ello').
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target) # map applies split_input_target to every chunk, giving (input, target) pairs

In [13]:
# Inspect the type of the object returned by map
type(dataset)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [14]:
# Decode the first two (input, target) pairs to check that each target is its input shifted by one character
for x,y in dataset.take(2):
    print("\n\nEXAMPLE")
    print('INPUT: ',int_to_text(x))
    print('\nOUTPUT: ',int_to_text(y))



EXAMPLE
INPUT:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUTPUT:  irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


EXAMPLE
INPUT:  are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUTPUT:  re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


Finally, we need to make training batches

In [15]:
BATCH_SIZE = 64 #number of training sequences grouped into each batch
VOCAB_SIZE = len(vocab) #number of unique characters
EMBEDDING_DIM = 256 #passed to build_model as embedding_dim: the length of the vector that represents each character
RNN_UNITS = 1024 #passed to build_model as rnn_units: the number of units in the LSTM layer

# Buffer size to shuffle the dataset
# (tf.data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory;
# instead it maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# Shuffle the examples, then group them into batches of BATCH_SIZE (an incomplete final batch is dropped)
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder = True)

Building the Model
- We will use embedding, LSTM and a dense layer 
- Dense layer contains a node for each unique character in our training data. 
- The dense layer outputs one raw score (logit) per character; applying a softmax to these scores gives a probability distribution over the characters

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding input
            size and as the number of output nodes of the dense layer.
        embedding_dim: length of the vector the embedding assigns to each character.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch dimension of the model input (the LSTM is stateful).

    Returns:
        A tf.keras.Sequential model. The final Dense layer has no activation,
        so it emits one raw score per character at every timestep.
    """
    # The sequence length is left as None so that inputs of any length are
    # accepted, e.g. when making predictions.
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # return_sequences=True yields an output at every timestep rather than
    # only the final one.
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    # One output node per unique character.
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the model with the training batch size and list its layers
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


Creating A Loss Function
- Our model outputs a tensor of shape (64, sequence_length, 65) that holds, for every sequence in the batch and at each 
timestep, one raw score (logit) per character; a softmax over these scores gives the probability distribution of the next character

In [17]:
# Run the freshly built model on a single batch to check the shape of its output
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch) #ask our model for a prediction on one batch of training data
    print(example_batch_predictions.shape, '#(batch_size, sequence_length, vocab_length)')

(64, 100, 65) #(batch_size, sequence_length, vocab_length)


In [19]:
#the prediction is a collection of 64 arrays of shape (100, 65), i.e., one for each sequence in the batch
print(len(example_batch_predictions))
print(example_batch_predictions)

64
tf.Tensor(
[[[ 1.52606273e-03  4.39784210e-03  1.56661635e-03 ... -6.78597484e-04
   -5.33037540e-03 -3.64726176e-04]
  [ 2.29885569e-03  6.65342610e-04 -1.71128614e-03 ... -1.36823836e-03
   -1.78331695e-03 -3.12160864e-03]
  [-3.25368810e-03 -5.23829740e-03 -2.13816017e-03 ...  6.80872519e-03
   -4.13131481e-03 -4.46233992e-03]
  ...
  [-1.77805428e-03 -4.60925838e-03 -1.15368022e-02 ...  4.41238331e-03
    4.86281933e-03 -3.39989970e-03]
  [ 7.27860257e-04 -6.06255955e-04 -8.13094806e-03 ...  2.84038065e-03
    6.77505042e-03 -5.60484175e-03]
  [-6.03105640e-04  2.19245767e-03 -5.47381351e-04 ...  8.92934622e-04
    8.70068651e-03 -3.13523086e-03]]

 [[-2.75722472e-03  1.72068668e-03  7.96484761e-04 ...  2.91817565e-03
    5.79182478e-03 -9.55520722e-04]
  [-1.04677130e-03  4.50261915e-03  5.17377164e-04 ...  1.08733680e-03
    6.44883793e-03 -3.06472182e-03]
  [ 2.37814325e-04  7.31977355e-03  8.46851617e-05 ...  4.08830121e-04
    7.04095466e-03 -4.47180681e-03]
  ...
  [-2.653

In [20]:
#examine the prediction for one sequence (a 2-D array: one row of 65 values per timestep)
pred = example_batch_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[ 0.00152606  0.00439784  0.00156662 ... -0.0006786  -0.00533038
  -0.00036473]
 [ 0.00229886  0.00066534 -0.00171129 ... -0.00136824 -0.00178332
  -0.00312161]
 [-0.00325369 -0.0052383  -0.00213816 ...  0.00680873 -0.00413131
  -0.00446234]
 ...
 [-0.00177805 -0.00460926 -0.0115368  ...  0.00441238  0.00486282
  -0.0033999 ]
 [ 0.00072786 -0.00060626 -0.00813095 ...  0.00284038  0.00677505
  -0.00560484]
 [-0.00060311  0.00219246 -0.00054738 ...  0.00089293  0.00870069
  -0.00313523]], shape=(100, 65), dtype=float32)


In [21]:
#prediction at the first timestep
time_pred = pred[0]
print(len(time_pred))
print(time_pred)
#its 65 values are raw scores (logits), one per character, for the character occurring next; they are not probabilities (note the negative values)

65
tf.Tensor(
[ 1.5260627e-03  4.3978421e-03  1.5666164e-03 -3.3744862e-03
 -2.4987098e-03  3.0013584e-03 -7.8515004e-04 -5.5566966e-03
 -4.5616589e-03  1.5471394e-03 -4.5454642e-04 -2.0724447e-03
 -3.7062885e-03 -7.7394610e-03  9.5157546e-04 -6.8976851e-03
 -1.8667595e-03  7.4679416e-04 -3.0121536e-03  4.7531873e-03
  7.7455101e-04  1.9817087e-03  5.2709780e-03 -3.2888567e-03
  6.3460303e-04 -3.3614137e-03 -1.7593361e-03  3.8476447e-03
 -7.0484541e-04 -3.1365897e-03  1.6508361e-03  3.6797517e-03
 -1.1645490e-03  2.7079117e-03 -6.7240972e-04 -6.3039362e-03
 -3.3678252e-03 -4.5089750e-05 -1.8789032e-03 -2.3338529e-03
 -1.0582742e-03  4.0736998e-04  4.7858246e-03  1.2775387e-03
  2.8697855e-03  1.9102667e-03 -8.1526570e-04 -2.1858558e-03
 -2.2530376e-03 -3.3462343e-03 -1.1918263e-03  2.2916633e-03
 -2.5648635e-03 -8.4485195e-04 -4.4924663e-03  2.8814538e-03
 -2.1200345e-03  1.2963250e-03 -2.4628232e-04  2.2931132e-03
  4.8260868e-04  6.1183312e-04 -6.7859748e-04 -5.3303754e-03
 -3.647261

In [25]:
#to determine the predicted characters, we sample from the output distribution (pick a value based on the prob. distr.)
sampled_indices = tf.random.categorical(pred, num_samples = 1) #draws 1 sample per row of pred (i.e., per timestep) from a categorical distribution over the 65 characters
#here characters are not picked by taking the highest score; they are drawn at random according to the distribution
print(sampled_indices)

#now we flatten the (100, 1) result into a 1-D array and convert the integers to characters
sampled_indices = np.reshape(sampled_indices, (1,-1))[0]
print(sampled_indices)

predicted_chars = int_to_text(sampled_indices)

predicted_chars #this is what the model predicted for the first sequence of the batch

tf.Tensor(
[[18]
 [18]
 [31]
 [54]
 [32]
 [64]
 [15]
 [56]
 [39]
 [34]
 [14]
 [ 2]
 [ 6]
 [44]
 [ 7]
 [52]
 [11]
 [60]
 [19]
 [24]
 [26]
 [20]
 [46]
 [ 6]
 [59]
 [ 7]
 [36]
 [35]
 [25]
 [16]
 [35]
 [41]
 [12]
 [47]
 [63]
 [35]
 [33]
 [41]
 [54]
 [27]
 [39]
 [17]
 [55]
 [56]
 [24]
 [ 2]
 [43]
 [ 5]
 [ 1]
 [12]
 [ 7]
 [63]
 [ 3]
 [55]
 [17]
 [55]
 [55]
 [43]
 [38]
 [12]
 [48]
 [62]
 [38]
 [51]
 [49]
 [ 3]
 [36]
 [34]
 [ 4]
 [54]
 [53]
 [32]
 [ 2]
 [37]
 [55]
 [37]
 [ 0]
 [37]
 [14]
 [60]
 [ 3]
 [37]
 [14]
 [29]
 [39]
 [53]
 [46]
 [ 8]
 [ 5]
 [ 7]
 [39]
 [54]
 [64]
 [ 1]
 [25]
 [ 8]
 [34]
 [52]
 [ 3]
 [46]], shape=(100, 1), dtype=int64)
[18 18 31 54 32 64 15 56 39 34 14  2  6 44  7 52 11 60 19 24 26 20 46  6
 59  7 36 35 25 16 35 41 12 47 63 35 33 41 54 27 39 17 55 56 24  2 43  5
  1 12  7 63  3 55 17 55 55 43 38 12 48 62 38 51 49  3 36 34  4 54 53 32
  2 37 55 37  0 37 14 60  3 37 14 29 39 53 46  8  5  7 39 54 64  1 25  8
 34 52  3 46]


"FFSpTzCraVB!,f-n;vGLNHh,u-XWMDWc?iyWUcpOaEqrL!e' ?-y$qEqqeZ?jxZmk$XV&poT!YqY\nYBv$YBQaoh.'-apz M.Vn$h"

- Now we need to create a loss function that can compare that output to the expected output and give us some numeric value representing how close the two were

In [26]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and the model outputs.

    ``from_logits=True`` tells Keras that ``logits`` are raw scores that have
    not been passed through a softmax.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)