In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import tensorflow as tf

In [4]:
# Relative path to the corpus text file (shakespeare.txt) that the rest of the notebook reads.
path_to_file = './resources/06-NLP-and-Text-Data/shakespeare.txt'

In [5]:
# Load the entire corpus into memory as one string. The context manager closes
# the file handle deterministically instead of leaving it open until garbage
# collection.
# NOTE(review): no explicit encoding is passed, so the platform default is used —
# confirm the file's encoding if this notebook is run on another OS.
with open(path_to_file, 'r') as corpus_file:
    text = corpus_file.read()

# Sanity check: show a 300-character slice from inside the corpus.
print(text[4500:4800])

converted are
  From his low tract and look another way:
    So thou, thy self out-going in thy noon:
    Unlooked on diest unless thou get a son.


                     8
  Music to hear, why hear'st thou music sadly?
  Sweets with sweets war not, joy delights in joy:
  Why lov'st thou that which t


In [6]:
# Every distinct character in the corpus, sorted so that a character's position
# in this list is a stable integer id.
vocab = sorted(set(text))

In [7]:
len(vocab)

84

In [8]:
# Forward lookup: character -> its position in the sorted vocabulary.
char_to_ind = dict(zip(vocab, range(len(vocab))))

# Reverse lookup kept as a NumPy array so a whole array of indices can be
# decoded with a single indexing operation.
ind_to_char = np.array(vocab)

In [9]:
char_to_ind['H']

33

In [10]:
ind_to_char[33]

'H'

In [11]:
# Encode the full corpus as an array of vocabulary indices, one per character.
encoded_text = np.array(
    [char_to_ind[char] for char in text]
)

In [12]:
print(text[:100])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose mi


In [13]:
encoded_text[:100]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64])

In [14]:
# Number of characters in each training input; chunks are cut at seq_len + 1 so
# the one-step-shifted target has the same length.
seq_len = 120

In [15]:
# How many complete (seq_len + 1)-character chunks fit in the corpus.
total_num_seq = len(text) // (seq_len + 1)

In [16]:
total_num_seq

45005

In [17]:
# Wrap the encoded corpus in a tf.data pipeline; slicing along the first axis
# makes each dataset element a single character index.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [18]:
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [19]:
# for item in char_dataset.take(500):
#     print(item.numpy())

In [20]:
# Group consecutive character ids into chunks of seq_len + 1; drop_remainder=True
# discards a final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

In [21]:
def create_seq_target(sequence):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "Hello" -> ("Hell", "ello"), so
    target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [22]:
# Turn every chunk into an (input, target) pair via create_seq_target.
dataset = sequences.map(create_seq_target)

In [23]:
# Show the first (input, target) pair both as raw indices and decoded text.
for input_txt, target_txt in dataset.take(1):
    input_ids = input_txt.numpy()
    target_ids = target_txt.numpy()

    print(input_ids)
    print("".join(ind_to_char[input_ids]))
    print("\n")
    print(target_ids)
    print("".join(ind_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [24]:
# Number of sequences per training batch; the same value is later passed to
# create_model as the model's fixed batch dimension.
batch_size = 128

In [25]:
# Size of the buffer tf.data draws from when shuffling the sequences.
buffer_size = 10000

# Build the training pipeline from `sequences` rather than from `dataset` itself:
# rebinding `dataset` in terms of its previous value made this cell non-idempotent
# (a second run would batch an already-batched dataset). On a fresh top-to-bottom
# run the result is the same as before.
dataset = (
    sequences
    .map(create_seq_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [26]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [27]:
# One entry per distinct character; used for both the embedding's input
# dimension and the width of the output layer.
vocab_size = len(vocab)
# Length of the vector each character id is embedded into.
embedding_dim = 64

In [28]:
# Number of units in the GRU layer.
# NOTE(review): 1026 is an unusual width — 1024 may have been intended; confirm before changing.
rnn_neurons = 1026

In [29]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [30]:
def sparse_cat_loss(y_true, y_preds):
    """Sparse categorical cross-entropy that treats ``y_preds`` as raw logits.

    Thin wrapper that pins ``from_logits=True`` so the function can be handed
    to ``model.compile`` as a two-argument loss.
    """
    loss = sparse_categorical_crossentropy(y_true, y_preds, from_logits=True)
    return loss

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [32]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the Embedding -> GRU -> Dense character model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding's
            input dimension and as the width of the final Dense layer.
        embed_dim: Size of each character embedding vector.
        rnn_neurons: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the model input (the GRU is
            created with ``stateful=True``).

    Returns:
        A ``Sequential`` model compiled with the Adam optimizer and
        ``sparse_cat_loss``.
    """
    layers = [
        # Batch dimension is fixed; the sequence dimension is left open (None).
        Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        # No activation here: the loss is configured with from_logits=True.
        Dense(vocab_size),
    ]

    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [33]:
# Instantiate the network with the hyperparameters defined in the cells above.
model = create_model(
    vocab_size=vocab_size,
    embed_dim=embedding_dim,
    rnn_neurons=rnn_neurons,
    batch_size=batch_size,
)

In [34]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for two epochs on the shuffled, batched dataset, showing progress output.
model.fit(dataset, epochs=2, verbose=1)

Epoch 1/2
