In [1]:
import tensorflow as tf
import numpy as np
import os
import time

from tensorflow.keras.layers.experimental import preprocessing

In [2]:
path = tf.keras.utils.get_file(
    'shakespeare.txt', 
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
# Read & Decode for py2 compat
txt = open(path, 'rb').read().decode(encoding = 'utf-8')

# Length of txt is the number of characters in it
print(f'Length of text: {len(txt)} characters')

Length of text: 1115394 characters


In [6]:
# Print the first 250 characters in txt
print(txt[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [11]:
# Unique Characters in the File
vocab = sorted(set(txt))
print(f'{len(vocab)} unique characters')

65 unique characters


In [9]:
example_txts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_txts, input_encoding = 'UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [21]:
ids_from_chars = preprocessing.StringLookup(
    vocabulary = list(vocab), 
    mask_token = None
)

In [22]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

In [25]:
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [26]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [27]:
tf.strings.reduce_join(chars, axis = -1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [28]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis = -1)

In [30]:
# Prediction Task
all_ids = ids_from_chars(tf.strings.unicode_split(txt, 'UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [31]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [32]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

F
i
r
s
t
 
C
i
t
i


In [34]:
seq_length = 100
examples_per_epoch = len(txt)//(seq_length + 1)

In [35]:
sequences = ids_dataset.batch(seq_length + 1, drop_remainder = True)

for seq in sequences.take(1):
    print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [36]:
def split_input_target(sequence):
    input_txt = sequence[:-1]
    target_txt = sequence[1:]
    
    return input_txt, target_txt

In [37]:
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [38]:
dataset = sequences.map(split_input_target)

In [39]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [40]:
# Create Training Batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder = True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [41]:
vocab_size = len(vocab)
embedding_dim = 256

rnn_units = 1024

In [42]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            rnn_units,
            return_sequences = True,
            return_state = True
        )
        self.dense = tf.keras.layers.Dense(vocab_size)
    
    def call(self, inputs, states = None, return_state = False, training = False):
        x = inputs
        x = self.embedding(x, training = training)
        
        if states is None:
            states = self.gru.get_initial_state(x)
        else:
            return x

In [44]:
model = MyModel(
    vocab_size = len(ids_from_chars.get_vocabulary()),
    embedding_dim = embedding_dim,
    rnn_units = rnn_units
)

In [46]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

AttributeError: 'NoneType' object has no attribute 'shape'

In [47]:
model.summary()

Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  16896     
_________________________________________________________________


ValueError: You tried to call `count_params` on gru, but the layer isn't built. You can build it manually via: `gru.build(batch_input_shape)`.