# Setup

In [2]:
import tensorflow as tf
import numpy as np
import os
import time

tf.enable_eager_execution()

## Download Shakespeare dataset

In [4]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


## Read the data

In [8]:
text = open(path_to_file).read()
print("Length of text: {} characters".format(len(text)))
print(text[:250])
vocab = sorted(set(text))
print("{} unique characters".format(len(vocab)))

Length of text: 1115394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

65 unique characters


# Process the text

## Vectorize the text

In [16]:
# Mapping from unique characters to indices
char2idx = { uniq_ch: i for i, uniq_ch in enumerate(vocab)}
idx2char = np.array(vocab)
print(char2idx['c'])
print(idx2char[41])

text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[:10])
print(text[:10])

41
c
[18 47 56 57 58  1 15 47 58 47]
First Citi


In [27]:
print("{")
for char, _ in zip(char2idx, range(20)):
    print("    {:4s}: {:3d},".format(repr(char), char2idx[char]))
print("    ...\n}")

{
    'f' :  44,
    'h' :  46,
    'C' :  15,
    'l' :  50,
    ' ' :   1,
    'i' :  47,
    'z' :  64,
    '\n':   0,
    'N' :  26,
    'u' :  59,
    'r' :  56,
    '!' :   2,
    'Q' :  29,
    'c' :  41,
    '&' :   4,
    'y' :  63,
    'P' :  28,
    'Y' :  37,
    'F' :  18,
    '.' :   8,
    ...
}


In [31]:
print("{} --- char2idx ---> {}".format(repr(text[:13]), text_as_int[:13]))

'First Citizen' --- char2idx ---> [18 47 56 57 58  1 15 47 58 47 64 43 52]


## Prediction tast
Given a character, or sequence of characters, what is the most probable next character?
## Create training examples and targets
For each input sequence, corresponding targets contain the same length of text, except shifted one character to the right.

In [123]:
# Convert text vector into a stream of character indices
# Maximum length sentence for a single input in characters
seq_len = 100
ex_per_epoch = len(text) // seq_len
print(ex_per_epoch)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(13):
    print(idx2char[i.numpy()])

11153
F
i
r
s
t
 
C
i
t
i
z
e
n


In [69]:
# use batch method to convert individual characters to sequences of 'seq_len'
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

for item in sequences.take(5):
    print("***\n{}\n***\n".format(''.join(idx2char[item])))

***
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
***

***
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k
***

***
now Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us ki
***

***
ll him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be d
***

***
one: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citi
***



In [82]:
# Duplicate and shift each sequence to form the input + target
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

for y, t in dataset.take(1):
    print("y: {}".format(repr(''.join(idx2char[y.numpy()]))))
    print("t: {}".format(repr(''.join(idx2char[t.numpy()]))))

y: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
t: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [118]:
# Observe how the data will be processed by the model
first_5 = [(y[:5], t[:5]) for y, t in dataset.take(1)]
for i, (y, t) in enumerate(zip(first_5[0][0], first_5[0][1])):
    print("Step {:4d}".format(i))
    print("    input: {} ({:s})".format(y, repr(idx2char[y])))
    print("    expected output: {} ({:s})".format(t, repr(idx2char[t])))

Step    0
    input: 18 ('F')
    expected output: 47 ('i')
Step    1
    input: 47 ('i')
    expected output: 56 ('r')
Step    2
    input: 56 ('r')
    expected output: 57 ('s')
Step    3
    input: 57 ('s')
    expected output: 58 ('t')
Step    4
    input: 58 ('t')
    expected output: 1 (' ')


## Create training batches
Shuffle the data and pack it into batches

In [127]:
# batch size
BATCH_SIZE = 64
steps_per_epoch = ex_per_epoch // BATCH_SIZE
print(steps_per_epoch)

# TF doesn't shuffle the entire sequence, just the amount that fits in the buffer
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

174
<BatchDataset shapes: ((64, 64, 100), (64, 64, 100)), types: (tf.int32, tf.int32)>
