In [22]:
import os
import time

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM

In [2]:
file = '1400-0.txt'
url = f'https://www.gutenberg.org/files/1400/{file}'

In [3]:
path = tf.keras.utils.get_file(file, url)

In [4]:
text = open(path).read()
text = text[1797:] # strip front matter
print(f'length of text: {len(text)} chars:\n{text[:300]}')

length of text: 1012598 chars:

My father’s family name being Pirrip, and my Christian name Philip, my
infant tongue could make of both names nothing longer or more explicit
than Pip. So, I called myself Pip, and came to be called Pip.

I give Pirrip as my father’s family name, on the authority of his
tombstone and my sister,—Mrs


In [5]:
chars = sorted(set(text))
print(f'{len(chars)} unique characters')

92 unique characters


In [6]:
# dict of unique chars to index vals
char_to_index = {c: i for i, c in enumerate(chars)}

In [7]:
index_to_char = np.array(chars)

In [8]:
text_as_int = np.array([char_to_index[char] for char in text])

In [9]:
print(text_as_int[:300])

[ 1 42 83  2 64 59 78 66 63 76 89 77  2 64 59 71 67 70 83  2 72 59 71 63
  2 60 63 67 72 65  2 45 67 76 76 67 74 12  2 59 72 62  2 71 83  2 32 66
 76 67 77 78 67 59 72  2 72 59 71 63  2 45 66 67 70 67 74 12  2 71 83  1
 67 72 64 59 72 78  2 78 73 72 65 79 63  2 61 73 79 70 62  2 71 59 69 63
  2 73 64  2 60 73 78 66  2 72 59 71 63 77  2 72 73 78 66 67 72 65  2 70
 73 72 65 63 76  2 73 76  2 71 73 76 63  2 63 82 74 70 67 61 67 78  1 78
 66 59 72  2 45 67 74 14  2 48 73 12  2 38  2 61 59 70 70 63 62  2 71 83
 77 63 70 64  2 45 67 74 12  2 59 72 62  2 61 59 71 63  2 78 73  2 60 63
  2 61 59 70 70 63 62  2 45 67 74 14  1  1 38  2 65 67 80 63  2 45 67 76
 76 67 74  2 59 77  2 71 83  2 64 59 78 66 63 76 89 77  2 64 59 71 67 70
 83  2 72 59 71 63 12  2 73 72  2 78 66 63  2 59 79 78 66 73 76 67 78 83
  2 73 64  2 66 67 77  1 78 73 71 60 77 78 73 72 63  2 59 72 62  2 71 83
  2 77 67 77 78 63 76 12 87 42 76 77]


In [10]:
# Max allowable seq length (in chars)
seq_len = 100
examples_per_epoch = len(text) // seq_len

In [11]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Sanity check
for char in char_dataset.take(10):
    print(f'{char}: {index_to_char[char.numpy()]}')

1: 

42: M
83: y
2:  
64: f
59: a
78: t
66: h
63: e
76: r


In [12]:
seqs = char_dataset.batch(seq_len + 1, drop_remainder=True)

In [13]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [14]:
dataset = seqs.map(split_input_target)

In [15]:
for input_example, target_example in dataset.take(1):
    print('Input data:', 
          repr(''.join(index_to_char[input_example.numpy()])))
    print('Target data:',
          repr(''.join(index_to_char[target_example.numpy()])))

Input data: '\nMy father’s family name being Pirrip, and my Christian name Philip, my\ninfant tongue could make of '
Target data: 'My father’s family name being Pirrip, and my Christian name Philip, my\ninfant tongue could make of b'


In [16]:
for char, (input_index, target_index) in enumerate(
        zip(input_example[:5], target_example[:5])):
    print(f'Step {char:4d}')
    print(f' input: {input_index} ({repr(index_to_char[input_index])})')
    print(f' expected output: {target_index} '
          f'({repr(index_to_char[target_index])})')

Step    0
 input: 1 ('\n')
 expected output: 42 ('M')
Step    1
 input: 42 ('M')
 expected output: 83 ('y')
Step    2
 input: 83 ('y')
 expected output: 2 (' ')
Step    3
 input: 2 (' ')
 expected output: 64 ('f')
Step    4
 input: 64 ('f')
 expected output: 59 ('a')


In [17]:
BATCH = 64
STEPS_PER_EPOCH = examples_per_epoch // BATCH
BUFFER = 10000

In [18]:
dataset = dataset.shuffle(BUFFER).batch(BATCH, drop_remainder=True)
dataset = dataset.repeat()
dataset

<RepeatDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [19]:
VOCAB_LEN = len(chars)
EMBED_DIM = 256
RNN_UNITS = 1024
RNN_CELL = GRU

In [20]:
if tf.config.list_physical_devices('GPU'):
    rnn = tf.compat.v1.keras.layers.CuDNNGRU
    print('Using GPU')
else:
    import functools
    rnn = functools.partial(tf.keras.layers.GRU, 
                            recurrent_activation='sigmoid')
    print('Using CPU')

Using CPU


In [27]:
def build_model(chars, embedding_dim, rnn_units, batch_size):
    model = Sequential(
        [Embedding(
            chars, embedding_dim, batch_input_shape=[batch_size, None]),
         rnn(rnn_units, 
             return_sequences=True, 
             recurrent_initializer='glorot_uniform', 
             stateful=True),
         tf.keras.layers.Dense(len(chars))])
    return model

In [28]:
model = build_model(chars, EMBED_DIM, RNN_UNITS, BATCH)

TypeError: Dimension value must be integer or None or have an __index__ method, got ListWrapper(['\t', '\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ê', 'ô', '—', '‘', '’', '“', '”'])

In [26]:
len(chars)

92