In [12]:
import pandas as pd
import numpy as np 
## Reading and processing text
# Decode explicitly instead of relying on the platform's default encoding,
# which differs between operating systems and can mis-decode or reject the file.
# NOTE(review): assumes the file is UTF-8 (the usual encoding for Project
# Gutenberg "-0.txt" downloads) -- confirm if a different copy is used.
with open('1268-0.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()
    

In [13]:
# Keep only the span between the title line and the Project Gutenberg footer.
start_marker = 'THE MYSTERIOUS ISLAND'
end_marker = 'End of the Project Gutenberg'
start_indx = text.find(start_marker)
end_indx = text.find(end_marker)
# str.find returns -1 when the marker is absent; slicing with -1 would silently
# yield a wrong excerpt instead of failing. This also triggers if this cell is
# re-run on an already-trimmed `text` (the end marker has been cut off), in
# which case the file-loading cell must be run again first.
if start_indx == -1 or end_indx == -1:
    raise ValueError(
        'Could not locate the start/end markers in `text` '
        '(start_indx={}, end_indx={}); reload the raw file before trimming.'
        .format(start_indx, end_indx))
text = text[start_indx:end_indx]
# The set of distinct characters forms the model vocabulary.
char_set = set(text)
print('Total Length:', len(text))

Total Length: 1112350


In [14]:
# Vocabulary size = number of distinct characters in the trimmed text.
num_unique_chars = len(char_set)
print('Unique Characters:', num_unique_chars)

Unique Characters: 80


In [15]:
# Neural-network layers operate on numbers rather than strings, so every
# character is mapped to an integer id before it is fed to the model.
chars_sorted = sorted(char_set)
# character -> id lookup; ids follow the sorted character order.
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# id -> character lookup; indexing this array with ids decodes them.
char_array = np.array(chars_sorted)

# Encode the whole text as one flat int32 vector of character ids.
text_encoded = np.array(list(map(char2int.__getitem__, text)), dtype=np.int32)
print('Text encoded shape:', text_encoded.shape)

Text encoded shape: (1112350,)


In [16]:
# Sanity check of both directions: characters -> ids, then ids -> characters.
first_codes = text_encoded[:15]
print(text[:15], '== Encoding ==>', first_codes)
next_codes = text_encoded[15:21]
decoded_chars = char_array[next_codes]
print(next_codes, '== Reverse ==>', ' '.join(decoded_chars))

THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> I S L A N D


In [17]:
# Wrap the encoded text in a tf.data pipeline: one element per character id.
import tensorflow as tf

ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Preview the first five elements together with the character each id decodes to.
for ex in ds_text_encoded.take(5):
    char_id = ex.numpy()
    print('{} -> {}'.format(char_id, char_array[char_id]))

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [18]:
# Each training example uses seq_length input characters; a chunk holds one
# extra character so the target can be the input shifted by one position.
seq_length = 40
chunk_size = 1 + seq_length
# drop_remainder=True discards a final chunk shorter than chunk_size.
ds_chunks = ds_text_encoded.batch(batch_size=chunk_size, drop_remainder=True)
## define the function for splitting x & y
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]
# Convert every chunk into an (input sequence, target sequence) pair.
ds_sequences = ds_chunks.map(map_func=split_input_target)

In [19]:
# Decode the first two (input, target) pairs back to text for a visual check.
for example in ds_sequences.take(2):
    input_ids, target_ids = example
    input_str = ''.join(char_array[input_ids.numpy()])
    target_str = ''.join(char_array[target_ids.numpy()])
    print(' Input (x): ', repr(input_str))
    print('Target (y): ', repr(target_str))
    print()

 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

 Input (x):  ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
Target (y):  'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'



In [20]:
# Number of sequences per training batch, and size of the shuffle buffer.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
# Shuffle individual sequences first, then group them into batches.
ds_shuffled = ds_sequences.shuffle(buffer_size=BUFFER_SIZE)
ds = ds_shuffled.batch(batch_size=BATCH_SIZE)

In [22]:
#Building a character-level RNN model
def build_model(vocab_size, embedding_dim, rnn_units):
    """Assemble an Embedding -> LSTM -> Dense sequential model.

    Args:
        vocab_size: number of distinct character ids; used as the embedding
            input size and as the number of units in the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the LSTM layer.

    Returns:
        An uncompiled tf.keras.Sequential model. The final Dense layer has no
        activation, so its outputs are raw scores per character id.
    """
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences=True emits an output at every time step, not only the last.
    lstm_layer = tf.keras.layers.LSTM(rnn_units, return_sequences=True)
    output_layer = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding_layer, lstm_layer, output_layer])
## Setting the training parameters

# One output class per distinct character.
charset_size = len(char_array)
embedding_dim = 256
rnn_units = 512

# Fix TensorFlow's global seed before the model's weights are created.
tf.random.set_seed(1)

model = build_model(charset_size, embedding_dim, rnn_units)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         20480     
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The output layer produces raw scores (no softmax), hence from_logits=True;
# the sparse variant takes integer class ids as targets.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)
model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20