In [1]:
import os,time,sys,subprocess,warnings
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
import tensorflow as tf
from pathlib import Path
from tensorflow import keras
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
pio.templates.default = "plotly_dark"
tf.get_logger().setLevel("ERROR")

# Creating the Training Dataset

In [2]:
shake = keras.utils.get_file("shakes.txt",'https://homl.info/shakespeare')
with open(shake) as f:
    shakes = f.read()

In [3]:
print(shakes[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


- encode every character to integer
  - one option to create custom preprocessing layer
  - second is to use keras tokenizer class

In [4]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakes)

In [5]:
tokenizer.texts_to_sequences(["Hello this is mario"])

[[7, 2, 12, 12, 4, 1, 3, 7, 6, 8, 1, 6, 8, 1, 15, 5, 9, 6, 4]]

In [6]:
print("".join(tokenizer.sequences_to_texts(np.random.randint(1,50,size=[1,50]))))

z 
 f c ? 3 g e r a $ v a u j , ' m ! 
 3 ' t y e ? q y l w ' v ? e l ' , y   l


In [7]:
max_id = len(tokenizer.word_index)

In [8]:
total_size = tokenizer.document_count

In [9]:
encoded = np.array(tokenizer.texts_to_sequences(shakes))-1
encoded[:5]

array([[19],
       [ 5],
       [ 8],
       [ 7],
       [ 2]])

In [10]:
train_size = total_size * 90//100
valid_size = total_size - train_size
print(f"train size = {train_size} and valid size = {valid_size}")

train size = 1003854 and valid size = 111540


In [11]:
n_steps = 100
batch_size = 32
shuffle_size = int(1e4)

In [12]:
tf.config.list_logical_devices()
strategy = tf.distribute.OneDeviceStrategy(device="/device:GPU:0")

In [13]:
def create_ds(slices,seq_len=100,batch_size=32,shuffle_size=10000,shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.window(seq_len+1,shift=1,drop_remainder=True).flat_map(lambda x: x.batch(seq_len+1))
    if shuffle:
        ds = ds.shuffle(shuffle_size)
    ds = ds.batch(batch_size)
    ds = ds.map(lambda x: (x[:,:-1],x[:,1:]))
    return ds.prefetch(1)

In [14]:
train_ds = create_ds(encoded[:train_size],shuffle=True)
valid_ds = create_ds(encoded[train_size:train_size+80_000])
test_ds = create_ds(encoded[train_size+80_000:])

In [15]:
# with strategy.scope():
#     model = keras.Sequential()
#     model.add(keras.layers.Embedding(input_dim=max_id,output_dim=16))
#     model.add(keras.layers.GRU(128,return_sequences=True))
#     model.add(keras.layers.Dense(max_id,"softmax"))
#     model.compile(
#         loss="sparse_categorical_crossentropy",
#         optimizer="nadam",
#         metrics=["accuracy"]
#     )
# checkp = keras.callbacks.ModelCheckpoint("shakes_model",monitor="val_accuracy",save_best_only=True)
# model.fit(train_ds,validation_data=valid_ds,epochs=10,callbacks=[checkp])
'''This will take about an hour so skipping and downloading the model'''

'This will take about an hour so skipping and downloading the model'

In [16]:
url = "https://github.com/ageron/data/raw/main/shakespeare_model.tgz"
path = keras.utils.get_file("shakespeare_model.tgz",origin=url,extract=True)
model_path = Path(path).with_name("shakespeare_model")
shakes_model = keras.models.load_model(model_path)

In [17]:
phrase = "To be or not to b"
tokenizer.sequences_to_texts([[tf.argmax(shakes_model.predict([phrase])[0,-1]).numpy()+1]])



['e']

In [19]:
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]

In [18]:
def generate_poem(phrase,chars=50,temperature=1):
    new_phrase = phrase
    for i in range(chars):
        new_letter = tokenizer.sequences_to_texts([[tf.argmax(shakes_model.predict([new_phrase])[0,-1]).numpy()+1]])
        new_phrase = new_phrase + new_letter