UNIVERSIDAD AUTÓNOMA DE CHIHUAHUA
Facultad de Ingeniería
Shakespeare
Alejandro Arturo González Flores
8CC2
Matrícula: 348552
Docente: Jesus Roberto López Santillán
Data Science
Chihuahua, Chih., 28 de noviembre de 2023


In [1]:
import tensorflow as tf
from tensorflow import keras

In [2]:
# Fetch the Shakespeare corpus through Keras' download helper and read it
# fully into memory as a single string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same notebook could read different text
# (or fail) on another machine.
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

In [3]:
# Character-level tokenizer: each distinct character of the corpus gets its
# own integer id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts=[shakespeare_text])

In [4]:
# Round-trip sanity check: encode a word, then decode the same ids back.
print(tokenizer.texts_to_sequences(["First"]))
print(tokenizer.sequences_to_texts([[20,6,9,8,3]]))

# Number of distinct characters seen by the tokenizer.
max_id = len(tokenizer.word_index)
print(max_id)

# Total number of characters in the corpus (sum of per-character counts).
dataset_size = sum(tokenizer.word_counts.values())
print(type(dataset_size))
print(dataset_size)

[[20, 6, 9, 8, 3]]
['f i r s t']
39
<class 'int'>
1115394


In [5]:
import numpy as np

In [6]:
# Encode the whole corpus as one sequence of character ids and shift the ids
# down by one so they start at 0 (they are shifted back with +1 when decoding).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1
print(encoded)

[19  5  8 ... 20 26 10]


In [7]:
# Train on the first 90% of the characters (integer arithmetic keeps
# train_size an int, usable as a slice bound).
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [8]:
n_steps = 100
# One extra character per window: the target sequence is the input sequence
# shifted one character ahead.
window_length = n_steps + 1
# Sliding windows that advance one character at a time; incomplete trailing
# windows are dropped.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [9]:
# Batching each window by its full length turns it into a single tensor of
# window_length ids; flat_map then flattens the result into one dataset.
dataset = dataset.flat_map(
    lambda win: win.batch(window_length)
)

In [10]:
batch_size = 32
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
# Split every window into inputs (all but the last character) and targets
# (all but the first character).
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [11]:
# One-hot encode the inputs only; the targets stay as integer ids, as expected
# by the sparse categorical cross-entropy loss used for training.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
).prefetch(1)

In [12]:
# Disabled training code: the whole cell is a bare string literal, so nothing
# in it executes (the notebook merely echoes the string). If enabled, it would
# build a two-layer GRU character model, train it for 20 epochs on `dataset`
# and save it to 'my_model.h5'.
'''
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None,max_id],dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     #dropout=0.2, recurrent_dropout=0.2),
                     dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history=model.fit(dataset, epochs=20)
model.save('my_model.h5')
'''

'\nmodel = keras.models.Sequential([\n    keras.layers.GRU(128, return_sequences=True, input_shape=[None,max_id],dropout=0.2),\n    keras.layers.GRU(128, return_sequences=True,\n                     #dropout=0.2, recurrent_dropout=0.2),\n                     dropout=0.2),\n    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))\n])\nmodel.compile(loss="sparse_categorical_crossentropy", optimizer="adam")\nhistory=model.fit(dataset, epochs=20)\nmodel.save(\'my_model.h5\')\n'

In [13]:
# Load the saved character model from disk and print its architecture.
model = keras.models.load_model('my_model.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, None, 128)         64896     
                                                                 
 gru_1 (GRU)                 (None, None, 128)         99072     
                                                                 
 time_distributed (TimeDist  (None, None, 39)          5031      
 ributed)                                                        
                                                                 
Total params: 168999 (660.15 KB)
Trainable params: 168999 (660.15 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
def preprocess(texts):
  """One-hot encode a list of strings for the character-level model.

  Each text is converted to character ids with the module-level `tokenizer`,
  the ids are shifted down by one, and the result is one-hot encoded with
  depth `max_id`.

  Args:
    texts: list of strings to encode.

  Returns:
    The tensor produced by `tf.one_hot` for the shifted character ids.
  """
  char_ids = np.array(tokenizer.texts_to_sequences(texts))
  zero_based_ids = char_ids - 1
  return tf.one_hot(zero_based_ids, max_id)

In [15]:
# Ask the model for the most likely character at every position of the prompt.
X_new = preprocess(["How are yo"])
Y_pred_prob = model.predict(X_new)
Y_pred_classes = np.argmax(Y_pred_prob, axis=-1)

# +1 restores the tokenizer's ids; [0] selects the first (only) text and [-1]
# its last character, i.e. the model's guess for what follows the prompt.
predicted_text = tokenizer.sequences_to_texts(Y_pred_classes + 1)[0][-1]
# Bare last expression so the notebook actually displays the prediction
# (previously it was computed and silently discarded).
predicted_text



In [16]:
def next_char(text, temperature=1):
  """Sample the character that follows `text` according to the global `model`.

  The model's probabilities for the last position are converted to
  log-probabilities and divided by `temperature` before sampling, so values
  below 1 make the choice more conservative and values above 1 more random.

  Args:
    text: prompt string; it is encoded with `preprocess`.
    temperature: positive scaling factor applied to the log-probabilities.

  Returns:
    The sampled character, decoded with the module-level `tokenizer`.

  Raises:
    ValueError: if `temperature` is not strictly positive.
  """
  # Guard the division below: a zero temperature would divide by zero and a
  # negative one would invert the distribution.
  if temperature <= 0:
    raise ValueError(f"temperature must be > 0, got {temperature!r}")
  X_new = preprocess([text])
  # verbose=0: this function is called once per generated character, so the
  # default progress output would flood the notebook.
  y_proba = model.predict(X_new, verbose=0)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  # +1 maps the sampled 0-based class back to the tokenizer's ids.
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [17]:
def complete_text(text, n_chars=50, temperature=1):
  """Extend `text` by `n_chars` characters sampled one at a time.

  Every new character is drawn with `next_char` from the text generated so
  far, so each sample is conditioned on all previous ones.

  Args:
    text: seed string to continue.
    n_chars: number of characters to append.
    temperature: sampling temperature forwarded to `next_char`.

  Returns:
    The seed text followed by the generated characters.
  """
  for _step in range(n_chars):
    sampled = next_char(text, temperature)
    text = text + sampled
  return text

In [18]:
# Generate from a one-letter seed at increasing temperatures: low values stay
# close to the most likely characters, high values sample more freely.
for seed_text, sample_temperature in (("t", 0.2), ("w", 1), ("w", 2)):
  print(complete_text(seed_text, temperature=sample_temperature))

the men and will i see
the more than she is an any 
why,
that's the men or knaves when he lives to you;
whise vyrota! ond
bolk, ser?,
at if your.
geccala f


In [19]:
tf.random.set_seed(42)
batch_size = 32

# Cut the training ids into batch_size consecutive chunks. Row i of every
# batch is drawn from chunk i, and windows advance by n_steps, so each row of
# a batch continues where the same row of the previous batch stopped.
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
  part_windows = (
      tf.data.Dataset.from_tensor_slices(encoded_part)
      .window(window_length, shift=n_steps, drop_remainder=True)
      .flat_map(lambda window: window.batch(window_length))
  )
  datasets.append(part_windows)

# Zip the per-chunk datasets and stack one window from each into a batch.
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
# Inputs are all but the last character, targets all but the first.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)
dataset = dataset.prefetch(1)

In [20]:
# Two stacked stateful GRU layers followed by a per-time-step softmax over the
# max_id characters.
stateful_model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
        # The batch dimension is fixed to batch_size for the stateful layers.
        batch_input_shape=[batch_size, None, max_id],
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
        stateful=True,
        dropout=0.2,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    ),
])

In [21]:
class ResetStatesCallback(keras.callbacks.Callback):
  """Keras callback that resets the model's states when an epoch begins.

  Used with the stateful model so that every epoch starts from a clean state.
  """

  def on_epoch_begin(self, epoch, logs=None):
    """Reset the attached model's states.

    Args:
      epoch: index of the epoch that is starting (unused).
      logs: optional metrics dict supplied by Keras (unused). It defaults to
        None to match the base `Callback.on_epoch_begin` signature, so the
        hook can also be invoked without it.
    """
    self.model.reset_states()

In [22]:
# Train the stateful model for 50 epochs, resetting its states at the start of
# each one, then persist it to disk.
stateful_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
stateful_model.fit(
    dataset,
    epochs=50,
    callbacks=[ResetStatesCallback()],
)
stateful_model.save('my_stateful_model.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [23]:
# Same layer stack as the stateful model, but stateless, without dropout and
# with no fixed batch size.
stateless_model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
    ),
    keras.layers.GRU(
        128,
        return_sequences=True,
    ),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    ),
])

In [24]:
# Build the stateless model so its weights exist before they are overwritten.
stateless_model.build(tf.TensorShape([None, None, max_id]))
# Copy the weights of the stateful model trained above. The previous code
# copied them from `model` (the network loaded from 'my_model.h5'), which
# threw away the 50 epochs of stateful training.
stateless_model.set_weights(stateful_model.get_weights())
stateless_model.save('my_stateless_model.h5')
# NOTE(review): next_char() reads the global `model`; rebind
# `model = stateless_model` if text generation should use these weights.



In [25]:
# Fix TensorFlow's random seed so the sampled continuation is repeatable.
tf.random.set_seed(42)
generated_text = complete_text("t")
print(generated_text)

thing idding hanour.

grumio:
will you go? woy, sir
