In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# Este notebook require Tensorflow 2.X

In [3]:
print(tf.__version__)

2.1.0


### Descargar Extracto der Shakespeare 

In [4]:
jokes = ""

joke_url = "https://raw.githubusercontent.com/Daronspence/One-Liners/master/jokes.txt"
filepath = keras.utils.get_file("test.txt", joke_url)
with open(filepath) as f:
    joke_text = f.read()

jokes += joke_text

joke_url = "http://www.textfiles.com/humor/TAGLINES/taglines.txt"
filepath = keras.utils.get_file("test.txt", joke_url)
with open(filepath) as f:
    joke_text = f.read()
    
jokes += joke_text

joke_url = "https://raw.githubusercontent.com/simonaco/25daysofserverless/master/jokes.txt"
filepath = keras.utils.get_file("test.txt", joke_url)
with open(filepath) as f:
    joke_text = f.read()
    
jokes += joke_text

### Set de caracteres en la obra (FYI):

In [5]:
"".join(sorted(set(jokes.lower())))

'\n !"$%&\',-.01234589:;=?abcdefghijklmnopqrstuvwxyz'

In [6]:
# remove caracteres extranos.

jokes = jokes.replace("\n", " ")
jokes = jokes.replace("$", " ")
jokes = jokes.replace('"', " ")
jokes = jokes.replace('%', " ")
jokes = jokes.replace('&', " ")
jokes = jokes.replace("'", " ")
jokes = jokes.replace("-", " ")
jokes = jokes.replace(";", "")
jokes = jokes.replace(":", "")
jokes = jokes.replace("=", "")
jokes = jokes.replace(",", "")
                      
"".join(sorted(set(jokes.lower())))

' !.01234589?abcdefghijklmnopqrstuvwxyz'

### Tokenizacion del texto con Keras

In [7]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(jokes)

# probamos la palabra
tokenizer.texts_to_sequences(["speak"])

[[8, 21, 2, 5, 24]]

In [8]:
# podemos converir esa secuencia en texto:
tokenizer.sequences_to_texts([[8, 21, 2, 5, 24]])

['s p e a k']

In [9]:
# numero de caracteres distintos
max_id = len(tokenizer.word_index) 

# cantidad total del caracteres
dataset_size = tokenizer.document_count

print("Elementos distintos",max_id)
print("Cantidad de elementos",dataset_size)

Elementos distintos 38
Cantidad de elementos 142305


### Creamos el Train-Set usando las Funciones de NLP (ver Notebook #1)

In [10]:
# se convierte TODO el texto en secuencias.
[encoded] = np.array(tokenizer.texts_to_sequences([jokes])) - 1

# se genera un train-set del 90% de las secuencias
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [11]:
# se define que las cadenas son de 100 caracteres con 1 caracter de shift
n_steps = 100
window_length = n_steps + 1 # tamano de la ventana
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

# se generan las cadenas planas
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [12]:
# generamos el x_train y y_train usando el mismo codigo del notebook #1
np.random.seed(42)
tf.random.set_seed(42)
batch_size = 32

encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []

for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)

dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda x_train, y_train: (tf.one_hot(x_train, depth=max_id), y_train))
dataset = dataset.prefetch(1)

# se revisan los tamanos de los tensores generados
for x, y in dataset.take(1):
    print(x.shape, y.shape)

(32, 100, 38) (32, 100)


In [13]:
class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

### Modelo Stateful RNN-GRU y Entrenamiento

In [14]:
# arquitecturta
model = keras.models.Sequential([
    keras.layers.GRU(1024, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(512, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,activation="softmax"))
])

# compilacion y entrenamiento
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
steps_per_epoch = train_size // batch_size // n_steps
history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=400,
                    callbacks=[ResetStatesCallback()])

Train for 40 steps
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Ep

Epoch 195/400
Epoch 196/400
Epoch 197/400
Epoch 198/400
Epoch 199/400
Epoch 200/400
Epoch 201/400
Epoch 202/400
Epoch 203/400
Epoch 204/400
Epoch 205/400
Epoch 206/400
Epoch 207/400
Epoch 208/400
Epoch 209/400
Epoch 210/400
Epoch 211/400
Epoch 212/400
Epoch 213/400
Epoch 214/400
Epoch 215/400
Epoch 216/400
Epoch 217/400
Epoch 218/400
Epoch 219/400
Epoch 220/400
Epoch 221/400
Epoch 222/400
Epoch 223/400
Epoch 224/400
Epoch 225/400
Epoch 226/400
Epoch 227/400
Epoch 228/400
Epoch 229/400
Epoch 230/400
Epoch 231/400
Epoch 232/400
Epoch 233/400
Epoch 234/400
Epoch 235/400
Epoch 236/400
Epoch 237/400
Epoch 238/400
Epoch 239/400
Epoch 240/400
Epoch 241/400
Epoch 242/400
Epoch 243/400
Epoch 244/400
Epoch 245/400
Epoch 246/400
Epoch 247/400
Epoch 248/400
Epoch 249/400
Epoch 250/400
Epoch 251/400
Epoch 252/400
Epoch 253/400
Epoch 254/400
Epoch 255/400
Epoch 256/400
Epoch 257/400
Epoch 258/400
Epoch 259/400
Epoch 260/400
Epoch 261/400
Epoch 262/400
Epoch 263/400
Epoch 264/400
Epoch 265/400
Epoch 

Epoch 387/400
Epoch 388/400
Epoch 389/400
Epoch 390/400
Epoch 391/400
Epoch 392/400
Epoch 393/400
Epoch 394/400
Epoch 395/400
Epoch 396/400
Epoch 397/400
Epoch 398/400
Epoch 399/400
Epoch 400/400


In [15]:
# esto es un super hack!
stateless_model = keras.models.Sequential([
    keras.layers.GRU(1024, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(512, return_sequences=True),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

stateless_model.build(tf.TensorShape([None, None, max_id]))
stateless_model.set_weights(model.get_weights())

In [16]:
model = stateless_model

# Guardar el Modelo
model.save('jokes.h5')