# Dreaming Daily Mail

#### by Frank Piva (code adapted from Aurélien Geron)
#### November 21, 2020

## Introduction
This notebook demonstrates the construction of a dreaming, recurrent neural net (RNN) from Daily Mail articles. Much of the code is taken from Aurélien Geron's book, "Hands-On Machine Learning".

## Environment Initialization

In [1]:
from tensorflow import keras

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import sklearn
import string
import sys
import tensorflow as tf
import tensorflow_datasets as tfds

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# to plot pretty figures
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

## Loading the Data

In [2]:
data, info = tfds.load(name='cnn_dailymail', with_info=True)

# Prefetching overlaps the preprocessing and model execution of a training
# step. While the model executing training step s, the input pipeline is
# reading data for step s+1.
# The number of elements to prefetch should be equal to (or possibly greater
# than) the number of batches consumed by a single training step. You could 
# either manually tune this value, or set it to tf.data.experimental.AUTOTUNE 
# which will prompt the tf.data runtime to tune the value dynamically at runtime.

data_train = data['train'].prefetch(tf.data.experimental.AUTOTUNE)
X_train = [record['article'].numpy() for record in data_train]

# find the article with the longest text
index, maximum = 0, 0
for element, article in enumerate(X_train):
    article_length = len(article.decode('utf-8'))
    if article_length > maximum:
        maximum = article_length
        index = element    

print(f"[INFO] len(X_train): {len(X_train)}")
print(f"[INFO] index: {index}")
print(f"[INFO] maximum: {maximum}")

article_text = X_train[index]
# print(f"[DEBUG] type(article_text): {type(article_text)}")
article_text = article_text.decode('utf-8')
# print(f"[DEBUG] type(article_text): {type(article_text)}")
print(f"[INFO] article_text: {article_text}")

[INFO] len(X_train): 287113
[INFO] index: 51997
[INFO] maximum: 15925
[INFO] article_text: By . Louise Boyle . PUBLISHED: . 09:45 EST, 28 February 2014 . | . UPDATED: . 13:31 EST, 5 March 2014 . A chemical used to make yoga mats has been found in almost 500 food items, according to a study released this week. The compound azodicarbonamide (ADA) is typically used in bread products to bleach flour and make the dough stronger while keeping it light and spongy. The additive is also an ingredient of shoe rubber and attic insulation, according to research released on Thursday by The Environmental Working Group (EWG). ADA features in foods made by at least 130 American brands including Jimmy Dean, Sara Lee, Pillsbury, White Castle, Marie Calledar's and Country Hearth. Scroll down for video . What's on your plate? A chemical called azodicarbonamide (ADA) which is used to make yoga mats (left) has also been found in close to 500 food products including many 'healthy' breads, salad croutons and 

## Preparing the Tokenizer

In [3]:
# print the set of all unique characters that exist in the text
# NOTE: capitilization is lost during tokenization
unique_characters = ''.join(sorted(set(article_text.lower())))
print(f"[INFO] unique_characters: {unique_characters}")

tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(article_text)

# print a sample to illustrate tokenziation
text = ['First']
text_tts = tokenizer.texts_to_sequences(text)
print(f"[INFO] {text} => {text_tts}")
text_stt = tokenizer.sequences_to_texts(text_tts)
print(f"[INFO] {text_tts} => {text_stt}")

# the results of this code on "Alice's Adventures in Wonderland"
# !"'()*,-.:;?[]_abcdefghijklmnopqrstuvwxyz﻿
# ['First'] => [[22, 6, 10, 9, 3]]
# [[22, 6, 10, 9, 3]] => ['f i r s t']

[INFO] unique_characters:  %&'(),-./01234589:?abcdefghijklmnopqrstuvwxyz| 
[INFO] ['First'] => [[22, 6, 4, 5, 7]]
[INFO] [[22, 6, 4, 5, 7]] => ['f i r s t']


## Preparing the Dataset

In [4]:
# https://github.com/tensorflow/tensorflow/issues/38064
# if validation set has less samples than hyperparameters error will occur
# UnboundLocalError: local variable 'logs' referenced before assignment
batch_size = 16 # 32
dataset_size = tokenizer.document_count # total number of characters
max_id = len(tokenizer.word_index) # number of distinct characters
n_steps = 100
train_size = dataset_size * 90 // 100
window_length = n_steps + 1 # target = input shifted 1 character ahead

[encoded] = np.array(tokenizer.texts_to_sequences([article_text])) - 1
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

for X_batch, Y_batch in dataset.take(1):
    print(f"[INFO] X_batch.shape: {X_batch.shape}")
    print(f"[INFO] Y_batch.shape: {Y_batch.shape}")

[INFO] X_batch.shape: (16, 100, 48)
[INFO] Y_batch.shape: (16, 100)


## Stateful RNN

In [5]:
datasets = []
encoded_parts = np.array_split(encoded[:train_size], batch_size)

dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.repeat().batch(1)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(window_length, drop_remainder=True, shift=n_steps)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)
    
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

model = keras.models.Sequential()
model.add(keras.layers.GRU(128, batch_input_shape=[batch_size, None, max_id], dropout=0.2, recurrent_dropout=0.2, return_sequences=True, stateful=True))
model.add(keras.layers.GRU(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True, stateful=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax')))


class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

        
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
steps_per_epoch = train_size // batch_size // n_steps
history = model.fit(dataset, callbacks=[ResetStatesCallback()], epochs=50, steps_per_epoch=steps_per_epoch)
# history = model.fit(dataset, callbacks=[ResetStatesCallback()], epochs=5, steps_per_epoch=steps_per_epoch)

stateless_model = keras.models.Sequential()
stateless_model.add(keras.layers.GRU(128, input_shape=[None, max_id], return_sequences=True))
stateless_model.add(keras.layers.GRU(128, return_sequences=True))
stateless_model.add(keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax')))

stateless_model.build(tf.TensorShape([None, None, max_id]))
stateless_model.set_weights(model.get_weights())
model = stateless_model

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Using the Model to Generate Text

In [6]:
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text


def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model.predict(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]


def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)


# print random samples to illustrate char_id calculation
random_samples = tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=30).numpy()
print(f"[INFO] random_samples: {random_samples}")

# if the model is good, this should print "u", the missing letter
X_new = preprocess(['How are yo'])
Y_pred = model.predict_classes(X_new)
last_charcter = tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]
print(f"[INFO] last_charcter: <{last_charcter}>")

# print some sample text with varying temperatures
print(f"[INFO] next_char('How are yo', temperature=1): {next_char('How are yo', temperature=1)}")
print(f"[INFO] complete_text('t', temperature=0.2): {complete_text('t', temperature=0.2)}")
print(f"[INFO] complete_text('t', temperature=1): {complete_text('t', temperature=1)}")
print(f"[INFO] complete_text('t', temperature=2): {complete_text('t', temperature=2)}")

[INFO] random_samples: [[0 0 1 0 0 0 0 0 2 0 1 0 0 0 0 2 1 0 0 1 1 1 0 0 0 0 0 1 0 1]]
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


[INFO] last_charcter: <g>
[INFO] next_char('How are yo', temperature=1): a
[INFO] complete_text('t', temperature=0.2): ty partich bread/ sandwich bread/ sandwiched bread/
[INFO] complete_text('t', temperature=1): t'53  itho bread/ hot whole steeds/ prr dish, finsa
[INFO] complete_text('t', temperature=2): t/(|ogafir poot1 texfblean:o:l,ex d  & salebustadri
