# GRU

Use [TensorFlow](https://www.tensorflow.org/tutorials/text/text_generation) to train a word level GRU on the preprocessed Recipe Box dataset.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import os
import re

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [4]:
CACHE_DIR = './drive/Shared drives/Capstone/tmp'
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
dataset_path = os.path.join(CACHE_DIR, 'emoji_text_recipes.pkl')

In [56]:
if not os.path.exists(dataset_path):
    raise SystemExit("Run preprocess_rnn_word.ipynb to generate data file before continuing")
else:
    recipes = pd.read_pickle(dataset_path)

In [None]:
recipes.head()

0    🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 ...
1    🍴 Awesome Slow Cooker Pot Roast\n\n🥑\n• 2 (10....
2    🍴 Brown Sugar Meatloaf\n\n🥑\n• 1/2 cup packed ...
3    🍴 Best Chocolate Chip Cookies\n\n🥑\n• 1 cup bu...
4    🍴 Homemade Mac and Cheese Casserole\n\n🥑\n• 8 ...
dtype: object

## Vectorize the text

In [None]:
# This makes the dataset tiny
# recipes = recipes[:100]

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [57]:
to_exclude = '#$&*/<=>@[\\]^_`{|}~\t'
to_tokenize = '.,:;!?"-+()%\'\n'

def separate_punct(text):
    return re.sub(r'(['+to_tokenize+'])', r' \1 ', text)

recipes = recipes.progress_apply(lambda recipe: separate_punct(recipe))

100%|██████████| 105789/105789 [00:06<00:00, 16891.58it/s]


In [58]:
recipes[17]

"🍴 World ' s Best Lasagna \n  \n 🥑 \n • 1 pound sweet Italian sausage \n • 3/4 pound lean ground beef \n • 1/2 cup minced onion \n • 2 cloves garlic ,  crushed \n • 1  ( 28 ounce )  can crushed tomatoes \n • 2  ( 6 ounce )  cans tomato paste \n • 2  ( 6 . 5 ounce )  cans canned tomato sauce \n • 1/2 cup water \n • 2 tablespoons white sugar \n • 1 1/2 teaspoons dried basil leaves \n • 1/2 teaspoon fennel seeds \n • 1 teaspoon Italian seasoning \n • 1 tablespoon salt \n • 1/4 teaspoon ground black pepper \n • 4 tablespoons chopped fresh parsley \n • 12 lasagna noodles \n • 16 ounces ricotta cheese \n • 1 egg \n • 1/2 teaspoon salt \n • 3/4 pound mozzarella cheese ,  sliced \n • 3/4 cup grated Parmesan cheese \n  \n 🥣 \n ‣ In a Dutch oven ,  cook sausage ,  ground beef ,  onion ,  and garlic over medium heat until well browned .  Stir in crushed tomatoes ,  tomato paste ,  tomato sauce ,  and water .  Season with sugar ,  basil ,  fennel seeds ,  Italian seasoning ,  1 tablespoon salt ,  

In [59]:
# tokenizer = Tokenizer(char_level=False, filters='', lower=False, split=' ', oov_token='<UNK>')
tokenizer = Tokenizer(char_level=False, filters=to_exclude, lower=True, split=' ', oov_token='<UNK>')
tokenizer.fit_on_texts(recipes)

In [60]:
encoded_recipes = tokenizer.texts_to_sequences(recipes)

In [None]:
encoded_recipes[0]

In [62]:
recipe_maxlen = max([len(r) for r in encoded_recipes])
recipe_maxlen

537

In [63]:
padded_recipes = pad_sequences(encoded_recipes,
                               padding='post',
                               truncating='post',
                               maxlen=recipe_maxlen+1) # Guarantee at least 1 padding character at end

In [64]:
padded_recipes

array([[  35,  591,  635, ...,    0,    0,    0],
       [  35, 4143,  591, ...,    0,    0,    0],
       [  35,   77,   38, ...,    0,    0,    0],
       ...,
       [  35,  446,  996, ...,    0,    0,    0],
       [  35,  137,  216, ...,    0,    0,    0],
       [  35, 7986, 3290, ...,    0,    0,    0]], dtype=int32)

In [65]:
vocab_size = len(tokenizer.word_counts) + 2
idx2word = tokenizer.sequences_to_texts([[idx] for idx in range(vocab_size)])


In [None]:
idx2word

In [69]:
idx2word[635]

'cooker'

In [70]:
print(vocab_size)
print(len(padded_recipes))

46017
105789


In [76]:
tokenizer.word_counts['punch']

1588

In [82]:
len([word for word in tokenizer.word_counts.items() if word[1] > 5])

13108

# Create training examples and targets

In [83]:
dataset = tf.data.Dataset.from_tensor_slices(padded_recipes)

In [84]:
# For each sequence, duplicate and shift it to form the input and target text
def split_input_target(recipe):
    input_text = recipe[:-1]
    target_text = recipe[1:]
    return input_text, target_text

dataset = dataset.map(split_input_target)

In [85]:
BATCH_SIZE = 1 # ONLINE LEARNING NOW - I hope this works
BUFFER_SIZE = 1000  # Buffer size to shuffle the dataset

# Create training batches
dataset = dataset \
    .shuffle(BUFFER_SIZE) \
    .batch(BATCH_SIZE, drop_remainder=True) \
    .repeat() # Repeat to have enough data during training

# Build the model

In [86]:
def build_model(vocab_size, embed_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform'
        ),
        tf.keras.layers.Dense(vocab_size)
    ])

    return model

In [88]:
EMBED_DIM = 256
RNN_UNITS = 512
MODEL_NAME = "rnn_word_punct_emoji_online"

In [89]:
model = build_model(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (1, None, 256)            11780352  
_________________________________________________________________
gru (GRU)                    (1, None, 512)            1182720   
_________________________________________________________________
dense (Dense)                (1, None, 46017)          23606721  
Total params: 36,569,793
Trainable params: 36,569,793
Non-trainable params: 0
_________________________________________________________________


In [90]:
from keras.utils.vis_utils import plot_model
plot_model(model, show_shapes=True)

AssertionError: ignored

In [91]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape) # (batch_size, sequence_length, vocab_size)

(1, 537, 46017)


In [92]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
    
example_batch_loss = loss(target_example_batch, example_batch_predictions)
prediction_shape = example_batch_predictions.shape
scalar_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", prediction_shape)
print("scalar_loss:      ", scalar_loss)
print("exp(scalar_loss): ", np.exp(scalar_loss))
print("vocab size      : ", vocab_size)
print("If all went right, exp(scalar loss) should be approximately equal to vocab size")

Prediction shape:  (1, 537, 46017)
scalar_loss:       10.73614
exp(scalar_loss):  45988.207
vocab size      :  46017
If all went right, exp(scalar loss) should be approximately equal to vocab size


In [93]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='loss',
    restore_best_weights=True,
    verbose=1
)

In [94]:
# Create a checkpoints directory.
checkpoint_dir = os.path.join(CACHE_DIR, MODEL_NAME)
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch}'),
    save_weights_only=True
)

In [95]:
def restore_checkpoint(model):
    latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)

    if not latest_checkpoint_path:
        print('Checkpoint not found')
        return model, 0

    print("Checkpoint found")
    print('Path:', latest_checkpoint_path)

    model.load_weights(latest_checkpoint_path)

    latest_checkpoint_name = os.path.split(latest_checkpoint_path)[-1]
    print('Name:', latest_checkpoint_name)

    latest_epoch = latest_checkpoint_name.split('_')[-1]
    print('Epoch:', latest_epoch)

    return model, int(latest_epoch)

In [96]:
TOTAL_EPOCHS = 10
STEPS_PER_EPOCH = 1500

model, initial_epoch = restore_checkpoint(model)

Checkpoint not found


In [97]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=loss
)

## Train model

In [98]:
history = model.fit(
    x=dataset,
    epochs=TOTAL_EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    initial_epoch=initial_epoch,
    verbose=True,
    callbacks=[
        checkpoint_callback,
        early_stopping_callback
    ]
)

# Saving the trained model to file (to be able to re-use it later).
model_name = os.path.join(CACHE_DIR, MODEL_NAME, f'{MODEL_NAME}.h5')
model.save(model_name, save_format='h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [102]:
def generate_recipe(model, start_string, num_generate=1000, temperature=0.8):
    TITLE_START = "🍴 "
    # Evaluation step (generating text using the learned model)

    padded_start_string = TITLE_START + start_string

    # Converting our start string to numbers (vectorizing).
    input_eval = np.array(tokenizer.texts_to_sequences([padded_start_string]))

    # Empty string to store our results.
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the token returned by the model
        predictions = predictions / temperature

        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        if predicted_id == 0: # stop if we start generating the padding token
            break

        # Pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2word[predicted_id])

    return (padded_start_string  + ' '.join(text_generated))

In [103]:
# Restore latest checkpoint and change batch to 1
model = build_model(vocab_size, EMBED_DIM, RNN_UNITS, batch_size=1)
model, _ = restore_checkpoint(model)
model.build(tf.TensorShape([1, None]))

Checkpoint found
Path: ./drive/Shared drives/Capstone/tmp/rnn_word_punct_emoji_online/ckpt_10
Name: ckpt_10
Epoch: 10


In [104]:
generated_text = generate_recipe(model, start_string="Slow Cooked Chicken 🥑", num_generate=500, temperature=0.7)
print(generated_text)

🍴 Slow Cooked Chicken 🥑
 • 1 3 cup white sugar 
 • 1 2 egg , beaten 
 • 1 cup heavy cream , divided 
 • 1 ( 6 ounce ) can pineapple with juice 
 
 🥣 
 ‣ mix cream cheese , salt , and pepper in a bowl until evenly combined . 
 ‣ add the diced tomatoes , celery , carrots , chicken , apple , and chicken with sour cream . mix well . pour 1 cup of chicken stock into the skillet and bring to a boil . reduce heat to medium , and cook 3 minutes , stirring occasionally . remove from heat , and allow to rest for 30 minutes . 
 ‣ pour the liquid into sterilized jars . return to skillet for 1 hour . the sauce should be covered with the liquid . repeat with remaining chicken broth if you like . 
 ‣ return the beans to the pot , and cook for 2 minutes , or until all of the liquid is bubbling . remove from heat , make sure not much the bone . 
 ‣ save the jars . spread the chicken broth over the pineapple . serve top .


# Generate evaluation recipes

In [105]:
# Set to True for generating recipes for evaluation
GENERATE_EVAL_RECIPES = True

In [109]:
import random
from tqdm import tqdm

if GENERATE_EVAL_RECIPES:
    col_recipes = pd.read_pickle('/content/drive/Shared drives/Capstone/tmp/recipes.pkl')
    recipe_titles = col_recipes.filter(['title']).values.ravel().tolist()
    # recipe_words = set()

    # Create vocabulary of words used in recipe titles as potential inputs for generation
    # for title in recipe_titles:
    #     for word in title.split(' '):
    #         if len(word) > 0: # Don't add empty string
    #             recipe_words.add(word)

    eval_output_dir = os.path.join(CACHE_DIR, "rnn_word_punct_emoji_online_title_prompt_output_recipes")
    pathlib.Path(eval_output_dir).mkdir(exist_ok=True)
    
    # Generate N recipes using random title as input to model
    N = 500
    used_titles = []

    for i in tqdm(range(N)):
        keyword = random.choice(tuple(recipe_titles))
        used_titles.append(keyword)
        output_file_name = os.path.join(eval_output_dir, f"rnn_word_recipe_{i}.txt")

        start_string = keyword + "\n\n 🥑 "

        r = generate_recipe(model, start_string=start_string, temperature=0.8)
        with open(output_file_name, 'w') as f:
            f.write(r)
    
    # Save used titles for later reference
    title_list_file = os.path.join(eval_output_dir, f"titles.txt")
    with open(title_list_file, 'w') as f:
        f.writelines("%s\n" % t for t in used_titles)


  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:00<06:31,  1.27it/s][A
  0%|          | 2/500 [00:01<07:03,  1.18it/s][A
  1%|          | 3/500 [00:03<08:16,  1.00it/s][A
  1%|          | 4/500 [00:04<10:07,  1.22s/it][A
  1%|          | 5/500 [00:05<09:28,  1.15s/it][A
  1%|          | 6/500 [00:06<07:50,  1.05it/s][A
  1%|▏         | 7/500 [00:08<10:09,  1.24s/it][A
  2%|▏         | 8/500 [00:09<09:52,  1.20s/it][A
  2%|▏         | 9/500 [00:10<08:48,  1.08s/it][A
  2%|▏         | 10/500 [00:11<09:04,  1.11s/it][A
  2%|▏         | 11/500 [00:12<09:24,  1.15s/it][A
  2%|▏         | 12/500 [00:13<07:59,  1.02it/s][A
  3%|▎         | 13/500 [00:13<06:55,  1.17it/s][A
  3%|▎         | 14/500 [00:14<06:00,  1.35it/s][A
  3%|▎         | 15/500 [00:15<06:40,  1.21it/s][A
  3%|▎         | 16/500 [00:15<05:49,  1.39it/s][A
  3%|▎         | 17/500 [00:16<05:26,  1.48it/s][A
  4%|▎         | 18/500 [00:16<04:50,  1.66it/s][A
  4%|▍         | 19/500 [00:1

In [None]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f49e0529490>