# GRU

Use [TensorFlow](https://www.tensorflow.org/tutorials/text/text_generation) to train a word-level GRU on the preprocessed Recipe Box dataset.

The output of this model is a dataset of title + ingredients text files. Producing them is the premise-generating (first) step of our hierarchical generation system; the elaboration (second) step is our Checklist model.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import os
import re

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
CACHE_DIR = './drive/Shared drives/Capstone/tmp'
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
dataset_path = os.path.join(CACHE_DIR, 'emoji_text_recipes.pkl')

In [None]:
# Load the preprocessed recipes from the cache; if the file is missing, stop
# and point the reader at the notebook that produces it.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
if os.path.exists(dataset_path):
    recipes = pd.read_pickle(dataset_path)
else:
    raise SystemExit("Run preprocess_rnn_word.ipynb to generate data file before continuing")

In [None]:
recipes.head()

0    🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 ...
1    🍴 Awesome Slow Cooker Pot Roast\n\n🥑\n• 2 (10....
2    🍴 Brown Sugar Meatloaf\n\n🥑\n• 1/2 cup packed ...
3    🍴 Best Chocolate Chip Cookies\n\n🥑\n• 1 cup bu...
4    🍴 Homemade Mac and Cheese Casserole\n\n🥑\n• 8 ...
dtype: object

### Strip off instructions, so as to only predict ingredients.

In [None]:
def strip_off_instrs(recipe):
    """Return the part of ``recipe`` that precedes the first 🥣 marker.

    Everything from the marker onwards is discarded; a recipe that does not
    contain the marker is returned unchanged.
    """
    before_marker, _marker, _after_marker = recipe.partition('🥣')
    return before_marker

recipes = recipes.apply(lambda recipe: strip_off_instrs(recipe))

In [None]:
recipes[0]

'🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 skinless, boneless chicken breast halves\n• 2 tablespoons butter\n• 2 (10.75 ounce) cans condensed cream of chicken soup\n• 1 onion, finely diced\n• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces\n\n'

## Vectorize the text

In [None]:
# This makes the dataset tiny
# recipes = recipes[:100]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Characters handed to the Keras Tokenizer as `filters` (removed from the text).
to_exclude = '#$&*/<=>@[\\]^_`{|}~\t'
# Punctuation (and the newline) that should become stand-alone tokens.
to_tokenize = '.,:;!?"-+()%\'\n'

# re.escape is essential: unescaped, the `"-+` run inside the character class
# is read as a *range* from '"' to '+', so a literal '-' was never matched while
# '#', '$', '&' and '*' were matched by accident.
# NOTE: hyphenated words (e.g. "all-purpose") are now split into three tokens,
# so the vocabulary changes and previously trained checkpoints no longer match.
_TO_TOKENIZE_RE = re.compile('([' + re.escape(to_tokenize) + '])')

def separate_punct(text):
    """Surround every character listed in ``to_tokenize`` with single spaces.

    A later split on ' ' therefore yields each punctuation mark (and each
    newline) as its own token.

    Args:
        text: the string to process.

    Returns:
        A copy of ``text`` in which each matching character ``c`` has been
        replaced by ``' ' + c + ' '``. Other characters are left untouched.
    """
    return _TO_TOKENIZE_RE.sub(r' \1 ', text)

recipes = recipes.progress_apply(lambda recipe: separate_punct(recipe))

100%|██████████| 105789/105789 [00:02<00:00, 36294.19it/s]


In [None]:
# tokenizer = Tokenizer(char_level=False, filters='', lower=False, split=' ', oov_token='<UNK>')
tokenizer = Tokenizer(char_level=False, filters=to_exclude, lower=True, split=' ', oov_token='<UNK>')
tokenizer.fit_on_texts(recipes)

In [None]:
encoded_recipes = tokenizer.texts_to_sequences(recipes)

In [None]:
encoded_recipes[0]

[11,
 677,
 714,
 46,
 16,
 1360,
 2,
 2,
 12,
 2,
 3,
 8,
 219,
 6,
 175,
 46,
 231,
 288,
 2,
 3,
 5,
 15,
 27,
 2,
 3,
 5,
 13,
 122,
 68,
 398,
 31,
 14,
 205,
 322,
 44,
 74,
 46,
 159,
 2,
 3,
 4,
 42,
 6,
 58,
 59,
 2,
 3,
 5,
 13,
 122,
 31,
 14,
 347,
 578,
 1261,
 424,
 6,
 367,
 43,
 99,
 2,
 2]

In [None]:
recipe_maxlen = max([len(r) for r in encoded_recipes])
recipe_maxlen

336

In [None]:
padded_recipes = pad_sequences(encoded_recipes,
                               padding='post',
                               truncating='post',
                               maxlen=recipe_maxlen+1) # Guarantee at least 1 padding character at end

In [None]:
padded_recipes

array([[  11,  677,  714, ...,    0,    0,    0],
       [  11, 2312,  677, ...,    0,    0,    0],
       [  11,   93,   26, ...,    0,    0,    0],
       ...,
       [  11,  280,  484, ...,    0,    0,    0],
       [  11,   87,  178, ...,    0,    0,    0],
       [  11, 4650, 1915, ...,    0,    0,    0]], dtype=int32)

In [None]:
# Size of the id space used by the embedding and the output layer.
# `word_counts` only covers words seen while fitting, so add 2 for the ids it
# does not include: 0 (the fill value visible in `padded_recipes`) and
# 1 (the tokenizer's `oov_token`, '<UNK>').
vocab_size = len(tokenizer.word_counts) + 2
# Lookup table id -> token text, built by decoding each id on its own.
# Id 0 has no vocabulary entry and also decodes to '<UNK>', which is why the
# table starts with two '<UNK>' entries.
idx2word = tokenizer.sequences_to_texts([[idx] for idx in range(vocab_size)])


In [None]:
idx2word

['<UNK>',
 '<UNK>',
 '\n',
 '•',
 '1',
 '2',
 ',',
 'cup',
 '4',
 'teaspoon',
 '3',
 '🍴',
 '🥑',
 '(',
 ')',
 'tablespoons',
 'and',
 'chopped',
 'cups',
 'salt',
 'pepper',
 'ground',
 'tablespoon',
 'fresh',
 'oil',
 'or',
 'sugar',
 'butter',
 'to',
 'garlic',
 'black',
 'ounce',
 'teaspoons',
 'sliced',
 'white',
 'ounces',
 'olive',
 '8',
 'cheese',
 'for',
 'red',
 'large',
 'onion',
 'into',
 'cream',
 'with',
 'chicken',
 'freshly',
 'sauce',
 'pound',
 'cut',
 'juice',
 'flour',
 '6',
 'minced',
 'lemon',
 'water',
 'taste',
 'finely',
 'diced',
 'peeled',
 'powder',
 'leaves',
 'cloves',
 'can',
 'grated',
 'milk',
 'green',
 '.',
 'all-purpose',
 'kosher',
 'eggs',
 'dried',
 "'",
 'of',
 'vanilla',
 'pounds',
 '5',
 'small',
 'vinegar',
 'vegetable',
 'about',
 'egg',
 'tomatoes',
 'plus',
 'baking',
 'as',
 'chocolate',
 'extract',
 'thinly',
 'unsalted',
 'package',
 'wine',
 'brown',
 'slices',
 'parsley',
 'shredded',
 'whole',
 'drained',
 'pieces',
 'bread',
 'medium',

In [None]:
idx2word[1]

'<UNK>'

In [None]:
print(vocab_size)
print(len(padded_recipes))

32179
105789


# Create training examples and targets

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(padded_recipes)

In [None]:
# For each sequence, duplicate and shift it to form the input and target text
def split_input_target(recipe):
    """Pair a token sequence with a copy of itself shifted one step left.

    Returns ``(inputs, targets)``: ``inputs`` is ``recipe`` without its last
    element and ``targets`` is ``recipe`` without its first, so ``targets[i]``
    is the token that follows ``inputs[i]``.
    """
    return recipe[:-1], recipe[1:]

dataset = dataset.map(split_input_target)

In [None]:
BATCH_SIZE = 1      # one recipe per gradient step (online learning)
BUFFER_SIZE = 1000  # number of elements held in the shuffle buffer

# Shuffle, group into fixed-size batches, then repeat so that training never
# runs out of data (the number of steps is bounded by the fit() call instead).
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
)

# Build the model

In [None]:
def build_model(vocab_size, embed_dim, rnn_units, batch_size):
    """Assemble the Embedding -> stateful GRU -> Dense language model.

    Args:
        vocab_size: number of token ids; used for both the embedding table
            and the width of the output layer.
        embed_dim: size of each token embedding.
        rnn_units: number of GRU units.
        batch_size: fixed batch size baked into the input shape (the GRU is
            stateful); the sequence length is left unspecified.

    Returns:
        An uncompiled ``tf.keras.Sequential`` whose output has one score per
        vocabulary entry at every time step (no activation is applied).
    """
    token_embedding = tf.keras.layers.Embedding(
        vocab_size,
        embed_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    vocab_scores = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([token_embedding, recurrent, vocab_scores])

In [None]:
EMBED_DIM = 256
RNN_UNITS = 512
MODEL_NAME = "rnn_word_punct_emoji_ingr_online"

In [None]:
model = build_model(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (1, None, 256)            8237824   
_________________________________________________________________
gru (GRU)                    (1, None, 512)            1182720   
_________________________________________________________________
dense (Dense)                (1, None, 32179)          16507827  
Total params: 25,928,371
Trainable params: 25,928,371
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Draw the architecture with the helper that ships with tf.keras, since the
# model is built from tf.keras layers. The previous import came from the
# stand-alone `keras` package and this cell ended in an AssertionError
# (presumably the mismatch between the two packages - confirm on re-run).
# Rendering needs the `pydot` package and Graphviz to be installed.
tf.keras.utils.plot_model(model, show_shapes=True)

AssertionError: ignored

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape) # (batch_size, sequence_length, vocab_size)

(1, 336, 32179)


In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    ``logits`` are treated as unnormalised scores (``from_logits=True``).
    The result of the Keras loss function is returned as-is.
    """
    token_losses = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return token_losses
    
example_batch_loss = loss(target_example_batch, example_batch_predictions)
prediction_shape = example_batch_predictions.shape
scalar_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", prediction_shape)
print("scalar_loss:      ", scalar_loss)
print("exp(scalar_loss): ", np.exp(scalar_loss))
print("vocab size      : ", vocab_size)
print("If all went right, exp(scalar loss) should be approximately equal to vocab size")

Prediction shape:  (1, 336, 32179)
scalar_loss:       10.376923
exp(scalar_loss):  32109.994
vocab size      :  32179
If all went right, exp(scalar loss) should be approximately equal to vocab size


In [None]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='loss',
    restore_best_weights=True,
    verbose=1
)

In [None]:
# Create a checkpoints directory.
checkpoint_dir = os.path.join(CACHE_DIR, MODEL_NAME)
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'ckpt_{epoch}'),
    save_weights_only=True
)

In [None]:
def restore_checkpoint(model):
    """Load the newest checkpoint found in ``checkpoint_dir`` into ``model``.

    Progress is reported with ``print``.

    Returns:
        ``(model, epoch)``. ``epoch`` is the integer parsed from the text after
        the last underscore of the checkpoint name, or 0 when no checkpoint
        exists (in which case the model's weights are left untouched).
    """
    ckpt_path = tf.train.latest_checkpoint(checkpoint_dir)

    if ckpt_path:
        print("Checkpoint found")
        print('Path:', ckpt_path)

        model.load_weights(ckpt_path)

        ckpt_name = os.path.basename(ckpt_path)
        print('Name:', ckpt_name)

        # Checkpoints are named with the epoch number after the final underscore.
        epoch_text = ckpt_name.rsplit('_', 1)[-1]
        print('Epoch:', epoch_text)

        return model, int(epoch_text)

    print('Checkpoint not found')
    return model, 0

In [None]:
TOTAL_EPOCHS = 10
STEPS_PER_EPOCH = 1500

model, initial_epoch = restore_checkpoint(model)

Checkpoint not found


In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=loss
)

## Train model

In [None]:
history = model.fit(
    x=dataset,
    epochs=TOTAL_EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    initial_epoch=initial_epoch,
    verbose=True,
    callbacks=[
        checkpoint_callback,
        early_stopping_callback
    ]
)

# Saving the trained model to file (to be able to re-use it later).
model_name = os.path.join(CACHE_DIR, MODEL_NAME, f'{MODEL_NAME}.h5')
model.save(model_name, save_format='h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [None]:
def generate_recipe(model, start_string, num_generate=1000, temperature=0.8):
    """Sample text from the trained model, seeded with a title prompt.

    Args:
        model: the stateful model, built for a batch size of 1. Its recurrent
            state is reset before sampling starts.
        start_string: prompt text; the title marker "🍴 " is prepended to it.
        num_generate: maximum number of tokens to sample.
        temperature: positive number the scores are divided by before
            sampling; lower values make the output more predictable.

    Returns:
        The prompt (marker + ``start_string``, exactly as given) immediately
        followed by the sampled tokens joined with single spaces. Sampling
        stops early as soon as the padding id (0) is drawn.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    TITLE_START = "🍴 "
    padded_start_string = TITLE_START + start_string

    # Vectorise the prompt exactly like the training text: punctuation and
    # newlines must be split off first, otherwise e.g. "Chicken\n\n" or
    # "Cheese," reach the tokenizer as one unknown word and turn into <UNK>.
    input_eval = np.array(
        tokenizer.texts_to_sequences([separate_punct(padded_start_string)])
    )

    # Sampled tokens, in order.
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Only the scores of the last time step decide the next token, so
        # sample from that row alone (the first call sees the whole prompt).
        last_step_scores = predictions[-1:, :] / temperature
        predicted_id = tf.random.categorical(last_step_scores, num_samples=1)[0, 0].numpy()

        if predicted_id == 0:  # stop if we start generating the padding token
            break

        # Pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2word[predicted_id])

    return (padded_start_string + ' '.join(text_generated))

In [None]:
# Restore latest checkpoint and change batch to 1
model = build_model(vocab_size, EMBED_DIM, RNN_UNITS, batch_size=1)
model, _ = restore_checkpoint(model)
model.build(tf.TensorShape([1, None]))

Checkpoint found
Path: ./drive/Shared drives/Capstone/tmp/rnn_word_punct_emoji_ingr_online/ckpt_10
Name: ckpt_10
Epoch: 10


In [None]:
generated_text = generate_recipe(model, start_string="Slow Cooked Chicken 🥑", num_generate=500, temperature=0.7)
print(generated_text)

🍴 Slow Cooked Chicken 🥑
 • 7 8 cup brown sugar 
 • 1 tablespoon butter 
 • 1 teaspoon salt 
 • 1 dash salt 
 • 1 tablespoon milk powder 
 • 1 4 teaspoon salt 
 • 1 2 teaspoon ground cinnamon 
 • salt and pepper to taste 
 • 2 tablespoons all-purpose flour 
 • 3 tablespoons all-purpose flour 
 • 1 teaspoon baking powder 
 • 1 teaspoon baking powder 
 • 1 teaspoon baking soda 
 • 1 ( 8 ounce ) package cream cheese , softened 
 • 1 teaspoon vanilla extract 
 • 1 cup all-purpose flour 
 • 1 4 cup shredded mozzarella cheese 
 • 1 ( 14 . 5 ounce ) can condensed cream of mushroom soup 
 • 1 tablespoon chopped fresh parsley 
 • 1 tablespoon lemon juice 
 • 1 tablespoon crunchy monkey , chopped 
 • 1 tablespoon chopped fresh parsley 
 • 1 tablespoon chopped onion 
 • 1 tablespoon grated parmesan cheese 
 • 1 ( 14 . 5 ounce ) can creamed corn , drained and chopped 
 • 1 pinch salt , or to taste 
 • 1 teaspoon onion powder 
 • 1 teaspoon garlic powder 
 • 1 4 teaspoon garlic powder 
 • 1 teaspoon

# Generate evaluation recipes

In [None]:
# Set to True for generating recipes for evaluation
GENERATE_EVAL_RECIPES = True

In [None]:
import random
from tqdm import tqdm

if GENERATE_EVAL_RECIPES:
    # Titles of the recipes table serve as prompts. The file is read from the
    # same cache directory as every other artefact in this notebook.
    # NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
    col_recipes = pd.read_pickle(os.path.join(CACHE_DIR, 'recipes.pkl'))
    recipe_titles = col_recipes.filter(['title']).values.ravel().tolist()

    eval_output_dir = os.path.join(CACHE_DIR, "rnn_word_punct_emoji_online_title_prompt_ingrs")
    os.makedirs(eval_output_dir, exist_ok=True)

    # Generate N recipes using random title as input to model
    N = 500
    used_titles = []

    for i in tqdm(range(N)):
        # Titles are drawn with replacement, so the same title may be used twice.
        keyword = random.choice(recipe_titles)
        used_titles.append(keyword)
        output_file_name = os.path.join(eval_output_dir, f"rnn_word_title_ingr_{i}.txt")

        start_string = keyword + "\n\n 🥑 "

        r = generate_recipe(model, start_string=start_string, temperature=0.8)
        # The text contains emoji markers, so do not rely on the platform's
        # default encoding when writing it out.
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(r)

    # Save used titles for later reference (line i belongs to output file i)
    title_list_file = os.path.join(eval_output_dir, "titles.txt")
    with open(title_list_file, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % t for t in used_titles)


  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:00<04:48,  1.73it/s][A
  0%|          | 2/500 [00:01<05:09,  1.61it/s][A
  1%|          | 3/500 [00:02<06:23,  1.30it/s][A
  1%|          | 4/500 [00:05<12:05,  1.46s/it][A
  1%|          | 5/500 [00:07<12:39,  1.53s/it][A
  1%|          | 6/500 [00:08<11:33,  1.40s/it][A
  1%|▏         | 7/500 [00:09<11:03,  1.35s/it][A
  2%|▏         | 8/500 [00:09<08:44,  1.07s/it][A
  2%|▏         | 9/500 [00:11<09:30,  1.16s/it][A
  2%|▏         | 10/500 [00:12<09:29,  1.16s/it][A
  2%|▏         | 11/500 [00:13<09:27,  1.16s/it][A
  2%|▏         | 12/500 [00:14<07:51,  1.04it/s][A
  3%|▎         | 13/500 [00:15<08:42,  1.07s/it][A
  3%|▎         | 14/500 [00:17<10:17,  1.27s/it][A
  3%|▎         | 15/500 [00:17<08:30,  1.05s/it][A
  3%|▎         | 16/500 [00:19<09:17,  1.15s/it][A
  3%|▎         | 17/500 [00:21<11:40,  1.45s/it][A
  4%|▎         | 18/500 [00:22<12:09,  1.51s/it][A
  4%|▍         | 19/500 [00:2

In [None]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f49e0529490>