# 🥙 LSTM on Recipe Data

## Table of contents
0. [Parameters](#parameters)
1. [Load the Data](#load)
1. [Prepare the Data](#prepare)
2. [Build the LSTM](#build)
3. [Train the LSTM](#train)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from pprint import pprint
import random
import re
import string

# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow.keras as keras

from utils.image import display
from utils.datasets import sample_batches, sample_batch
from utils.losses import root_mean_squared_error

## 0. Parameters <a name="parameters"></a>

In [3]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 128
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32

## 1. Load the data <a name="load"></a>

In [4]:
# Load the full dataset
with open('/app/data/epirecipes/full_format_recipes.json') as json_data:
    recipe_data = json.load(json_data)
    json_data.close()
    

In [5]:
# Filter and shuffle the dataset
filtered_data = [x for x in recipe_data
              if 'calories' in x 
              and x['calories'] is not None
              and x['calories'] > 0
              and x['calories'] < 1000
              and 'ingredients' in x
              and x['ingredients'] is not None
              and 'title' in x
              and x['title'] is not None
             ]
n_recipes = len(filtered_data)

random.seed(SEED)
random.shuffle(filtered_data)

In [6]:
# Pad the punctuation, to treat them as separate 'words'  
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r' \1 ', s)
    s = re.sub(' +', ' ', s)
    return s

filtered_data = [pad_punctuation('Recipe for ' + x['title']+ '|' + ' '.join(x['directions'])) for x in filtered_data]

In [7]:
# Convert to a Dataset
filtered_ds = tf.data.Dataset.from_tensor_slices(filtered_data).batch(BATCH_SIZE)
print(f'{n_recipes} recipes loaded')

2022-05-16 12:31:02.428228: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-16 12:31:02.444746: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-16 12:31:02.445804: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


14557 recipes loaded


2022-05-16 12:31:02.447724: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-16 12:31:02.448159: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-16 12:31:02.449162: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-16 12:31:02.450213: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [8]:
# Display an example of a recipe
pprint(list(filtered_ds.take(1))[0][0])

<tf.Tensor: shape=(), dtype=string, numpy=b'Recipe for Caramelized Onion and Sour Cream Spread | Heat 3 tablespoons oil in heavy large skillet over medium heat . Add onion and next 4 ingredients . Cover ; cook until vegetables release their juices , stirring occasionally , about 12 minutes . Uncover ; saut\xc3\xa9 until juices evaporate , about 10 minutes . Add garlic ; cook until vegetables are very tender and just beginning to brown , about 12 minutes longer . Cool completely . Puree vegetable mixture and sour cream in processor until almost smooth . Season generously with salt and pepper . Transfer to bowl . Sprinkle with capers . Preheat broiler . Arrange bread slices on baking sheet . Brush both sides of bread slices with 1 / 3 cup oil . Toast about 2 minutes per side . Cool . Serve with spread . A combination of dried thyme , basil , savory and fennel seeds can be substituted for herbes de Provence . '>


## 2. Prepare the data

In [74]:
# Create a vectorisation layer
vectorize_layer = keras.layers.TextVectorization(
    standardize = 'lower',
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [75]:
# Adapt the layer to the training set
vectorize_layer.adapt(filtered_ds)
vocab = vectorize_layer.get_vocabulary()
print(vocab[:10])
print(vocab[-10:])

['', '[UNK]', '.', ',', 'and', 'to', 'in', 'the', 'with', 'a']
['calcutta', 'calabacitas', 'cal', 'cakie', 'cake—but', 'cakey', 'cairanne', 'caipirinhas', 'caipirinha', 'cafecubano']


In [76]:
# Display the same example converted to ints
vectorize_layer(list(filtered_ds.take(1))[0][0])

2022-05-16 13:02:27.846246: W tensorflow/core/data/root_dataset.cc:200] Optimization loop failed: CANCELLED: Operation was cancelled


<tf.Tensor: shape=(201,), dtype=int64, numpy=
array([  26,   16,  984,  114,    4,  422,   64,  165,   27,   18,   35,
         89,   39,    6,   70,   31,   56,   20,   30,   18,    2,   17,
        114,    4,  564,   32,  122,    2,   52,   25,   43,   10,  193,
        760,  791,  300,    3,   46,   93,    3,   19,  190,   12,    2,
        749,   25,  129,   10,  300, 2244,    3,   19,   81,   12,    2,
         17,   83,   25,   43,   10,  193,   79,  212,   84,    4,   87,
        448,    5,   95,    3,   19,  190,   12,  383,    2,   59,  206,
          2,  380,  366,   28,    4,  422,   64,    6,  179,   10,  429,
        138,    2,   71,  545,    8,   24,    4,   36,    2,   41,    5,
         21,    2,   91,    8,  800,    2,   86,  412,    2,  158,  191,
        160,   29,   58,  105,    2,  200,  427,  139,   14,  191,  160,
          8,   11,   22,   35,   53,   39,    2,  455,   19,   15,   12,
        311,  104,    2,   59,    2,   72,    8,  165,    2,    9, 2931,
     

In [77]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

text_ds = filtered_ds.map(prepare_inputs)

In [78]:
# View the training set
text_ds.take(1).get_single_element()

(<tf.Tensor: shape=(32, 200), dtype=int64, numpy=
 array([[  26,   16,  984, ...,    0,    0,    0],
        [  26,   16,  192, ...,    0,    0,    0],
        [  26,   16,  107, ...,    2,   41,  107],
        ...,
        [  26,   16, 4173, ...,    0,    0,    0],
        [  26,   16,  497, ...,   42,   10,  317],
        [  26,   16,  661, ...,   20,    9,   31]])>,
 <tf.Tensor: shape=(32, 200), dtype=int64, numpy=
 array([[  16,  984,  114, ...,    0,    0,    0],
        [  16,  192,  259, ...,    0,    0,    0],
        [  16,  107,    8, ...,   41,  107,    5],
        ...,
        [  16, 4173, 1129, ...,    0,    0,    0],
        [  16,  497,   47, ...,   10,  317,    3],
        [  16,  661, 4607, ...,    9,   31,  109]])>)

## 2. Build the LSTM <a name="build"></a>

In [138]:
inputs = keras.layers.Input(shape=(None,), dtype="int32")
x = keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = keras.layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = keras.layers.Dense(VOCAB_SIZE, activation = 'softmax')(x)
model = keras.models.Model(inputs, outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 128)         1280000   
                                                                 
 lstm_2 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense_2 (Dense)             (None, None, 10000)       1290000   
                                                                 
Total params: 2,701,584
Trainable params: 2,701,584
Non-trainable params: 0
_________________________________________________________________


In [143]:
class TextGenerator(keras.callbacks.Callback):
    
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {}
        for index, word in enumerate(index_to_word):
            self.word_to_index[word] = index

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs 
    
    def generate(self, start_prompt, max_tokens = 40, temperature = 1.0):
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        num_tokens_generated = 0
        tokens_generated = []
        info = []
        sample_token = None
        while num_tokens_generated <= max_tokens and sample_token != 0:
            pad_len = MAX_LEN - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:MAX_LEN]
                sample_index = MAX_LEN - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y = self.model.predict(x)
            sample_token, probs = self.sample_from(y[0][sample_index], temperature)
            
            
            info.append({'prompt': start_prompt , 'word_probs': probs})
            
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            start_prompt = start_prompt + ' ' + self.index_to_word[sample_token]
            
            num_tokens_generated = len(tokens_generated)
   
        print(f"generated text:\n{start_prompt}\n")
        return info
        
    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens = 40)
        

In [144]:
if LOAD_MODEL:
    # model.load_weights('./models/model')
    keras.models.load_model('./models/model', compile=False)

## 3. Train the LSTM <a name="train"></a>

In [145]:
loss_fn = keras.losses.SparseCategoricalCrossentropy()
model.compile("adam", loss_fn)

In [146]:
# Create a model save checkpoint
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = keras.callbacks.TensorBoard(log_dir="./logs")

# Tokenize starting prompt
text_generator = TextGenerator(vocab)

In [150]:
model.fit(
    text_ds, 
    epochs=25, 
    # steps_per_epoch = 10,
    callbacks = [model_checkpoint_callback, tensorboard_callback, text_generator]
)

Epoch 1/25
recipe for sauteed toasted cucumber | red mash slotted bell pepper over toss to blend frosting . continue 

Epoch 2/25
recipe for west squash | stir in the hearts water , the time , and the wooden simmer until the sugar dressing , dot well and cook . add the mascarpone potato keeps and toss milk and salt , 3 5 4 cups

Epoch 3/25
recipe for passing sauce verde | cut 1 1 teaspoon use all pans and any sprouts from down set over a even layer , about 15 1 / 2 inches apart and it in a gas kettle until turn sandwich do not pop

Epoch 4/25
recipe for roquefort - pear collard essence | bring wine around to saucepan boil in batches , uncovered , 1 1 / 2 hours . using fork and finely chop dry . cut off flesh pulp , reserving orange juice and pear in

Epoch 5/25
recipe for roasted red - pasta wine–marinated | 1 . pour punch among tea layer . place boiling third of oven and green onion in buns in a pan oven . line 1 rimmed baking sheet with foil . put rice in bite

Epoch 6/25
recipe for 

KeyboardInterrupt: 

In [None]:
# Save the final models
model.save("./models/model")

# 3. Generate text using the LSTM

In [129]:
def print_probs(info, vocab, top_k = 5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i['word_probs']
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f'{vocab[i]}:   \t{np.round(100*p,2)}%') 
        print('--------\n')

In [151]:
info = text_generator.generate("recipe for", max_tokens = 40, temperature = 1.0)

generated text:
recipe for chocolate southwestern cappuccino ruler puffs | preheat oven to 350°f . toss salt , sugar , and cinnamon into a double boiler set over medium heat until mixture is fluffy , stirring occasionally , to form clumps and elastic , adding



In [152]:
print_probs(info, vocab)


PROMPT: recipe for
grilled:   	2.72%
roasted:   	1.81%
chicken:   	1.65%
chocolate:   	1.37%
lemon:   	1.24%
--------


PROMPT: recipe for chocolate
-:   	22.34%
and:   	6.48%
chocolate:   	2.11%
cream:   	2.06%
,:   	1.96%
--------


PROMPT: recipe for chocolate southwestern
ice:   	3.24%
chocolate:   	3.06%
cake:   	1.64%
punch:   	1.22%
chicken:   	1.21%
--------


PROMPT: recipe for chocolate southwestern cappuccino
cake:   	22.91%
pie:   	6.45%
cream:   	5.92%
tart:   	4.31%
cookies:   	2.67%
--------


PROMPT: recipe for chocolate southwestern cappuccino ruler
|:   	78.16%
with:   	7.74%
cake:   	2.69%
in:   	1.75%
cookies:   	1.28%
--------


PROMPT: recipe for chocolate southwestern cappuccino ruler puffs
|:   	92.59%
with:   	3.38%
and:   	0.75%
cookies:   	0.43%
cake:   	0.27%
--------


PROMPT: recipe for chocolate southwestern cappuccino ruler puffs |
preheat:   	35.1%
in:   	10.07%
combine:   	5.05%
position:   	3.48%
1:   	3.43%
--------


PROMPT: recipe for chocolate so

In [None]:
info = text_generator.generate("recipe for", max_tokens = 40, temperature = 0.1)

In [None]:
print_probs(info, vocab)