# 🥙 LSTM on Recipe Data

## Table of contents
0. [Parameters](#parameters)
1. [Load and Prepare the Data](#prepare)
2. [Build the LSTM](#build)
3. [Train the LSTM](#train)

In [1]:
# Reload imported modules automatically before each cell is executed,
# so edits to local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from pprint import pprint
import random
import re
import string

# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, models, callbacks, activations, losses, utils, metrics, optimizers, preprocessing, datasets

from utils.image import display
from utils.datasets import sample_batches, sample_batch
from utils.losses import root_mean_squared_error

## 0. Parameters <a name="parameters"></a>

In [3]:
VOCAB_SIZE = 20000  # Embedding input size and Dense output size; the vectoriser keeps VOCAB_SIZE - 1 tokens
MAX_LEN = 200  # Model input length in tokens; each recipe is vectorised to MAX_LEN + 1 tokens
EMBEDDING_DIM = 128  # Output dimension of the Embedding layer
N_UNITS = 128  # Units in each LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any cell visible in this notebook
SEED = 42  # Seed for `random` before the recipes are shuffled
LOAD_MODEL = True  # Gates the cell that calls models.load_model('./models/model')

## 1. Load the data <a name="prepare"></a>

In [4]:
# Load the full dataset
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is given explicitly because the recipes contain non-ASCII
# characters and the platform default encoding is not always UTF-8.
with open('/app/data/epirecipes/full_format_recipes.json', encoding='utf-8') as json_data:
    recipe_data = json.load(json_data)
    

In [27]:
# Filter the dataset: keep recipes with 0 < calories < 1000 that have
# ingredients and directions. The directions guard matters because the
# directions are the text joined into the training strings further down;
# a recipe without them would raise there.
filtered_data = [
    recipe for recipe in recipe_data
    if recipe.get('calories') is not None
    and 0 < recipe['calories'] < 1000
    and recipe.get('ingredients') is not None
    and recipe.get('directions') is not None
]
n_recipes = len(filtered_data)

# Seeded shuffle so the recipe order is reproducible across runs
random.seed(SEED)
random.shuffle(filtered_data)

def pad_punctuation(s):
    """
    Put a space on both sides of every punctuation character in `s` and
    collapse runs of spaces into a single space, so that each punctuation
    mark becomes a separate whitespace-delimited token.

    Args:
        s: the text to process.

    Returns:
        The padded text. A trailing punctuation mark leaves a trailing space.
    """
    # string.punctuation must be escaped before it goes inside a character
    # class: unescaped, its backslash followed by `]` is parsed as an escaped
    # bracket, so a literal backslash in the text was never matched.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r' \1 ', s)
    s = re.sub(' +', ' ', s)
    return s


# Collapse each recipe's list of direction steps into one punctuation-padded string
filtered_data = list(
    map(lambda recipe: pad_punctuation(' '.join(recipe['directions'])), filtered_data)
)

# One dataset element per recipe string
filtered_ds = tf.data.Dataset.from_tensor_slices(filtered_data)

print(f'{n_recipes} recipes loaded')

14557 recipes loaded


In [33]:
# Spot-check one processed recipe string by index
filtered_data[14000]

'In an enamel kettle , combine the red wine , ruby port , the plum or orange brandy , lemon and orange slices , pieces of cinnamon stick , and the cloves . Bring the mixture to a simmer over low heat and simmer it for 15 minutes . Transfer the hot mulled wine mélange to a heat - proof punch bowl , rinsed with boiling water beforehand . '

In [26]:
# Display an example of a recipe as stored in the dataset (a scalar string tensor)
pprint(list(filtered_ds.take(1)))

[<tf.Tensor: shape=(), dtype=string, numpy=b'Heat 3 tablespoons oil in heavy large skillet over medium heat . Add onion and next 4 ingredients . Cover ; cook until vegetables release their juices , stirring occasionally , about 12 minutes . Uncover ; saut\xc3\xa9 until juices evaporate , about 10 minutes . Add garlic ; cook until vegetables are very tender and just beginning to brown , about 12 minutes longer . Cool completely . Puree vegetable mixture and sour cream in processor until almost smooth . Season generously with salt and pepper . Transfer to bowl . Sprinkle with capers . Preheat broiler . Arrange bread slices on baking sheet . Brush both sides of bread slices with 1 / 3 cup oil . Toast about 2 minutes per side . Cool . Serve with spread . A combination of dried thyme , basil , savory and fennel seeds can be substituted for herbes de Provence . '>]


## Prepare the data

In [7]:
# Vectorise the text into ints
vectorize_layer = layers.TextVectorization(
    standardize = 'lower',  # Lowercase only, so punctuation marks stay in the vocabulary as tokens
    max_tokens=VOCAB_SIZE - 1,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,  # One extra token so inputs and targets can be offset by one position
)

# Adapt the mapping to the training set
vectorize_layer.adapt(filtered_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
# Index 0 is the padding token '' and index 1 is '[UNK]'; the rest follow
print(vocab[:20])

['', '[UNK]', '.', ',', 'and', 'to', 'in', 'the', 'a', 'until', 'with', '1', 'minutes', '-', 'of', '2', 'add', 'heat', 'about', 'over']


In [8]:
# Vectorise the first recipe to inspect its integer encoding
vectorize_layer(list(filtered_ds.take(1)))

<tf.Tensor: shape=(1, 201), dtype=int64, numpy=
array([[  17,   30,   85,   36,    6,   67,   28,   54,   19,   26,   17,
           2,   16,  116,    4,  539,   29,  117,    2,   50,   24,   40,
           9,  200,  722,  749,  285,    3,   43,   91,    3,   18,  183,
          12,    2,  712,   24,  127,    9,  285, 2105,    3,   18,   79,
          12,    2,   16,   87,   24,   40,    9,  200,   77,  204,   81,
           4,   84,  431,    5,   93,    3,   18,  183,   12,  366,    2,
          56,  199,    2,  363,  411,   25,    4,  470,   76,    6,  175,
           9,  412,  134,    2,   68,  520,   10,   23,    4,   34,    2,
          38,    5,   20,    2,   89,   10,  844,    2,   83,  392,    2,
         154,  203,  157,   27,   55,  103,    2,  191,  410,  135,   14,
         203,  157,   10,   11,   21,   30,   51,   36,    2,  450,   18,
          15,   12,  299,  102,    2,   56,    2,   69,   10,  167,    2,
           8, 2760,   14,  553,  321,    3,  403,    3, 2414,   

In [9]:
def prepare_lm_inputs_labels(text):
    """
    Build next-word-prediction pairs from raw text.

    The text gets a trailing axis, is vectorised with `vectorize_layer`, and
    the resulting token matrix is split into two views offset by one step:
    the inputs drop the last token and the targets drop the first, so the
    target at position (i) is the token found at position (i+1).
    """
    tokens = vectorize_layer(tf.expand_dims(text, axis=-1))
    inputs, targets = tokens[:, :-1], tokens[:, 1:]
    return inputs, targets


# Turn every recipe string into an (inputs, targets) pair and prefetch.
# NOTE(review): each element is a batch of one recipe, i.e. shape (1, MAX_LEN).
text_ds = filtered_ds.map(prepare_lm_inputs_labels).prefetch(tf.data.AUTOTUNE)

In [10]:
# Peek at the first (inputs, targets) pair of the training dataset
text_ds.take(1).get_single_element()

(<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
 array([[  17,   30,   85,   36,    6,   67,   28,   54,   19,   26,   17,
            2,   16,  116,    4,  539,   29,  117,    2,   50,   24,   40,
            9,  200,  722,  749,  285,    3,   43,   91,    3,   18,  183,
           12,    2,  712,   24,  127,    9,  285, 2105,    3,   18,   79,
           12,    2,   16,   87,   24,   40,    9,  200,   77,  204,   81,
            4,   84,  431,    5,   93,    3,   18,  183,   12,  366,    2,
           56,  199,    2,  363,  411,   25,    4,  470,   76,    6,  175,
            9,  412,  134,    2,   68,  520,   10,   23,    4,   34,    2,
           38,    5,   20,    2,   89,   10,  844,    2,   83,  392,    2,
          154,  203,  157,   27,   55,  103,    2,  191,  410,  135,   14,
          203,  157,   10,   11,   21,   30,   51,   36,    2,  450,   18,
           15,   12,  299,  102,    2,   56,    2,   69,   10,  167,    2,
            8, 2760,   14,  553,  321,    3,  403, 

## 2. Build the LSTM <a name="build"></a>

In [72]:
# Variable-length sequences of token ids
inputs = layers.Input(shape=(None,), dtype="int32")
# Embed each integer in an EMBEDDING_DIM-dimensional vector
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# Stack 2 LSTMs (plain, not bidirectional) that emit an output at every timestep
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
# Project every timestep to VOCAB_SIZE raw scores (no activation is applied here)
outputs = layers.Dense(VOCAB_SIZE)(x)
model = models.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         2560000   
                                                                 
 lstm_2 (LSTM)               (None, None, 128)         131584    
                                                                 
 lstm_3 (LSTM)               (None, None, 128)         131584    
                                                                 
 dense_1 (Dense)             (None, None, 20000)       2580000   
                                                                 
Total params: 5,403,168
Trainable params: 5,403,168
Non-trainable params: 0
_________________________________________________________________


In [73]:
class TextGenerator(callbacks.Callback):
    """
    Keras callback that samples text from the model being trained and prints
    it at the end of every epoch.

    Args:
        index_to_word: vocabulary list; position i holds the word of token id i.
        max_tokens: maximum number of tokens sampled per call to `generate`.
        top_k: sampling is restricted to the k highest-scoring tokens.
        start_prompt: prompt used for the end-of-epoch sample (default: empty).
    """

    def __init__(self, index_to_word, max_tokens, top_k=10, start_prompt=""):
        # Let the Keras base class initialise its own attributes
        super().__init__()
        self.max_tokens = max_tokens
        self.index_to_word = index_to_word
        self.k = top_k
        self.start_prompt = start_prompt
        self.word_to_index = {word: index for index, word in enumerate(index_to_word)}

    def sample_from(self, logits):
        """
        Sample one token id from the top-k entries of `logits`.

        Returns:
            (sampled token id, the top-k token ids, their softmax probabilities)
        """
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        # Softmax over the k kept logits only, so the probabilities sum to 1
        preds = activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds), indices, preds

    def generate(self, start_prompt):
        """
        Extend `start_prompt` by at most `self.max_tokens` sampled tokens,
        print the resulting text and return it.

        Prompt words are split on whitespace and looked up verbatim; words
        missing from the vocabulary map to token id 1. Generation stops early
        when the padding token (id 0) is sampled.
        """
        prompt_tokens = [self.word_to_index.get(word, 1) for word in start_prompt.split()]
        tokens_generated = []
        while len(tokens_generated) < self.max_tokens:
            # Condition on the most recent MAX_LEN tokens. An empty prompt is
            # seeded with a single padding token so that there is a valid
            # position to sample from (otherwise the index would be -1).
            context = (prompt_tokens + tokens_generated)[-MAX_LEN:] or [0]
            sample_index = len(context) - 1
            # Right-pad to the fixed length MAX_LEN; assumes the model is
            # causal, so padding after sample_index does not change its output.
            x = np.array([context + [0] * (MAX_LEN - len(context))])
            y = self.model.predict(x, verbose=0)  # verbose=0: no progress bar per token
            sample_token, _, _ = self.sample_from(y[0][sample_index])
            sample_token = int(sample_token)
            if sample_token == 0:
                # Padding was sampled: treat it as the end of the text
                break
            tokens_generated.append(sample_token)

        # Prompt and generated tokens are kept in separate lists, so each
        # generated token appears exactly once in the output.
        txt = " ".join(self.index_to_word[token] for token in prompt_tokens + tokens_generated)

        print(f"generated text:\n{txt}\n")
        return txt

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample generated from the configured start prompt."""
        self.generate(self.start_prompt)
        

In [74]:
if LOAD_MODEL:
    # load_model returns a new model object. Rebind `model` so the loaded
    # network, not the freshly initialised one, is what the later cells
    # compile, train and use for generation.
    model = models.load_model('./models/model', compile=False)

KeyboardInterrupt: 

## 3. Train the LSTM <a name="train"></a>

In [None]:
# Integer token-id targets against unnormalised model outputs
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn)

In [None]:
# Create a model save checkpoint (weights only, written at the end of every epoch)
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Write training logs for TensorBoard to ./logs
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Text-generation callback; `vocab` maps token ids back to words
text_generator = TextGenerator(vocab, max_tokens = 40)

In [None]:
# `batch_size` is meant for array inputs; for a tf.data.Dataset Keras uses
# the batches the dataset itself yields. `text_ds` yields one recipe per
# element (shape (1, MAX_LEN)), so it is re-batched here to train on
# 32 recipes per step instead of one.
BATCH_SIZE = 32
train_ds = text_ds.unbatch().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

model.fit(
    train_ds,
    epochs=25,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

In [18]:
# Save the final model to the ./models/model directory
model.save("./models/model")

2022-05-16 09:39:19.925666: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./models/model/assets


INFO:tensorflow:Assets written to: ./models/model/assets


In [71]:
# Generate and print text that continues a short seed prompt
text_generator.generate("heat 3")

generated text:
heat 3                                                                                  



In [35]:
# Run the model on the first dataset element to inspect its raw predictions
preds = model.predict(text_ds.take(1))

In [42]:
# Highest-scoring token id at the first timestep of the first sequence
np.argmax(preds[0][0])

30

In [38]:
# Show the first (inputs, targets) pair again to compare with the prediction
text_ds.take(1).get_single_element()

(<tf.Tensor: shape=(1, 200), dtype=int64, numpy=
 array([[  17,   30,   85,   36,    6,   67,   28,   54,   19,   26,   17,
            2,   16,  116,    4,  539,   29,  117,    2,   50,   24,   40,
            9,  200,  722,  749,  285,    3,   43,   91,    3,   18,  183,
           12,    2,  712,   24,  127,    9,  285, 2105,    3,   18,   79,
           12,    2,   16,   87,   24,   40,    9,  200,   77,  204,   81,
            4,   84,  431,    5,   93,    3,   18,  183,   12,  366,    2,
           56,  199,    2,  363,  411,   25,    4,  470,   76,    6,  175,
            9,  412,  134,    2,   68,  520,   10,   23,    4,   34,    2,
           38,    5,   20,    2,   89,   10,  844,    2,   83,  392,    2,
          154,  203,  157,   27,   55,  103,    2,  191,  410,  135,   14,
          203,  157,   10,   11,   21,   30,   51,   36,    2,  450,   18,
           15,   12,  299,  102,    2,   56,    2,   69,   10,  167,    2,
            8, 2760,   14,  553,  321,    3,  403, 

In [45]:
# Look up the word for token id 17, the first input token shown above
vocab[17]

'heat'