In [1]:
# Google Colaboratory setup: when running inside Colab, mount Google Drive and
# switch the working directory to the project folder so the relative paths used
# later in this notebook resolve. Outside Colab only the imports are executed.
import sys, os
if 'google.colab' in sys.modules:
    # Mount Google Drive under /content/gdrive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/genAI' # Project folder on Drive (a directory, despite the name) - please adjust the path accordingly
    os.chdir(path_to_file)
    # Print the working directory to confirm the change took effect
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/genAI


# 🥙 LSTM on Recipe Data

In this notebook, we'll walk through the steps required to train your own LSTM on the recipes dataset and then use it to generate new recipe text.

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

## 0. Parameters <a name="parameters"></a>

In [3]:
VOCAB_SIZE = 10000  # Max vocabulary size for TextVectorization; also the Embedding input size and the softmax width
MAX_LEN = 200  # Tokens per training sequence (text is vectorised to MAX_LEN + 1 tokens, then split into input/target)
EMBEDDING_DIM = 100  # Size of each token embedding vector
N_UNITS = 128  # Number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any cell shown in this notebook
SEED = 42  # NOTE(review): not referenced by any cell shown in this notebook
LOAD_MODEL = False  # NOTE(review): not referenced by any cell shown in this notebook
BATCH_SIZE = 32  # Number of recipes per batch in the tf.data pipeline
EPOCHS = 25  # Number of training epochs passed to lstm.fit

## 1. Load the data <a name="load"></a>

In [4]:
# Load the full dataset.
# The encoding is given explicitly: without it open() falls back to the
# platform default (e.g. cp1252 on Windows), which can garble or fail on
# non-ASCII characters such as the "°" in "350°f".
with open("full_format_recipes.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

In [5]:
recipe_data[0]

{'directions': ['1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool.',
  '2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper.',
  '3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.'],
 'fat': 7.0,
 'date': '2006-09-01T04:00:00.000Z',
 'categories': ['Sandwich',
  'Bean',
  'Fruit',
  'Tomato',
  'turkey',

In [6]:
# Keep only the recipes that have both a title and directions, and flatten each
# of them into a single "Recipe for <title> | <directions>" string
filtered_data = [
    "Recipe for " + recipe["title"] + " | " + " ".join(recipe["directions"])
    for recipe in recipe_data
    if recipe.get("title") is not None and recipe.get("directions") is not None
]

In [7]:
# Count the recipes
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

20111 recipes loaded


In [8]:
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## 2. Tokenise the data

In [9]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation character in ``s`` with single spaces.

    Padding lets a whitespace tokeniser treat each punctuation mark as its own
    'word'. Runs of spaces produced by the padding are collapsed to one space.

    Args:
        s: The text to process.

    Returns:
        The text with every character of ``string.punctuation`` separated
        from its neighbours by a single space.
    """
    # re.escape is required here: inside a character class the raw sequence
    # "\]" from string.punctuation is read as an escaped "]", so without
    # escaping the backslash itself is never matched (and "-" only works
    # because ",-." happens to form a valid range).
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


# Apply the punctuation padding to every recipe string
text_data = list(map(pad_punctuation, filtered_data))

In [10]:
# Display an example of a recipe
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [11]:
# Convert to a TensorFlow Dataset.
# Shuffle the individual recipes *before* batching so that every epoch sees
# differently composed batches; shuffling after batching only reorders a fixed
# set of batches. The buffer spans the whole dataset for a uniform shuffle
# (max(..., 1) keeps the buffer size valid for an empty list).
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .shuffle(max(len(text_data), 1))
    .batch(BATCH_SIZE)
)

In [12]:
# Peek at a single batch of raw recipe strings
for batch in text_ds.take(1):
    print(batch)

tf.Tensor(
[b'Recipe for Grilled Boneless Salmon Steaks with Horseradish Dill Butter | Stir together butter , horseradish , dill , shallot , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a bowl with a rubber spatula until combined well . Transfer butter to a sheet of parchment or wax paper and roll into a 1 - inch - thick cylinder . Twist ends of parchment to close , then chill until firm , about 20 minutes . Prepare grill for direct - heat cooking over medium - hot charcoal . Put salmon steaks on a cutting board with tail - like flaps nearest you . Halve 1 steak lengthwise by cutting along each side of large center bone . Remove and discard bones ( you will end up with 2 thin pieces of fillet ) . Flip 1 piece over , turning it so rounded thicker portion is nearest you . Push halves together to create a yin - yang presentation , wrapping tail - like flaps around outside of steak . Wrap a piece of kitchen string around outside of steak , going around once or twice , and tie ends . 

In [13]:
# Create a vectorisation layer that maps raw text to integer token ids
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # lowercase the text; punctuation was already split into separate tokens by pad_punctuation
    max_tokens=VOCAB_SIZE,  # cap on the vocabulary size
    output_mode="int",  # emit one integer id per token
    output_sequence_length=MAX_LEN + 1,  # one extra token so the input/target pair, offset by one, each hold MAX_LEN tokens
)

In [14]:
# Adapt the layer to the training set, i.e. build its vocabulary from the recipe text
vectorize_layer.adapt(text_ds)
# Vocabulary as a list; a term's position in the list is its integer token id
vocab = vectorize_layer.get_vocabulary()

In [15]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [16]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

In [17]:
len(example_tokenised)

201

## 3. Create the Training Set

In [18]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    """Turn a batch of recipe strings into (input, target) token-id tensors.

    The target is the input shifted one position to the left, so at every
    position the model learns to predict the token that follows.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    # Inputs drop the last token; targets drop the first one
    return token_ids[:, :-1], token_ids[:, 1:]


# Map every batch of strings to its (input, target) pair of token-id tensors
train_ds = text_ds.map(prepare_inputs)

In [19]:
# Inspect one (input, target) pair of token-id batches
for sample in train_ds.take(1):
    print(sample)

(<tf.Tensor: shape=(32, 200), dtype=int64, numpy=
array([[  26,   16, 1473, ...,    0,    0,    0],
       [  26,   16,  479, ...,    2,  710, 1263],
       [  26,   16, 1340, ...,    5, 1119,    7],
       ...,
       [  26,   16,  525, ...,   28,  167,   13],
       [  26,   16,  272, ...,    0,    0,    0],
       [  26,   16,  479, ...,    0,    0,    0]])>, <tf.Tensor: shape=(32, 200), dtype=int64, numpy=
array([[  16, 1473,  472, ...,    0,    0,    0],
       [  16,  479,  325, ...,  710, 1263,   38],
       [  16, 1340,  397, ..., 1119,    7,  290],
       ...,
       [  16,  525,  335, ...,  167,   13,  541],
       [  16,  272,   13, ...,    0,    0,    0],
       [  16,  479,   91, ...,    0,    0,    0]])>)


## 4. Build the LSTM <a name="build"></a>

In [20]:
# Next-token language model: token ids -> embeddings -> LSTM -> a probability
# distribution over the vocabulary at every sequence position
inputs = layers.Input(shape=(None,), dtype="int32")  # variable-length sequences of token ids
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)  # one EMBEDDING_DIM vector per token
x = layers.LSTM(N_UNITS, return_sequences=True)(x)  # return_sequences=True keeps an output for every position
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)  # softmax over the VOCAB_SIZE candidate next tokens
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 dense (Dense)               (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## 5. Train the LSTM <a name="train"></a>

In [21]:
# The targets are integer token ids rather than one-hot vectors, hence the sparse loss
loss_fn = losses.SparseCategoricalCrossentropy()
# Positional arguments: optimizer="adam", loss=loss_fn
lstm.compile("adam", loss_fn)

In [22]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    """Keras callback that prints a freshly sampled recipe after every epoch.

    It can also be used on its own through ``generate`` once ``self.model``
    has been set (Keras sets it during ``fit``; it may be assigned manually).

    Args:
        index_to_word: Vocabulary list in which position ``i`` holds the word
            for token id ``i``.
        top_k: Currently unused; accepted for backward compatibility.
    """

    def __init__(self, index_to_word, top_k=10):
        # Initialise the Keras base class so that its own attributes exist
        super().__init__()
        self.index_to_word = index_to_word
        # Reverse lookup: word -> token id
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Sample a token id from ``probs`` after temperature scaling.

        Args:
            probs: 1-D array of next-token probabilities.
            temperature: Positive scaling factor; values below 1 sharpen the
                distribution, values above 1 flatten it.

        Returns:
            Tuple ``(token_id, scaled_probs)`` where ``scaled_probs`` is the
            renormalised distribution the token was drawn from.

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(
                f"temperature must be strictly positive, got {temperature}"
            )
        # Rescale in float64: raising small float32 probabilities to a large
        # power (low temperature) loses precision and can underflow to zero.
        probs = np.asarray(probs, dtype="float64") ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` one sampled token at a time and print it.

        Args:
            start_prompt: Whitespace-separated prompt; each word is looked up
                in the vocabulary as-is.
            max_tokens: Upper bound on the total number of tokens (prompt
                included).
            temperature: Sampling temperature forwarded to ``sample_from``.

        Returns:
            A list with one dict per generated token, holding the prompt at
            that step (``"prompt"``) and the distribution that was sampled
            from (``"word_probs"``).
        """
        # Words missing from the vocabulary fall back to id 1 ([UNK])
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        # Stop at the length limit, or as soon as id 0 (the empty padding
        # entry of the vocabulary) has been sampled
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            # The model returns a distribution for every position; only the
            # one after the last token is needed
            y = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample recipe so training progress can be judged by eye."""
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [23]:
# Save the model weights at the end of every epoch (same path each time)
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint_lstm/checkpoint.ckpt",
    save_weights_only=True,  # weights only, not the full model
    save_freq="epoch",
    verbose=0,
)

# Write training logs to ./logs for TensorBoard
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Text-generation callback built on the vectoriser's vocabulary; it prints a
# sample recipe at the end of each epoch
text_generator = TextGenerator(vocab)

In [None]:
# Train the model; at the end of every epoch the callbacks save the weights,
# write TensorBoard logs and print a generated sample
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
generated text:
recipe for pine containing resting scotch rhubarb | bring boil until . olive and they butter in - beat , the radishes . sprinkle to your side flakes with parsley . 1 about 35 minutes . cook . work mixture after a apricots . about brown , then medium to the 2 oil to room coleslaw and floured brown and rises and medium bowl , until beat over water in when mortar with a inch verde or then pitas along into waxed salt 

Epoch 2/25
generated text:
recipe for slowest sake dip in herb sauce | preheat oven to 350°f . preheat oven to add mustard and 

Epoch 3/25
generated text:
recipe for stock alexander | heat 2 tablespoons oil in a large pot over medium heat . cool slightly . slice avocado and 1 / 2 platter . chive main meatballs and gently chop chutney until smooth . roast letting back over . from green rice and chill . skins . chill 1 tablespoon sugar , lemon juice , 2 tablespoons strips and transfer to baking container . mix all fennel , vanilla cooking cream , ca

Epoch 15/25
generated text:
recipe for sweet - thirds bread with beet and tomato sauce | preheat oven to 350°f . spray large rimmed baking sheet with medium nonstick . mix flour , salt and pepper in small bowl . set , stir egg yolks 1 / 4 cup canned olive into milk . beat in flour mixture , eggs , and vinegar . add butter and mustard and stir egg mixture until blended . increase heat ; add pod and spray ; sauté until most of rhubarb are just melted , adding additional milk if necessary . pour filling over pudding . bake

Epoch 16/25
generated text:
recipe for cheesecake with moroccan fudge latkes | preheat oven to 350°f . butter four 3 - inch - deep ovenproof baking dish ( same pan ) . pour enough water into bowl to stiff bag at cornmeal . cover ; let stand until foamy , about 10 minutes . using electric mixer on high speed , fry until light brown , 10 minutes . drain and in small bowl whisk together mango , cream , oregano , and salt . add herb and stir mixture on low , adding more wa

<keras.callbacks.History at 0x247022a7100>

In [None]:
# Save the final model
# NOTE(review): the save call below is commented out, yet the "Load the model"
# cell further down reads ./models/lstm - on a fresh run that directory must
# already exist, or this line needs to be re-enabled.
# lstm.save("./models/lstm")



INFO:tensorflow:Assets written to: ./models/lstm\assets


INFO:tensorflow:Assets written to: ./models/lstm\assets


In [31]:
# Load the full saved model from ./models/lstm; compile=False skips restoring the
# training configuration (presumably because the model is only used for generation below)
lstm = models.load_model("./models/lstm", compile=False)
# Point the generator at the loaded model so it can be used outside of lstm.fit
text_generator.model = lstm

In [24]:
# Load model weights from an HDF5 file into the existing model
# NOTE(review): no cell shown in this notebook writes ./models/lstm.h5, so the
# file has to be provided separately - confirm where it comes from.
lstm.load_weights("./models/lstm.h5")
# Point the generator at the model so it can be used outside of lstm.fit
text_generator.model = lstm

## 6. Generate text using the LSTM

In [32]:
def print_probs(info, vocab, top_k=5):
    """Print the ``top_k`` most probable next words for every generation step.

    Args:
        info: List of dicts with the keys ``"prompt"`` and ``"word_probs"``,
            as returned by ``TextGenerator.generate``.
        vocab: Sequence mapping a token id to its word.
        top_k: Number of candidate words to show per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        word_probs = np.asarray(step["word_probs"])
        # Token ids ordered from most to least probable, cut off at top_k
        best_ids = np.argsort(word_probs)[::-1][:top_k]
        for token_id in best_ids:
            prob = word_probs[token_id]
            print(f"{vocab[token_id]}:   \t{np.round(100*prob,2)}%")
        print("--------\n")

In [33]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=50, temperature=0.2
)


generated text:
recipe for roasted vegetables | chop 1 / 2 cup cold water and reserve . add 1 / 2 cup water , 1 / 2 cup water , 1 / 2 cup water , 1 / 2 cup water , and 1 / 2 teaspoon salt and 1 / 2



In [34]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
2:   	94.26%
4:   	5.73%
3:   	0.01%
8:   	0.0%
1:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2
cup:   	99.96%
inch:   	0.04%
teaspoon:   	0.0%
-:   	0.0%
tablespoon:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2 cup
cold:   	84.04%
of:   	8.9%
garlic:   	4.93%
chopped:   	1.88%
.:   	0.11%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2 cup cold
water:   	100.0%
.:   	0.0%
cooking:   	0.0%
juice:   	0.0%
,:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2 cup cold water
and:   	56.33%
.:   	43.65%
;:   	0.02%
in:   	0.0%
to:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2 cup cold water and
reserve:   	63.55%
place:   	18.95%
drain:   	8.09%
squeeze:   	6.38%
add:   	1.03%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2 cup cold water and reserve
.:   	99.11%
1:   	0.46%
for:   	0.23%
any:   	0.13%
2:   	0.04%
-------

In [35]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=50, temperature=1
)


generated text:
recipe for roasted vegetables | chop 1 / 4 cup nuts . heat oil . add onion , garlic , red pepper flakes , and salt and pepper to taste and reserve . cook garlic , until chicken is tender and glowing layer on a baking sheet and broil ,



In [36]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
2:   	55.63%
4:   	31.77%
3:   	8.94%
8:   	1.02%
1:   	0.63%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4
cup:   	73.59%
inch:   	16.32%
teaspoon:   	2.08%
-:   	1.69%
of:   	1.1%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4 cup
cold:   	7.91%
garlic:   	5.36%
of:   	5.09%
chopped:   	4.26%
green:   	2.78%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4 cup nuts
and:   	47.66%
.:   	24.9%
;:   	7.52%
in:   	6.4%
with:   	3.97%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4 cup nuts .
combine:   	8.19%
in:   	8.01%
heat:   	7.14%
cut:   	6.95%
place:   	6.11%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4 cup nuts . heat
oil:   	50.2%
1:   	9.76%
2:   	9.03%
olive:   	4.76%
a:   	3.37%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4 cup nuts . heat oil
in:   	94.07%
and:   	1.07%
,:   	1.06%
over:   	0.82%
to:   	0.5%
--------


PROMPT: recipe fo

In [37]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=200, temperature=0.5
)
#print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | bring 1 cup water to a boil in a 4 - quart heavy saucepan , then reduce heat to moderate and cook , stirring occasionally , until sugar is dissolved , about 2 minutes . remove from heat and let cool . 

