# 1. Data preparation

## 1.1 Load necessary libraries and read in dataset

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

In [None]:
# Load the raw recipe table (Python parser engine, UTF-8 decoding)
df = pd.read_csv("recipes_w_search_terms.csv", engine='python', encoding='utf-8')

# Normalise every string cell: the Unicode line/paragraph separators
# (U+2028 / U+2029) become "\n" and surrounding whitespace is trimmed.
# Non-string cells (numbers, NaN) pass through untouched.
df = df.map(
    lambda cell: (
        cell.replace('\u2028', '\n').replace('\u2029', '\n').strip()
        if isinstance(cell, str)
        else cell
    )
)

# Summarise the columns, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494963 entries, 0 to 494962
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id                   494963 non-null  int64 
 1   name                 494963 non-null  object
 2   description          485362 non-null  object
 3   ingredients          494963 non-null  object
 4   ingredients_raw_str  494963 non-null  object
 5   serving_size         494963 non-null  object
 6   servings             494963 non-null  int64 
 7   steps                494963 non-null  object
 8   tags                 494963 non-null  object
 9   search_terms         494963 non-null  object
dtypes: int64(2), object(8)
memory usage: 37.8+ MB


Unnamed: 0,id,name,description,ingredients,ingredients_raw_str,serving_size,servings,steps,tags,search_terms
0,96313,Grilled Garlic Cheese Grits,"We love grits, this is another good way to ser...","['water', 'grits', 'salt', 'cheddar cheese', '...","[""4 cups water"",""1 cup uncooked old f...",1 (155 g),8,"['I a sauce pan, bring water to a boil; slowly...","['time-to-make', 'course', 'main-ingredient', ...","{'diabetic', 'low-calorie', 'vegetarian', 'low..."
1,232037,Simple Shrimp and Andouille Jambalaya,"Simple, easy and very tasty for when you are i...","['onion', 'red bell pepper', 'garlic cloves', ...","[""1 medium onion, chopped coarse "",""1 m...",1 (366 g),4,"['In a food processor, pulse the onion, red pe...","['60-minutes-or-less', 'time-to-make', 'course...","{'dinner', 'shrimp'}"
2,41090,black-and-white bean salad,,"['white beans', 'canned black beans', 'tomatoe...","[""1 cup canned white beans, rinsed and dra...",1 (807 g),1,"['In a large bowl, combine beans, tomato, onio...","['15-minutes-or-less', 'time-to-make', 'course...","{'vegetarian', 'salad', 'side', 'dinner', 'veg..."
3,60656,Crock Pot Italian Zucchini,This is a good recipe for weight watchers. It ...,"['zucchini', 'yellow squash', 'diced tomatoes'...","[""2 zucchini, sliced "",""2 small yel...",1 (244 g),4,['Put all ingredients in the crock pot and coo...,"['weeknight', 'time-to-make', 'course', 'main-...","{'side', 'vegetarian', 'italian'}"
4,232047,Beef Stew With Dried Cherries,This is a fabulous stew that came from one of ...,"['beef stew meat', 'flour', 'salt', 'allspice'...","[""3 lbs beef stew meat"",""3 tablespoons ...",1 (358 g),8,"['Preheat oven to 350¬∞F.', ""Cut beef into 1 in...","['time-to-make', 'course', 'main-ingredient', ...",{'dinner'}


## 1.2 Data preprocessing and recipe text formatting
This section prepares the raw recipe dataset for model training by cleaning and standardising the textual data. This is to ensure that all text in the data is consistent and can be understood by the model for modelling.

Data preprocessing:
- Clean unusual line breaks and extra spaces across all text columns to avoid formatting errors and give the model clean text to learn from
- Convert stringified lists into actual lists (`"['egg']"` --> `['egg']`) so that they can be iterated over
- Combine the recipe parts (title, ingredients, and instructions) into structured, formatted text that is easy to read
- Store all formatted recipes in the list `dataset_stringified`

In [None]:
# Section separators (for readability)
# recipe_to_string_simple inserts each marker ahead of the matching recipe section.
stopword_title = "‚≠êÔ∏è " # placed before the recipe name
stopword_ingredient = "\nü•¨\n\n" # placed before the ingredient list
stopword_instruction = "\nü•£\n\n" # placed before the instruction list

# Ensures that each ingredient is properly formatted to keep text consistent before concatenating them into our string
def clean_ingredient_spaces(text):
    """Collapse whitespace runs to single spaces and trim both ends.

    A string is returned cleaned. A list is returned as a new list holding
    the cleaned version of each string element (non-string elements are
    dropped). Any other value is returned unchanged.
    """
    def _squash(raw):
        # One regex pass turns any run of whitespace into a single space
        return re.sub(r'\s+', ' ', raw).strip()

    if isinstance(text, list):
        return [_squash(item) for item in text if isinstance(item, str)]
    if isinstance(text, str):
        return _squash(text)
    return text

# Converts stringified Python list into actual lists that can be iterated over
def safe_extract_list(value):
    """Extract a list of strings from a stringified Python list.

    Args:
        value: Either a string such as "['x', 'y']" / '["x", "y"]', an
            actual list/tuple, or anything else (e.g. NaN, None).

    Returns:
        For a list/tuple: a plain list copy of it. For a string: the
        non-empty, stripped string items it contains. Otherwise: [].
    """
    import ast  # local import keeps this helper self-contained

    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []

    # Parse the literal properly first. A quote-matching regex breaks on
    # lists that mix quoting styles or contain apostrophes, e.g.
    # ['Preheat oven.', "Don't overmix."].
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        items = [item for item in parsed if isinstance(item, str)]
    else:
        # Fallback for text that is not a valid literal (truncated strings,
        # raw newlines inside an item, ...): pull out quoted fragments.
        items = re.findall(r"'([^']+)'", value)
        if not items:
            items = re.findall(r'"([^"]+)"', value)

    return [v.strip() for v in items if v.strip()]

# Converts recipe into structured text
def recipe_to_string_simple(row):
    """Render one recipe row as a single formatted text block.

    Reads "name", "ingredients_raw_str" and "steps" from ``row`` (anything
    exposing ``.get``) and returns the title, a bulleted ingredient list and
    a bulleted instruction list, each introduced by its section separator.
    """
    name = str(row.get("name", "")).strip()

    # One bullet line per extracted ingredient
    bullet_lines = [
        f"‚Ä¢ {ing}" for ing in safe_extract_list(row.get("ingredients_raw_str"))
    ]

    # One bullet line per non-blank step
    step_lines = []
    for step in safe_extract_list(row.get("steps")):
        trimmed = step.strip()
        if trimmed:
            step_lines.append(f"‚ñ™Ô∏é {trimmed}")

    ingredients_block = "\n".join(bullet_lines)
    instructions_block = "\n".join(step_lines).strip()

    # Assemble: title line, ingredient section, instruction section
    header = f"{stopword_title}{name}\n"
    ingredients_section = f"{stopword_ingredient}{ingredients_block}\n\n"
    instructions_section = f"{stopword_instruction}{instructions_block}"
    return header + ingredients_section + instructions_section

# Apply functions to df
# Collapse repeated whitespace inside the raw ingredient strings
df['ingredients_raw_str'] = df['ingredients_raw_str'].apply(clean_ingredient_spaces)

# Format every row into one recipe string; a row that raises is reported and skipped
dataset_stringified = []
for _, row in df.iterrows():
    try:
        dataset_stringified.append(recipe_to_string_simple(row))
    except Exception as e:
        print(f"Skipped recipe due to error: {e}")

# View first 3 recipes
for i, recipe_text in enumerate(dataset_stringified[:3]):
    print(f"Recipe #{i+1}\n{'-'*40}")
    print(recipe_text)
    print("\n")

Recipe #1
----------------------------------------
‚≠êÔ∏è Grilled Garlic Cheese Grits

ü•¨

‚Ä¢ 4 cups water
‚Ä¢ 1 cup uncooked old fashion grits
‚Ä¢ 1 teaspoon salt
‚Ä¢ 4 ounces shredded cheddar cheese
‚Ä¢ 1 -2 clove garlic, minced
‚Ä¢ 1 tablespoon olive oil


ü•£

‚ñ™Ô∏é I a sauce pan, bring water to a boil; slowly add grits and salt, stirring constantly; Reduce heat:simmer, uncovered, for 40-45 minutes or untill thickened, stirrin occasionally.
‚ñ™Ô∏é Add cheese and garlic; stir until cheese is melted, Spray 9-inch baking dish with nonstick cooking spray; Cover and refrigerate for 2 to 2 1/2 hours or until frim.
‚ñ™Ô∏é Before starting the grill, coat the grill rack with nonstick cooking spray; Cut the grits into 3-inch squares; Brush both sides with olive oil.
‚ñ™Ô∏é Grill, covered, over medium heat for 4 to 6 minutes on each side or until lightly browned.


Recipe #2
----------------------------------------
‚≠êÔ∏è Simple Shrimp and Andouille Jambalaya

ü•¨

‚Ä¢ 1 medium onion, c

## 1.3 Filter and reduce size of recipe corpus
Filter out recipes that are too short or too long, limiting the overall dataset size and improving data quality to make training more efficient.

- Length of each recipe is estimated by splitting using the whitespaces
- Recipes with length less than `min_len` or length more than `max_len` are removed from the dataset
- The dataset is then reduced to the first **80,000 recipes**, taking into account the computational resources available

In [None]:
corpus = dataset_stringified  # list[str]

# Approximate each recipe's length by its number of whitespace-separated tokens
min_len, max_len = 20, 1000
lengths = [len(text.split()) for text in corpus]

# Positions of the recipes whose length lies inside [min_len, max_len]
keep_idx = [
    pos
    for pos, n_tokens in enumerate(lengths)
    if not (n_tokens < min_len or n_tokens > max_len)
]
corpus_filtered = [corpus[pos] for pos in keep_idx]

print(f"Kept {len(corpus_filtered)} / {len(corpus)} recipes (length in [{min_len},{max_len}])")

# Cap the corpus at the first 80,000 surviving recipes
corpus_filtered = corpus_filtered[:80000]

Kept 494882 / 494963 recipes (length in [20,1000])


## 1.4 Tokenisation
This section converts `corpus_filtered` (recipe texts) into a numerical format such that the model can understand and learn. The model will learn from sequences of numbers that represent characters.

- Initialise a character-level tokeniser for next-token prediction
- Fit the tokeniser on `corpus_filtered` to build a complete character vocabulary and assign an index to each character
- Converts all recipes to sequences of integers

In [None]:
# Tokenize
stop_sign = '|' # custom stop symbol, used later as the fill value when sequences are padded
vocab_size = None # keep full vocab
oov_token = "<OOV>" # marks any out-of-vocabulary characters

# Character-level tokenizer: nothing is filtered out and text is lower-cased
tokenizer = Tokenizer(char_level=True, num_words=vocab_size, oov_token=oov_token, filters='', lower=True)
# Fitting on the stop sign adds '|' to the vocabulary before the recipes are seen
tokenizer.fit_on_texts([stop_sign])
tokenizer.fit_on_texts(corpus_filtered)

# Number of token ids
# word_index already contains the OOV token; the +1 accounts for index 0,
# which the Keras Tokenizer reserves and never assigns to a character
print("Vocab size:", len(tokenizer.word_index) + 1)

Vocab size: 151


## 1.5. Standardise length of recipes
All the recipes are turned into a fixed-length sequence of numbers (either by padding or truncation) so that the model can efficiently learn to generate new recipes character by character.

- **Padding:** makes all sequences the same length by adding extra dummy tokens (`'|'`) such that the model can easily identify non-content areas.
- **Truncation:** makes all sequences the same length by shortening sequences that are too long

In [None]:
sequences = tokenizer.texts_to_sequences(corpus_filtered)

# Integer id of the stop sign, looked up once and passed as a scalar fill value.
# '|' is an ordinary vocabulary entry, so its id is NOT 0 (the Keras Tokenizer
# never assigns index 0 to a token).
stop_token_id = tokenizer.texts_to_sequences([stop_sign])[0][0]

# Step 1: truncate / pad every recipe to max_len-1 tokens, filling with the stop sign
padded_without_stops = pad_sequences(
    sequences, maxlen=max_len-1, padding='post', truncating='post', value=stop_token_id
)
# Step 2: extend to max_len+1 tokens. This appends two stop signs to every
# sequence, so even a truncated recipe ends with the stop symbol, and the
# shifted (input, target) pairs built later are max_len tokens long.
padded = pad_sequences(
    padded_without_stops, maxlen=max_len+1, padding='post', truncating='post', value=stop_token_id
)

print("Padded shape:", padded.shape) # (N, max_len + 1)


Padded shape: (80000, 1001)


## 1.6 Helper functions
- Function `sequence_to_recipe_string`: reverses the tokenisation process by converting tokenised recipe sequence (list or np.array of integers) back into a readable string version
  - takes a tokenised recipe and removes padding tokens, if any
  - converts token IDs back to characters
  - joins tokens back into a single string such that it looks like a normal recipe again
  - replaces special section tokens with readable emojis and labels for clarity
- Function `to_lm_dataset`: transforms tokenised recipes into `(input, target)` pairs so that the model can learn next-character prediction

In [None]:
def sequence_to_recipe_string(sequence, tokenizer, remove_padding=True, pad_token='|'):
    """
    Converts a tokenised recipe sequence (list or np.array of integers)
    back into a readable string version.

    Args:
        sequence: Token ids as a list or 1-D np.ndarray.
        tokenizer: Object exposing ``index_word`` and ``word_index`` dicts
            (and optionally ``char_level``), e.g. a fitted Keras Tokenizer.
        remove_padding: When True, drop every 0 id and the trailing run of
            ``pad_token`` ids.
        pad_token: The token text used as padding fill. Ignored when it is
            not in the tokenizer's vocabulary.

    Returns:
        The decoded text with leading/trailing whitespace stripped. Ids
        missing from ``index_word`` are rendered as "<UNK>".
    """
    if isinstance(sequence, np.ndarray):
        sequence = sequence.tolist()

    if remove_padding:
        sequence = [token for token in sequence if token != 0]
        # Sequences are padded with the pad token's own id (not 0), so strip
        # the trailing run of it. Only the tail is removed, so a pad character
        # inside the recipe text is kept.
        pad_id = tokenizer.word_index.get(pad_token)
        if pad_id is not None:
            while sequence and sequence[-1] == pad_id:
                sequence.pop()

    # Convert token IDs back to their text
    tokens = [tokenizer.index_word.get(token, "<UNK>") for token in sequence]

    # Character-level tokens must be concatenated directly; joining them with
    # spaces yields "g o l d e n". Word-level tokens keep the space separator.
    separator = '' if getattr(tokenizer, 'char_level', False) else ' '
    recipe_str = separator.join(tokens)

    # Format section tokens for readability (no-ops when the markers are absent)
    recipe_str = (recipe_str
                  .replace("<TITLE>", "\n‚≠êÔ∏è ")
                  .replace("<INGR>", "\nü•¨\n")
                  .replace("<INSTR>", "\nü•£\n")
                  .replace("<END>", "\n‚úÖ End of recipe\n"))

    return recipe_str.strip()

# Example
sample_sequence = padded[10]  # recipe at index 10 (i.e. the 11th recipe), tokenised
recipe_text = sequence_to_recipe_string(sample_sequence, tokenizer)
print(recipe_text)


‚≠ê Ô∏è   g o l d e n   c h o c o l a t e   c h i p   m u f f i n s 
 
 ü•¨ 
 
 ‚Ä¢   1 / 2   c u p   b u t t e r 
 ‚Ä¢   1   c u p   g r a n u l a t e d   s u g a r 
 ‚Ä¢   2   t e a s p o o n s   b a k i n g   p o w d e r 
 ‚Ä¢   1 / 2   t e a s p o o n   s a l t 
 ‚Ä¢   1   t e a s p o o n   v a n i l l a   e x t r a c t 
 ‚Ä¢   2   l a r g e   e g g s 
 ‚Ä¢   1 / 2   c u p   m i l k 
 ‚Ä¢   2   c u p s   w h o l e   w h e a t   f l o u r 
 ‚Ä¢   2   c u p s   c h o c o l a t e   c h i p s 
 ‚Ä¢   c o a r s e   d e c o r a t o r   s u g a r ,   f o r   t o p p i n g 
 
 
 ü•£ 
 
 ‚ñ™ Ô∏é   p r e h e a t   t h e   o v e n   t o   3 5 0 ¬∞ f   l i g h t l y   g r e a s e   ( o r   l i n e   w i t h   m u f f i n   c u p s ,   a n d   g r e a s e   t h e   m u f f i n   c u p s )   a   s t a n d a r d - s i z e   m u f f i n   p a n . 
 ‚ñ™ Ô∏é   b e a t   t o g e t h e r   t h e   b u t t e r ,   s u g a r ,   b a k i n g   p o w d e r ,   s a l t ,   a n d   v a n i l l a   u n t i

In [None]:
def to_lm_dataset(padded_arrays):
    """Build a next-token-prediction dataset from a 2-D array of token ids.

    Each row is split into an input (every token but the last) and a target
    (every token but the first), so the target at position t is the token
    that follows the input at position t. The pairs are wrapped in a
    ``tf.data.Dataset`` with one element per row.
    """
    inputs, targets = padded_arrays[:, :-1], padded_arrays[:, 1:]
    return tf.data.Dataset.from_tensor_slices((inputs, targets))

# (input, target) dataset over all padded recipes, built before the train/test split
# NOTE(review): no later use of lm_dataset is visible in this notebook — confirm it is still needed
lm_dataset = to_lm_dataset(padded)

## 1.7 Split data into train and test
Split the data into train and test sets using a 90/10 split, then shuffle and batch each set so that the data can be consumed efficiently during model training.

In [None]:
# Split the recipe row indices 90/10; the fixed random_state makes the split reproducible
train_idx, test_idx = train_test_split(np.arange(len(padded)), test_size=0.1, random_state=42)

train_arr = padded[train_idx]
test_arr = padded[test_idx]
# Both datasets are shuffled (buffer of 10000) and batched in 64s with the
# incomplete final batch dropped, so every batch holds exactly 64 sequences.
# Only the training dataset repeats indefinitely.
train_ds = to_lm_dataset(train_arr).shuffle(10000).batch(64, drop_remainder=True).repeat()
test_ds = to_lm_dataset(test_arr).shuffle(10000).batch(64, drop_remainder=True)

In [None]:
# Inspect a sample batch (quick sanity check)
for input_batch, target_batch in train_ds.take(1):
    input_example = input_batch[0].numpy() # first sequence in batch
    target_example = target_batch[0].numpy() # corresponding target sequence

    print('Input sequence size:', len(input_example))
    print('Target sequence size:', len(target_example))
    print()

    # Decode the first 50 token ids straight to characters.
    # tokenizer.sequences_to_texts joins tokens with spaces, which for a
    # character-level vocabulary prints "t o f u" instead of "tofu".
    input_stringified = ''.join(
        tokenizer.index_word.get(int(token_id), '') for token_id in input_example[:50]
    )
    target_stringified = ''.join(
        tokenizer.index_word.get(int(token_id), '') for token_id in target_example[:50]
    )

    print('Input:', repr(input_stringified))
    print('Target:', repr(target_stringified))

Input sequence size: 1000
Target sequence size: 1000

Input: '‚≠ê Ô∏è   t o f u   d r e a m   p u d d i n g   a n d   p i e   f i l l i n g \n \n ü•¨ \n \n ‚Ä¢   2 4   5 / 8'
Target: 'Ô∏è   t o f u   d r e a m   p u d d i n g   a n d   p i e   f i l l i n g \n \n ü•¨ \n \n ‚Ä¢   2 4   5 / 8  '


# 2. Model building and training



## 2.1 Build model
Build a character-level Long Short-Term Memory (LSTM) language model that learns to predict the next character in a recipe, allowing it to generate entirely new recipes character by character.

The model embeds the tokens in the embedding layer. The embedding layer transforms each numeric character ID into a 256-dimensional vector which helps the model learn relationships between characters.

In [None]:
# Define key parameters
vocab_size = len(tokenizer.word_index) + 1 # no. of token ids: every vocabulary entry plus the unassigned index 0
embedding_dim = 256 # size of character embedding vector
rnn_units = 512 # no. of hidden units in the LSTM layer
BATCH_SIZE = 64 # no. of sequences processed together per training step

batch_input_shape=[BATCH_SIZE, None] # NOTE(review): not referenced by build_model, which takes batch_size directly — confirm whether this is still needed

# Build model function
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level language model.

    The network is a Keras ``Sequential`` stack: an input with a fixed batch
    size and variable sequence length, an embedding of size
    ``embedding_dim``, a stateful LSTM with ``rnn_units`` units that returns
    its output at every time step, and a dense layer with ``vocab_size``
    outputs and no activation.
    """
    keras_layers = tf.keras.layers

    # A fixed batch dimension is declared up front because the LSTM is stateful
    input_spec = tf.keras.Input(batch_shape=(batch_size, None))
    embedding = keras_layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    recurrent = keras_layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal(),
    )
    projection = keras_layers.Dense(vocab_size)

    return tf.keras.Sequential([input_spec, embedding, recurrent, projection])

# Build model using function above; the training model's batch size is fixed to BATCH_SIZE
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

Compile the model by specifying:
- the loss function (`loss`)
- the optimiser (`adam_optimizer`)
- evaluation metrics (`'accuracy'`)

In [None]:
# Loss function (how errors are measured)
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True
    )

# Optimiser (how the model updates its weights to reduce errors)
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Attach the optimiser, the custom loss function and the accuracy metric to the model
model.compile(
    optimizer=adam_optimizer,
    loss=loss,
    metrics=['accuracy']
)

## 2.2 Train model


Prepare a folder to store model checkpoints (weights) during training, making it easier to save and reload the training progress in the future. A checkpoint is written for each epoch, so the weights of the fully trained model (epoch 20) are saved as well.

In [None]:
# To save weights at different epochs
checkpoint_dir = 'tmp/checkpoints' # checkpoint directory, relative to the current working directory
os.makedirs(checkpoint_dir, exist_ok=True) # create the folder (and any missing parents) if it does not exist

In [None]:
# Configuring callbacks
# Early stopping: watches the training loss and stops once it has failed to
# improve for 5 epochs, then restores the weights from the best epoch
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='loss',
    restore_best_weights=True,
    verbose=1
)

# Checkpointing: writes the weights only (not the full model) to a file whose
# name contains the epoch number, e.g. ckpt_3.weights.h5
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}.weights.h5')
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

To improve model accuracy and quality of output, we will run 20 epochs.

In [None]:
EPOCHS = 20

# train_ds repeats indefinitely, so steps_per_epoch decides where an epoch ends:
# the number of full batches of 64 in the training array
history = model.fit(
    x=train_ds,
    epochs=EPOCHS,
    steps_per_epoch=len(train_arr)//64,
    callbacks=[
        checkpoint_callback,
        early_stopping_callback
    ]
)

Epoch 1/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m317s[0m 277ms/step - accuracy: 0.5946 - loss: 1.4779
Epoch 2/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m313s[0m 278ms/step - accuracy: 0.7926 - loss: 0.6891
Epoch 3/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m313s[0m 278ms/step - accuracy: 0.8133 - loss: 0.6114
Epoch 4/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m314s[0m 279ms/step - accuracy: 0.8239 - loss: 0.5734
Epoch 5/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m314s[0m 279ms/step - accuracy: 0.8302 - loss: 0.5507
Epoch 6/20
[1m1125/1125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m314s[0m 279ms/step - accuracy: 0.8345 - loss: 0.5354
Epoc

In [None]:
# Save the model in Keras and h5 formats
# The .h5 copy is the one reloaded in section 3
model.save("recipe_model.keras")
model.save("recipe_model.h5")



In [None]:
# Evaluate trained model on test dataset
# Returns [loss, accuracy], matching the loss and metric passed to model.compile
model.evaluate(test_ds)

[1m125/125[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m14s[0m 111ms/step - accuracy: 0.8465 - loss: 0.4927


[0.4873633086681366, 0.8482109904289246]

# 3. Load model
Load the previously trained and saved model so that it does not have to be retrained.

In [None]:
# Load previously trained model
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Redefined here so the saved model can be loaded without re-running the
    training cells.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True
    )

# custom_objects maps the name "loss" to the function above so that the custom
# loss the model was compiled with can be resolved when loading
model = load_model("recipe_model.h5", custom_objects={"loss": loss})



In [None]:
# Create inference model (batch_size=1)
# Same architecture as the trained model; only the fixed batch dimension differs
inference_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# Load trained weights
inference_model.set_weights(model.get_weights())

# 4. Model evaluation

## 4.1 Key Metrics

In [None]:
# Defining key metrics for evaluation

# Perplexity
def compute_perplexity(model, test_dataset, num_batches=100):
    """Estimate perplexity as exp(mean cross-entropy) over test batches.

    Args:
        model: Callable mapping an input batch to logits.
        test_dataset: Dataset yielding (inputs, targets) batches and
            supporting ``.take(n)``.
        num_batches: Maximum number of batches to evaluate.

    Returns:
        The exponential of the average per-batch mean loss.

    Raises:
        ValueError: If no batch was processed (empty dataset or
            ``num_batches`` of 0). Previously this surfaced as a bare
            ZeroDivisionError.
    """
    total_loss = 0
    num_batches_processed = 0

    for x_batch, y_batch in test_dataset.take(num_batches):
        predictions = model(x_batch, training=False)
        batch_loss = tf.keras.losses.sparse_categorical_crossentropy(
            y_batch, predictions, from_logits=True
        )
        # Reduce the loss tensor to one scalar for this batch
        total_loss += tf.reduce_mean(batch_loss).numpy()
        num_batches_processed += 1

    if num_batches_processed == 0:
        raise ValueError(
            "compute_perplexity processed no batches: the dataset is empty "
            "or num_batches is 0"
        )

    avg_loss = total_loss / num_batches_processed
    return np.exp(avg_loss)

# Repetition rate: how often model generates repetitive text
def compute_repetition_rate(generated_samples, ngram_size=10):
    """Return the fraction of samples that contain a heavily repeated n-gram.

    A sample counts as repetitive when some character n-gram of length
    ``ngram_size`` occurs at least twice more (non-overlapping) in the text
    that follows it, i.e. at least three times in total.

    Args:
        generated_samples: Iterable of generated text strings.
        ngram_size: Length in characters of the n-grams compared.

    Returns:
        A float in [0, 1]; 0.0 when there are no samples.

    Raises:
        ValueError: If ``ngram_size`` is smaller than 1 (an empty n-gram
            would match everywhere and flag every sample).
    """
    if ngram_size < 1:
        raise ValueError("ngram_size must be a positive integer")

    samples = list(generated_samples)
    if not samples:
        # Nothing to measure; avoid dividing by zero
        return 0.0

    def _is_repetitive(text):
        # Stops at the first n-gram that repeats twice in the remainder
        return any(
            text[start + ngram_size:].count(text[start:start + ngram_size]) >= 2
            for start in range(len(text) - ngram_size)
        )

    repetition_count = sum(1 for text in samples if _is_repetitive(text))
    return repetition_count / len(samples)

## 4.2 Generate samples for qualitative analysis

In [None]:
def generate_text(model, start_string, num_generate=1000):
    """Sample ``num_generate`` tokens from ``model``, seeded with ``start_string``.

    Uses the module-level ``tokenizer`` to encode the seed and decode the
    samples. Returns the seed, a single space, then the generated text.

    NOTE(review): after the first step only the newly sampled token is fed
    back in, so this presumably relies on ``model`` being a stateful model
    with batch size 1 — confirm against the caller.
    """
    # Convert start string to token IDs
    input_indices = np.array(tokenizer.texts_to_sequences([start_string]))

    input_indices = tf.convert_to_tensor(input_indices, dtype=tf.int32)

    text_generated = []

    # Reset LSTM states
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    for _ in range(num_generate):
        predictions = model(input_indices)
        # Drop the batch axis
        predictions = tf.squeeze(predictions, 0)

        # Sample from categorical distribution
        # One sample is drawn per time step; [-1, 0] keeps the one for the last step
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Update input_indices for next step
        input_indices = tf.expand_dims([predicted_id], 0)

        # Convert token ID back to its text
        next_word = tokenizer.sequences_to_texts(input_indices.numpy())[0]
        text_generated.append(next_word)

    return start_string + ' ' + ''.join(text_generated)

def generate_combinations(model):
    """Print one generated recipe for each seed ingredient.

    For every seed a header and a dashed rule are printed, then 500
    tokens produced by ``generate_text``, followed by blank lines.
    """
    recipe_length = 500  # adjust length as desired

    for seed in ('potato',):
        print(f'Ingredient: "{seed}"')
        print('-'*40)
        print(generate_text(model, start_string=seed, num_generate=recipe_length))
        print('\n\n')

# Run generation with the batch-size-1 inference model built in section 3.
# The previous argument, `model_simplified`, is not defined anywhere in this
# notebook, and generate_text feeds the model a single sequence at a time.
generate_combinations(inference_model)


Ingredient: "potato"
----------------------------------------
potato cates apple flavoured like wine strips)
‚Ä¢ 450 g pears or 100 g brown turkey prepared cherries
‚Ä¢ 150 ml scraped pizza crust
‚Ä¢ 8 eggs
‚Ä¢ 225 g ricotta cheese
‚Ä¢ 3 ices mayonnaise
‚Ä¢ 1/4 cup cheddar cheese, shredded


ü•£

‚ñ™Ô∏é preheat oven to 300¬∞f degrees.
‚ñ™Ô∏é preheat oven to 350 degrees.
‚ñ™Ô∏é in a large skillet, cook the croves and bay leaf noodles and oil or bowl.
‚ñ™Ô∏é add honey and cinnamon, all softer, except zucchini and cook an oven-edge of pan just until the, 15co time, for two minutes.
‚ñ™Ô∏é stir in cho





## 4.3 Run evaluation

In [None]:
def generate_samples(model, tokenizer, n_samples=20, num_generate=800, batch_size=1):
    """Generate recipe samples for evaluation.

    A stateful copy of ``model`` with batch size 1 is built and loaded with
    the trained weights. ``n_samples`` texts of ``num_generate`` tokens each
    are then sampled from it, cycling through a fixed list of title prompts.

    Returns a list of strings, each being the prompt immediately followed by
    the generated text.

    NOTE: ``batch_size`` is accepted but not used; the generation model is
    always built with a batch size of 1.
    """
    # Build a generation model with batch_size=1
    # Layer sizes are read from the trained model's layers (0: embedding, 1: LSTM, 2: dense)
    gen_model = tf.keras.Sequential([
        tf.keras.Input(batch_shape=(1, None)),
        tf.keras.layers.Embedding(input_dim=model.layers[0].input_dim, output_dim=model.layers[0].output_dim,
                                   weights=model.layers[0].get_weights()),
        tf.keras.layers.LSTM(units=model.layers[1].units, return_sequences=True, stateful=True,
                            recurrent_initializer=tf.keras.initializers.GlorotNormal()),
        tf.keras.layers.Dense(model.layers[2].units)
    ])

    # Copy weights from trained model
    gen_model.layers[1].set_weights(model.layers[1].get_weights())
    gen_model.layers[2].set_weights(model.layers[2].get_weights())

    prompts = [
        "‚≠êÔ∏è chocolate chip cookies",
        "‚≠êÔ∏è pasta",
        "‚≠êÔ∏è chicken",
        "‚≠êÔ∏è salad",
        "‚≠êÔ∏è soup"
    ]

    samples = []

    for i in range(n_samples):
        # Cycle through the prompts so each one is used about equally often
        prompt = prompts[i % len(prompts)]

        input_indices = np.array(tokenizer.texts_to_sequences([prompt]))
        input_indices = tf.convert_to_tensor(input_indices, dtype=tf.int32)

        text_generated = []

        # Reset LSTM states
        for layer in gen_model.layers:
            if hasattr(layer, 'reset_states'):
                layer.reset_states()

        for _ in range(num_generate):
            predictions = gen_model(input_indices)
            # Drop the batch axis
            predictions = tf.squeeze(predictions, 0)

            # One sample is drawn per time step; [-1, 0] keeps the one for the last step
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            # The sampled token alone becomes the next input
            input_indices = tf.expand_dims([predicted_id], 0)
            next_word = tokenizer.sequences_to_texts(input_indices.numpy())[0]
            text_generated.append(next_word)

        samples.append(prompt + ''.join(text_generated))

    return samples

# Perplexity (model confidence)
# Computed on at most 100 batches of the test dataset
perplexity = compute_perplexity(model, test_ds, num_batches=100)
print(f"Perplexity:{perplexity:.2f}")

# Generate samples
samples = generate_samples(model, tokenizer, n_samples=20)

# Repetition rate
# Share of the generated samples in which some 10-character n-gram keeps recurring
repetition_rate = compute_repetition_rate(samples, ngram_size=10)
print(f"Repetition rate:{repetition_rate:.1%}")