# Introduction

This Jupyter Notebook allows the user to define and train recipe generation models from scratch. If you wish to simply create new recipes using a pre-trained model, use the Flask website instead.

In [None]:
# import packages

# general
import numpy as np
import pandas as pd

# data preparation
import sqlite3 # connect to database .db files
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# modeling
import pathlib # for setting up checkpoint directory
import os # ditto

import sys
sys.path.insert(0, './py')
import train # get pre-defined functions

Set the constant `DATA_SIZE` to the number of recipes you want to train on. We used 100,000 to train our own model; the default of 100 in the cell below is only large enough for a quick test run.

In [None]:
# number of recipes to request from the database
DATA_SIZE = 100

# raw recipes; accessed row-wise below through the `title`,
# `ingredients` and `instructions` fields
data_raw = train.import_data(DATA_SIZE)

# every raw recipe comes in three parts; merge them into one value per row
data_str = data_raw.apply(
    lambda row: train.condense(row.title, row.ingredients, row.instructions),
    axis=1,
)

# keep only the recipes that train.filter accepts
# NOTE(review): presumably a length check against train.MAX_RECIPE_LENGTH --
# confirm in py/train.py
data_filter = [text for text in data_str if train.filter(text)]

In [None]:
# character-level tokenizer for the recipe text
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,  # one token per character (character-level RNN)
    lower=False,      # keep upper- and lowercase characters distinct
    filters='',       # do not strip any characters from the recipes
    split='',         # no word splitting, since we work on characters
)

# register the stop sign first, then every character found in the recipes
tokenizer.fit_on_texts([train.STOP_SIGN])
tokenizer.fit_on_texts(data_filter)

# one extra slot on top of the distinct characters seen
# (the Keras tokenizer never assigns index 0 to a token)
VOCABULARY_SIZE = 1 + len(tokenizer.word_counts)

# turn every recipe into a sequence of integer character codes
data_vec = tokenizer.texts_to_sequences(data_filter)

# integer encoding of the stop sign, used as the padding value in both steps
# (texts_to_sequences yields one list per text, so this is a list, not a scalar)
stop_value = tokenizer.texts_to_sequences([train.STOP_SIGN])[0]

# step 1: truncate / pad every recipe to MAX_RECIPE_LENGTH - 1 entries,
# which leaves room for stop signs at the end
data_temp = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=data_vec,
    maxlen=train.MAX_RECIPE_LENGTH - 1,
    padding="post",
    truncating="post",
    value=stop_value,
)

# step 2: extend to MAX_RECIPE_LENGTH + 1 entries, so every recipe ends
# with at least two stop signs
data_pad = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=data_temp,
    maxlen=train.MAX_RECIPE_LENGTH + 1,
    padding="post",
    truncating="post",
    value=stop_value,
)

# wrap the padded recipes in a TensorFlow Dataset, one recipe per element
dataset = tf.data.Dataset.from_tensor_slices(data_pad)

# NOTE(review): train.split_input_target presumably returns (input, target)
# pairs offset by one character -- confirm in py/train.py
data_target = dataset.map(train.split_input_target)

# shuffle, batch (saves memory later), then repeat so that training can
# draw batches indefinitely
data_train = data_target.shuffle(train.SHUFFLE_BUFFER_SIZE)
data_train = data_train.batch(train.BATCH_SIZE, drop_remainder=True)
data_train = data_train.repeat()

# Model 1: LSTM

In this section, we define and train an LSTM model. Then, we save the weights for future use.

In [None]:
# directory that receives the LSTM weights
dir_lstm = "weights/lstm"

# "{epoch}" is a placeholder that ModelCheckpoint fills in with the epoch number
checkpoint = os.path.join(dir_lstm, "checkpoint_{epoch}")

# callback that stores the weights only (not the full model)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint,
)

In [None]:
# constants; you can change these if you want
EMBEDDING_SIZE = 256
UNITS = 1024
EPOCHS = 5
# Keras numbers epochs from 0 and treats `epochs` as the index of the final
# epoch, so a fresh run must start at 0 to actually train for EPOCHS epochs.
# Raise this only when resuming an earlier training run.
INITIAL_EPOCH = 0
STEPS_PER_EPOCH = 1000

# embedding -> stateful LSTM -> dense layer with one logit per vocabulary entry
model_lstm = tf.keras.models.Sequential([
    layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=EMBEDDING_SIZE,
        # fixed batch size (needed because the LSTM is stateful), free sequence length
        batch_input_shape=[train.BATCH_SIZE, None],
    ),
    layers.LSTM(
        units=UNITS,
        return_sequences=True,  # emit an output at every position
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal(),
    ),
    layers.Dense(VOCABULARY_SIZE),
])

# the Dense layer outputs raw logits, hence from_logits=True
model_lstm.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [None]:
# this will take a long time to run
# Keras trains from epoch index INITIAL_EPOCH up to (not including) index
# EPOCHS, so EPOCHS - INITIAL_EPOCH epochs are actually run; the weights are
# written by checkpoint_callback
history = model_lstm.fit(
    data_train,
    initial_epoch=INITIAL_EPOCH,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    verbose=True,
    callbacks=[checkpoint_callback],
)

# Model 2: GRU

In this section we train an alternative architecture, the gated recurrent unit (GRU). A GRU usually reaches good results in less training time than an LSTM. The tradeoff is that, given enough training, it will probably not perform quite as well as the LSTM.

In [None]:
# directory that receives the GRU weights
dir_gru = "weights/gru"

# "{epoch}" is a placeholder that ModelCheckpoint fills in with the epoch number
checkpoint = os.path.join(dir_gru, "checkpoint_{epoch}")

# callback that stores the weights only (not the full model);
# note that this rebinds `checkpoint` and `checkpoint_callback`
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint,
)

In [None]:
# constants for the GRU model
EMBEDDING_SIZE = 512
UNITS = 1024

# embedding -> stateful GRU -> dense layer with one logit per vocabulary entry
model_gru = tf.keras.models.Sequential([
    layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=EMBEDDING_SIZE,
        # fixed batch size (needed because the GRU is stateful), free sequence length
        batch_input_shape=[train.BATCH_SIZE, None],
    ),
    layers.GRU(
        units=UNITS,
        return_sequences=True,  # emit an output at every position
        stateful=True,
    ),
    layers.Dense(VOCABULARY_SIZE),
])

# the Dense layer outputs raw logits, hence from_logits=True
model_gru.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [None]:
# redefine (if desired)
EPOCHS = 5
# Keras numbers epochs from 0 and treats `epochs` as the index of the final
# epoch, so a fresh run must start at 0 to actually train for EPOCHS epochs.
INITIAL_EPOCH = 0
STEPS_PER_EPOCH = 1000

# this will take a long time to run
# train with the constants above (previously the call hard-coded a single
# epoch of a single step and ignored them); weights are written by
# checkpoint_callback
history = model_gru.fit(
    x=data_train,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    initial_epoch=INITIAL_EPOCH,
    callbacks=[checkpoint_callback],
    verbose=True,
)

# Text Generation

We construct new models that mirror the previous ones, except that the batch size has been set to 1. This will allow us to load the weights of the old models and generate new recipes, one at a time.

## LSTM

In [None]:
# redefine as needed; these must match the values used to build model_lstm,
# otherwise the saved weights will not fit this model
EMBEDDING_SIZE = 256
UNITS = 1024

# same architecture as model_lstm, but with a batch size of 1 so that
# recipes can be generated one at a time
generator_lstm = tf.keras.models.Sequential([
    layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=EMBEDDING_SIZE,
        batch_input_shape=[1, None],
    ),
    layers.LSTM(
        units=UNITS,
        return_sequences=True,
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal(),
    ),
    layers.Dense(VOCABULARY_SIZE),
])

# load weights
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of an obscure error
latest_lstm = tf.train.latest_checkpoint(dir_lstm)
if latest_lstm is None:
    raise FileNotFoundError(
        "no checkpoint found in {!r}; train the LSTM model first".format(dir_lstm)
    )
# expect_partial() silences complaints about checkpoint values that are not restored
generator_lstm.load_weights(latest_lstm).expect_partial()
generator_lstm.build(tf.TensorShape([1, None]))

# you may see lots of warnings. Do not panic!

In [None]:
# generate one sample with the LSTM generator and display it
# NOTE(review): the arguments after the model look like seed text, length and
# temperature -- confirm against train.generate in py/train.py
sample_lstm = train.generate(generator_lstm, "rice", 100, 0.8, tokenizer)
print(sample_lstm)

## GRU

In [None]:
# redefine; these must match the values used to build model_gru,
# otherwise the saved weights will not fit this model
EMBEDDING_SIZE = 512
UNITS = 1024

# same architecture as model_gru, but with a batch size of 1 so that
# recipes can be generated one at a time
generator_gru = tf.keras.models.Sequential([
    layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=EMBEDDING_SIZE,
        batch_input_shape=[1, None],
    ),
    layers.GRU(
        units=UNITS,
        return_sequences=True,
        stateful=True,
    ),
    layers.Dense(VOCABULARY_SIZE),
])

# load weights
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of an obscure error
latest_gru = tf.train.latest_checkpoint(dir_gru)
if latest_gru is None:
    raise FileNotFoundError(
        "no checkpoint found in {!r}; train the GRU model first".format(dir_gru)
    )
# expect_partial() silences complaints about checkpoint values that are not restored
generator_gru.load_weights(latest_gru).expect_partial()
generator_gru.build(tf.TensorShape([1, None]))

# you may see lots of warnings. Do not panic!

In [None]:
# generate one sample with the GRU generator and display it
# NOTE(review): the arguments after the model look like seed text, length and
# temperature -- confirm against train.generate in py/train.py
sample_gru = train.generate(generator_gru, "rice", 100, 0.8, tokenizer)
print(sample_gru)

## Saving the Models

Next, save each generator using the following code. Please be sure to change the filepath as needed. We used these models to produce the generator on our web application.

In [None]:
# save both models (change the target paths as needed)
# the generators are bound to the lowercase names generator_gru and
# generator_lstm; the previous mixed-case names raised a NameError
generator_gru.save("temp/gru")
generator_lstm.save("temp/lstm")

In [None]:
# rebuild both generators from the saved copies on disk
_load_model = tf.keras.models.load_model
generator_GRU_new = _load_model("temp/gru")
generator_LSTM_new = _load_model("temp/lstm")