In [None]:
import random

# Seed the module-level RNG of `random` so the title drawn further down in
# this cell is the same on every fresh run.
random.seed(1234)

def title_generator_v1(main, aux, adj):
    '''
    Generates a new recipe title of the form "<adj> <main> with <aux>".
    One entry is picked at random from each list, using the module-level
    `random` generator (so `random.seed` controls the outcome).
    Parameters:
        main (list): list of ingredients that can be the central ingredient
        aux (list): extra ingredients (the supporting cast)
        adj (list): potential adjectives to describe the dish
    Returns:
        string object containing the new recipe title.
    Raises:
        ValueError: if any of the three lists is empty.
    '''

    # Fail with a message that names the offending argument instead of the
    # generic "empty range" error an empty list used to trigger.
    for label, options in (("main", main), ("aux", aux), ("adj", adj)):
        if len(options) == 0:
            raise ValueError(f"'{label}' must contain at least one entry")

    # Keep the draw order main -> aux -> adj so the generator is consumed in
    # the same sequence as before.
    main_pick = random.choice(main)
    aux_pick = random.choice(aux)
    adj_pick = random.choice(adj)

    return f"{adj_pick} {main_pick} with {aux_pick}"

# Small hand-written word lists for trying out the v1 generator.
main = [
    "meatballs", "steak", "pasta",
    "lasagna", "ramen", "sandwich",
]
aux = [
    "spinach", "noodles", "rice",
    "vegetables", "carrots",
]
adj = [
    "curried", "epic", "roasted", "baked",
    "fried", "boiled", "creamed",
]

# Build one title from the lists above and show it.
new_recipe = title_generator_v1(main=main, aux=aux, adj=adj)
print(new_recipe)

In [None]:
import pandas as pd
import sqlite3
import numpy as np

def title_generator_v2(start, db_path="recipes1M.db"):
    '''
    Looks up existing recipe titles that contain a keyword.
    Nothing is generated here yet: the matching titles are returned as-is
    so later cells can work with them.
    Parameters:
        start (str): keyword to search for anywhere in the title. It is
            wrapped in "%" and matched with SQL LIKE, so any "%" or "_"
            inside it still acts as a wildcard.
        db_path (str): path of the SQLite database file that contains the
            "recipes" table.
    Returns:
        pandas DataFrame with a single "title" column, one row per match.
    '''

    # Function-scope import keeps this cell runnable on its own.
    from contextlib import closing

    # The keyword is bound as a query parameter instead of being formatted
    # into the SQL text, so quotes in `start` cannot alter the statement.
    cmd = """
        SELECT R.title
        FROM recipes R
        WHERE R.title LIKE ?
        """
    pattern = f"%{start}%"

    # A sqlite3 connection used directly as a context manager only manages
    # the transaction; closing() is what actually releases the connection.
    with closing(sqlite3.connect(db_path)) as conn:
        df = pd.read_sql_query(cmd, conn, params=(pattern,))

    return df

In [None]:
# Try the lookup: display the titles that contain "meatball".
title_generator_v2("meatball")

In [None]:
# learn about word embeddings (a strategy for encoding text for ML)
# https://www.tensorflow.org/tutorials/text/word_embeddings

In [None]:
# get some cleaned data
# Reuse the lookup helper rather than repeating its connection and query
# code here; it runs the same search for titles containing "potato".
df = title_generator_v2("potato")

df.head(5)

In [None]:
# the following code was taken from
# https://www.kdnuggets.com/2020/07/generating-cooking-recipes-using-tensorflow.html
# and has been lightly modified

In [None]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  '''
  Lowercases the text, replaces '<br />' with a space and removes punctuation.
  Parameters:
      input_data: the text to normalize; passed straight to tf.strings.lower
  Returns:
      the result of the final tf.strings.regex_replace call
  '''
  # No cell in this notebook imports `re` or `string`, so import them here
  # to avoid a NameError when the function is first called.
  import re
  import string

  # NOTE(review): `tf` is not imported in the visible cells either; an
  # `import tensorflow as tf` has to run before this function is called.

  # Character class matching any single ASCII punctuation character.
  punctuation_class = '[%s]' % re.escape(string.punctuation)

  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html, punctuation_class, '')


# Upper bound on the vocabulary, and the length of each vectorized sample.
vocab_size, sequence_length = 10000, 100

# Text vectorization layer: standardizes with custom_standardization, splits
# the text and maps the pieces to integers. A fixed output length is set
# because the samples do not all have the same length.
# NOTE(review): `TextVectorization` is not imported in the visible cells.
vectorize_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
    standardize=custom_standardization,
)

# Text-only dataset (no labels); adapt() builds the vocabulary from it.
# NOTE(review): `dataset` is not defined in the visible cells, and the map is
# an identity mapping - confirm what it is meant to hold.
text_ds = dataset.map(lambda sample: sample)
vectorize_layer.adapt(text_ds)

In [None]:
# Rebind `df` to the titles containing "salmon"; the word-list cell further
# down reads its "title" column.
df = title_generator_v2("salmon")
df.head()

In [None]:
# TODO: convert the titles to numbers
# TODO: pass the numeric data into an RNN

In [None]:
# Flatten the titles into one list of whitespace-separated words, in row order.
listOfWords = [word for title in df["title"] for word in title.split()]

# The distinct words.
setOfWords = set(listOfWords)

In [None]:
# Character used as the stop / padding sign.
stop = "]"

# Character-level tokenizer: no characters are filtered out and the text is
# lowercased before counting.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level = True,
    filters = '',
    lower = True,
    split = ''
)

# Stop word is not a part of recipes, but tokenizer must know about it as well.
# Without this call the stop sign only gets an index if it happens to occur in
# a title; otherwise tokenizer.texts_to_sequences([stop])[0] comes back empty
# and cannot serve as the padding value used later in the notebook.
tokenizer.fit_on_texts([stop])

tokenizer.fit_on_texts(listOfWords)

tokenizer.get_config()

In [None]:
# Occurrence counts gathered by fit_on_texts; the tokenizer is char-level, so
# the keys are single characters.
tokenizer.word_counts

In [None]:
# One more than the number of distinct characters counted, so that
# range(VOCABULARY_SIZE) also covers index 0, which the Keras Tokenizer
# reserves and never assigns to a token.
VOCABULARY_SIZE = len(tokenizer.word_counts) + 1

print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

In [None]:
# Decode each index in range(VOCABULARY_SIZE) as its own one-element sequence
# to list what every index maps back to.
array_vocabulary = tokenizer.sequences_to_texts(
    [[index] for index in range(VOCABULARY_SIZE)]
)
print(list(array_vocabulary))

In [None]:
# Encode every entry of listOfWords as a sequence of character indices (the
# tokenizer is char-level), giving one sequence per word.
dataset_vectorized = tokenizer.texts_to_sequences(listOfWords)

print('Vectorized dataset size', len(dataset_vectorized)) 

In [None]:
# Show only the first few sequences; echoing the whole list floods the output.
dataset_vectorized[:5]

In [None]:
MAX_RECIPE_LENGTH = 1000

# Look the stop sign's index up once and fail early with a clear message if
# the tokenizer does not know it; an empty lookup cannot be used for padding.
stop_sequence = tokenizer.texts_to_sequences([stop])[0]
if not stop_sequence:
    raise ValueError(
        f"stop sign {stop!r} is not in the tokenizer vocabulary; "
        "fit the tokenizer on it before padding"
    )
# pad_sequences takes a scalar fill value, so pass the index itself rather
# than the one-element list.
stop_index = stop_sequence[0]

dataset_vectorized_padded_without_stops = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized,
    padding='post',
    truncating='post',
    # Cut/pad to MAX_RECIPE_LENGTH-1 here and pad to MAX_RECIPE_LENGTH+1 in
    # the next step, so that every sequence ends with at least one stop sign
    # even after it is shifted and truncated later to build the X and Y
    # sequences.
    maxlen=MAX_RECIPE_LENGTH-1,
    value=stop_index
)

dataset_vectorized_padded = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized_padded_without_stops,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH+1,
    value=stop_index
)

# Spot-check the padded length of the first ten sequences.
for recipe_index, recipe in enumerate(dataset_vectorized_padded[:10]):
    print('Recipe #{} length: {}'.format(recipe_index, len(recipe)))

In [None]:
# Inspect the index sequence the tokenizer produces for the stop sign
# (presumably empty when the sign is not in its vocabulary - verify).
tokenizer.texts_to_sequences([stop])[0]

In [None]:
# end of code from
# https://www.kdnuggets.com/2020/07/generating-cooking-recipes-using-tensorflow.html