In [28]:
import random

In [None]:
random.seed(1234)

In [None]:
def title_generator_v1(main, aux, adj):
    '''
    Generates a new recipe title based on existing data.
    Parameters:
        main (list): list of ingredients that can be the central ingredient
        aux (list): extra ingredients (the supporting cast)
        adj (list): potential adjectives to describe the dish
    Returns:
        string object containing the new recipe title.
    '''
    
    ingrMain = random.randint(0, len(main) - 1)
    ingrAux = random.randint(0, len(aux) - 1)
    ingrAdj = random.randint(0, len(adj) -1)
    
    return adj[ingrAdj] + " " + main[ingrMain] + " with " + aux[ingrAux]

In [None]:
# create fake data
main = ["meatballs", "steak", "pasta", "lasagna", "ramen", "sandwich"]
aux = ["spinach", "noodles", "rice", "vegetables", "carrots"]
adj = ["curried", "epic", "roasted", "baked", "fried", "boiled", "creamed"]

In [None]:
new_recipe = title_generator_v1(main, aux, adj)
print(new_recipe)

In [None]:
# onto level 2

In [1]:
import pandas as pd
import sqlite3
import numpy as np

In [2]:
def title_generator_v2(start):
    '''
    Generates a new recipe title based on bigger set of raw data.
    Parameters:
        start (str): keyword for parsing the recipe data
    Returns:
        string object with new recipe title
    '''
    
    with sqlite3.connect("recipes1M.db") as conn:
        cmd = \
        f"""
        SELECT R.title
        FROM recipes R
        WHERE R.title LIKE "%{start}%"
        """
        df = pd.read_sql_query(cmd, conn)
    
    
    
    
    return df

In [None]:
title_generator_v2("meatball")

In [None]:
# learn about word embeddings (a strategy for encoding text for ML)
# https://www.tensorflow.org/tutorials/text/word_embeddings

In [None]:
# get some cleaned data
with sqlite3.connect("recipes1M.db") as conn:
    cmd = \
    f"""
    SELECT R.title
    FROM recipes R
    WHERE R.title LIKE "%potato%"
    """
    df = pd.read_sql_query(cmd, conn)

df.head(5)

In [3]:
#import io
#import os
#import re
#import shutil
#import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [None]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

In [None]:
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
result.shape

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((df.values))

In [None]:
# the following code was taken from
# https://www.kdnuggets.com/2020/07/generating-cooking-recipes-using-tensorflow.html
# and has been lightly modified

In [None]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = dataset.map(lambda x: x)
vectorize_layer.adapt(text_ds)

In [None]:
# title generation with RNNs

In [4]:
df = title_generator_v2("salmon")
df.head()

Unnamed: 0,title
0,Salmon & Salad a La SPORTZ
1,Curried Pumpkin and Smoked Salmon Soup
2,Grilled Rosemary Salmon Spedini
3,Garlic and Dill Salmon
4,Spicy Grilled Orange Salmon


In [8]:
df.shape

(10793, 1)

In [None]:
# convert to numbers
# pass into RNN

In [18]:
listOfWords = []
for i in range(0, df.shape[0]):
    curr = df["title"][i]
    listOfWords += curr.split()
    
setOfWords = set(listOfWords)

In [19]:
setOfWords

# strip punctuation and non-words

{'Craze-E',
 '4',
 'Ragu',
 'Rawas)',
 'sensation!',
 'Horseradish',
 'Tented',
 'Parpadelle',
 'Pesto-Crusted',
 'Slow',
 'Bean-Ancho',
 'Cream,',
 'Olive-Oil-Poached',
 '(Red',
 'Sub',
 'Dogs',
 'Charcoal',
 'Rye',
 "Ray's",
 'Settlement',
 'Dijonnaise',
 'Flat',
 'Dijon-Walnut',
 'Chive',
 'Crab)',
 'Pan-seared',
 'Evening',
 'Pinto',
 'Miso-Marinated',
 'Peoples',
 'Shallots',
 'Won',
 'Hoisin,',
 'Glenmachrie',
 'Shio-Koji',
 'Curry-Rubbed',
 'Tasso-Cured',
 '"meringues"',
 '(Lohipasteijat)',
 'Lemon-Dill-Caper',
 'Broth',
 'Most',
 'Bow',
 'Mughlai',
 "AB's",
 'Omelette',
 'Banh',
 'Scallop',
 'Shiso-flavored',
 'AVOCADO',
 'Gravlax',
 'Dukkah-Crusted',
 'Chile-Ginger',
 'Prunes',
 'Francais',
 'Trout,',
 'Toast)',
 'Slimmed',
 '(Lohitartar)',
 'sea',
 "Goodrich's",
 'Dill',
 'WAKAME',
 'Javanese',
 'Brei',
 'Snap',
 'Pine-Smoked',
 'New',
 'Mean',
 'Salmon-Scrambled',
 "d'Espelette",
 'Mustard-Roasted',
 'Chive-Mustard',
 'dijon',
 'Dill)',
 'Butterflied',
 'Us',
 'Croquettes',


In [35]:
textForMarkov = ""

for i in range(0, df.shape[0] - 10000):
    curr = df["title"][i]
    textForMarkov += curr + " "

len(textForMarkov)

26317

In [27]:
textForMarkov[0: 100]

'Salmon & Salad a La SPORTZ Curried Pumpkin and Smoked Salmon Soup Grilled Rosemary Salmon Spedini Ga'

In [55]:
def markovText(corpus, seed, n, length):
    nplus1 = {}
    for i in range(0, len(corpus)):
        if corpus[i:n+1] in nplus1.keys():
            nplus1[corpus[i:n+1]] += 1
        elif len(corpus[i:i+n]) == n:
            nplus1[corpus[i:n+1]] = 1
    
    print("starting text generation")
    
    fake_text = seed
    while len(fake_text) < length:
        print(len(fake_text))
        for keys in nplus1:
            #if keys == fake_text[-1 - n:-1]:
                
                choice = random.choices(list(nplus1.keys()), weights = nplus1.values())
                print(choice)
                fake_text += choice[len(choice) - 1]

    return fake_text

In [56]:
newText = markovText(textForMarkov, "salmon", 5, 1000)

starting text generation
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
['']
['']
['']
['']
['']
6
['']
['']
[

['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']
['']
['']
['']
['']
['']
['']
10
['']

['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['

['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['']
['']
['']
11
['']
['']
['']
['']
['on']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']


['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['']
['']
['']
['']
['']
13
['']
['']
['lmon']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
['']
['']
['']
['']
17
['']
['']
['']
[''

KeyboardInterrupt: 

In [73]:
stop = "]"

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level = True,
    filters = '',
    lower = True,
    split = ''
)

# Stop word is not a part of recipes, but tokenizer must know about it as well.
#tokenizer.fit_on_texts([stop])



tokenizer.fit_on_texts(listOfWords)

tokenizer.get_config()

{'num_words': None,
 'filters': '',
 'lower': True,
 'split': '',
 'char_level': True,
 'oov_token': None,
 'document_count': 53159,
 'word_counts': '{"s": 29168, "a": 35510, "l": 23174, "m": 17976, "o": 24193, "n": 22397, "&": 407, "d": 13393, "p": 7343, "r": 15273, "t": 14903, "z": 992, "c": 10783, "u": 6601, "i": 17435, "e": 27854, "k": 4621, "g": 5225, "y": 2786, "w": 5985, "h": 8825, "b": 4885, ",": 765, "v": 1405, "f": 2550, "\'": 421, "(": 277, "3": 8, ")": 269, "j": 312, "/": 105, "x": 115, ".": 70, "q": 441, "-": 1936, "\\"": 81, "!": 62, "[": 1, "]": 1, "#": 18, "5": 22, "8": 1, "4": 5, "*": 10, ":": 36, "9": 1, "1": 17, "0": 14, "7": 3, "2": 24, "?": 1, "=": 1, "_": 1, "6": 1, ";": 1, "`": 1}',
 'word_docs': '{"a": 30819, "s": 26928, "o": 22106, "l": 20517, "n": 21417, "m": 17589, "&": 407, "d": 13086, "z": 877, "p": 6401, "r": 13815, "t": 12624, "i": 16085, "e": 22429, "c": 9902, "u": 6144, "k": 4590, "g": 4635, "y": 2752, "h": 8536, "w": 5958, "b": 4404, ",": 765, "v": 139

In [59]:
tokenizer.word_counts

OrderedDict([('s', 29168),
             ('a', 35510),
             ('l', 23174),
             ('m', 17976),
             ('o', 24193),
             ('n', 22397),
             ('&', 407),
             ('d', 13393),
             ('p', 7343),
             ('r', 15273),
             ('t', 14903),
             ('z', 992),
             ('c', 10783),
             ('u', 6601),
             ('i', 17435),
             ('e', 27854),
             ('k', 4621),
             ('g', 5225),
             ('y', 2786),
             ('w', 5985),
             ('h', 8825),
             ('b', 4885),
             (',', 765),
             ('v', 1405),
             ('f', 2550),
             ("'", 421),
             ('(', 277),
             ('3', 8),
             (')', 269),
             ('j', 312),
             ('/', 105),
             ('x', 115),
             ('.', 70),
             ('q', 441),
             ('-', 1936),
             ('"', 81),
             ('!', 62),
             ('[', 1),
             (']', 1),

In [60]:
VOCABULARY_SIZE = len(tokenizer.word_counts) + 1

print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

VOCABULARY_SIZE:  57


In [61]:
array_vocabulary = tokenizer.sequences_to_texts([[word_index] for word_index in range(VOCABULARY_SIZE)])
print([char for char in array_vocabulary])

['', 'a', 's', 'e', 'o', 'l', 'n', 'm', 'i', 'r', 't', 'd', 'c', 'h', 'p', 'u', 'w', 'g', 'b', 'k', 'y', 'f', '-', 'v', 'z', ',', 'q', "'", '&', 'j', '(', ')', 'x', '/', '"', '.', '!', ':', '2', '5', '#', '1', '0', '*', '3', '4', '7', '[', ']', '8', '9', '?', '=', '_', '6', ';', '`']


In [62]:
dataset_vectorized = tokenizer.texts_to_sequences(listOfWords)

print('Vectorized dataset size', len(dataset_vectorized)) 

Vectorized dataset size 53159


In [63]:
dataset_vectorized

[[2, 1, 5, 7, 4, 6],
 [28],
 [2, 1, 5, 1, 11],
 [1],
 [5, 1],
 [2, 14, 4, 9, 10, 24],
 [12, 15, 9, 9, 8, 3, 11],
 [14, 15, 7, 14, 19, 8, 6],
 [1, 6, 11],
 [2, 7, 4, 19, 3, 11],
 [2, 1, 5, 7, 4, 6],
 [2, 4, 15, 14],
 [17, 9, 8, 5, 5, 3, 11],
 [9, 4, 2, 3, 7, 1, 9, 20],
 [2, 1, 5, 7, 4, 6],
 [2, 14, 3, 11, 8, 6, 8],
 [17, 1, 9, 5, 8, 12],
 [1, 6, 11],
 [11, 8, 5, 5],
 [2, 1, 5, 7, 4, 6],
 [2, 14, 8, 12, 20],
 [17, 9, 8, 5, 5, 3, 11],
 [4, 9, 1, 6, 17, 3],
 [2, 1, 5, 7, 4, 6],
 [9, 4, 1, 2, 10],
 [2, 1, 5, 7, 4, 6],
 [16, 8, 10, 13],
 [2, 14, 8, 12, 3, 11],
 [12, 4, 12, 4, 6, 15, 10],
 [12, 9, 15, 7, 18, 2],
 [18, 1, 19, 3, 11],
 [2, 1, 5, 7, 4, 6],
 [16, 8, 10, 13],
 [10, 4, 7, 1, 10, 4, 3, 2, 25],
 [2, 14, 8, 6, 1, 12, 13],
 [28],
 [7, 15, 2, 13, 9, 4, 4, 7, 2],
 [17, 9, 8, 5, 5, 3, 11],
 [2, 1, 5, 7, 4, 6, 25],
 [6, 4, 9, 10, 13, 16, 3, 2, 10],
 [2, 10, 20, 5, 3],
 [2, 7, 4, 19, 3, 11],
 [2, 1, 5, 7, 4, 6],
 [28],
 [12, 13, 8, 23, 3],
 [12, 9, 3, 1, 7],
 [12, 13, 3, 3, 2, 3],
 [12, 9, 

In [74]:
MAX_RECIPE_LENGTH = 1000

dataset_vectorized_padded_without_stops = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized,
    padding='post',
    truncating='post',
    # We use -1 here and +1 in the next step to make sure
    # that all recipes will have at least 1 stops sign at the end,
    # since each sequence will be shifted and truncated afterwards
    # (to generate X and Y sequences).
    maxlen=MAX_RECIPE_LENGTH-1,
    value=tokenizer.texts_to_sequences([stop])[0]
)

dataset_vectorized_padded = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized_padded_without_stops,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH+1,
    value=tokenizer.texts_to_sequences([stop])[0]
)

for recipe_index, recipe in enumerate(dataset_vectorized_padded[:10]):
    print('Recipe #{} length: {}'.format(recipe_index, len(recipe)))

Recipe #0 length: 1001
Recipe #1 length: 1001
Recipe #2 length: 1001
Recipe #3 length: 1001
Recipe #4 length: 1001
Recipe #5 length: 1001
Recipe #6 length: 1001
Recipe #7 length: 1001
Recipe #8 length: 1001
Recipe #9 length: 1001


In [67]:
tokenizer.texts_to_sequences([stop])[0]

[]

In [75]:
dataset = tf.data.Dataset.from_tensor_slices(dataset_vectorized)

print(dataset)

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [None]:
# end of code from
# https://www.kdnuggets.com/2020/07/generating-cooking-recipes-using-tensorflow.html