# RNN Test: Recetas de cocina

Basado en: https://www.kdnuggets.com/2020/07/generating-cooking-recipes-using-tensorflow.html

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import json

import platform
import time
import pathlib
import os

# Obtener los datos

In [2]:
CACHE_DIR = './tmp'
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)

dataset_file_name = 'recipes_raw.zip'
dataset_file_origin = 'https://storage.googleapis.com/recipe-box/recipes_raw.zip'

dataset_file_path = tf.keras.utils.get_file(
    fname=dataset_file_name,
    origin=dataset_file_origin,
    cache_dir=CACHE_DIR,
    extract=True,
    archive_format='zip'
)

print(dataset_file_path)

./tmp\datasets\recipes_raw.zip


In [3]:
!dir "tmp/datasets"

 El volumen de la unidad C es Windows
 El n£mero de serie del volumen es: 9E21-842D

 Directorio de C:\Users\Chris-Brota\Desktop\Scripts_Brota\RNN_tests\tmp\datasets

21-07-2021  15:33    <DIR>          .
21-07-2021  15:33    <DIR>          ..
23-07-2021  16:16            20.437 LICENSE
21-07-2021  15:33        53.355.492 recipes_raw.zip
23-07-2021  16:16        49.784.325 recipes_raw_nosource_ar.json
23-07-2021  16:16        61.133.971 recipes_raw_nosource_epi.json
23-07-2021  16:16        93.702.755 recipes_raw_nosource_fn.json
               5 archivos    257.996.980 bytes
               2 dirs  163.900.944.384 bytes libres


# Cargando los datos

In [4]:
def load_dataset(silent=False):
    # List of dataset files we want to merge.
    dataset_file_names = [
        'recipes_raw_nosource_ar.json',
        'recipes_raw_nosource_epi.json',
        'recipes_raw_nosource_fn.json',
    ]
    
    dataset = []

    for dataset_file_name in dataset_file_names:
        dataset_file_path = f'{CACHE_DIR}/datasets/{dataset_file_name}'

        with open(dataset_file_path) as dataset_file:
            json_data_dict = json.load(dataset_file)
            json_data_list = list(json_data_dict.values())
            dict_keys = [key for key in json_data_list[0]]
            dict_keys.sort()
            dataset += json_data_list

            # This code block outputs the summary for each dataset.
            if silent == False:
                print(dataset_file_path)
                print('===========================================')
                print('Number of examples: ', len(json_data_list), '\n')
                print('Example object keys:\n', dict_keys, '\n')
                print('Example object:\n', json_data_list[0], '\n')
                print('Required keys:\n')
                print('  title: ', json_data_list[0]['title'], '\n')
                print('  ingredients: ', json_data_list[0]['ingredients'], '\n')
                print('  instructions: ', json_data_list[0]['instructions'])
                print('\n\n')

    return dataset  

dataset_raw = load_dataset() 

./tmp/datasets/recipes_raw_nosource_ar.json
Number of examples:  39802 

Example object keys:
 ['ingredients', 'instructions', 'picture_link', 'title'] 

Example object:
 {'title': 'Slow Cooker Chicken and Dumplings', 'ingredients': ['4 skinless, boneless chicken breast halves ADVERTISEMENT', '2 tablespoons butter ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT', '1 onion, finely diced ADVERTISEMENT', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT', 'ADVERTISEMENT'], 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n', 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S'} 

Required keys:

  title:  Slow Cooker Chicken and Dumplings 

  ingredients:  ['4 skinless, boneless chicke

In [5]:
print('Total number of raw examples: ', len(dataset_raw))

Total number of raw examples:  125164


# Validando y podando los datos

In [6]:
def recipe_validate_required_fields(recipe):
    required_keys = ['title', 'ingredients', 'instructions']
    
    if not recipe:
        return False
    
    for required_key in required_keys:
        if not recipe[required_key]:
            return False
        
        if type(recipe[required_key]) == list and len(recipe[required_key]) == 0:
            return False
    
    return True

In [7]:
dataset_validated = [recipe for recipe in dataset_raw if recipe_validate_required_fields(recipe)]

print('Dataset size BEFORE validation', len(dataset_raw))
print('Dataset size AFTER validation', len(dataset_validated))
print('Number of incomplete recipes', len(dataset_raw) - len(dataset_validated))

Dataset size BEFORE validation 125164
Dataset size AFTER validation 122938
Number of incomplete recipes 2226


# Acondicionar el texto: agregar puntos de referencia para facilitar entrenamiento RNN

In [8]:
STOP_WORD_TITLE = '📗 '
STOP_WORD_INGREDIENTS = '\n🥕\n\n'
STOP_WORD_INSTRUCTIONS = '\n📝\n\n'

In [9]:
def recipe_to_string(recipe):
    # This string is presented as a part of recipes so we need to clean it up.
    noize_string = 'ADVERTISEMENT'
    
    title = recipe['title']
    ingredients = recipe['ingredients']
    instructions = recipe['instructions'].split('\n')
    
    ingredients_string = ''
    for ingredient in ingredients:
        ingredient = ingredient.replace(noize_string, '')
        if ingredient:
            ingredients_string += f'• {ingredient}\n'
    
    instructions_string = ''
    for instruction in instructions:
        instruction = instruction.replace(noize_string, '')
        if instruction:
            instructions_string += f'▪︎ {instruction}\n'
    
    return f'{STOP_WORD_TITLE}{title}\n{STOP_WORD_INGREDIENTS}{ingredients_string}'\
           f'{STOP_WORD_INSTRUCTIONS}{instructions_string}'

In [10]:
dataset_stringified = [recipe_to_string(recipe) for recipe in dataset_validated]
print('Stringified dataset size: ', len(dataset_stringified))

Stringified dataset size:  122938


In [11]:
for recipe_index, recipe_string in enumerate(dataset_stringified[:3]):
    print('Recipe #{}\n---------'.format(recipe_index + 1))
    print(recipe_string)
    print('\n')
    
print(dataset_stringified[50000])

Recipe #1
---------
📗 Slow Cooker Chicken and Dumplings

🥕

• 4 skinless, boneless chicken breast halves 
• 2 tablespoons butter 
• 2 (10.75 ounce) cans condensed cream of chicken soup 
• 1 onion, finely diced 
• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces 

📝

▪︎ Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.
▪︎ Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.



Recipe #2
---------
📗 Awesome Slow Cooker Pot Roast

🥕

• 2 (10.75 ounce) cans condensed cream of mushroom soup 
• 1 (1 ounce) package dry onion soup mix 
• 1 1/4 cups water 
• 5 1/2 pounds pot roast 

📝

▪︎ In a slow cooker, mix cream of mushroom soup, dry onion soup mix and water. Place pot roast in slow cooker and coat with soup mixture.
▪︎ Cook on High setting for 3 to 4 hours, or on Low setting for 8 to 9 hours.



Recipe 

# Filtrando el largo de las recetas

In [None]:
recipes_lengths = []
for recipe_text in dataset_stringified:
    recipes_lengths.append(len(recipe_text))

plt.hist(recipes_lengths, bins=50)
plt.show()

In [None]:
plt.hist(recipes_lengths, range=(0, 8000), bins=50)
plt.show()

In [None]:
MAX_RECIPE_LENGTH = 2000

def filter_recipes_by_length(recipe_test):
    return len(recipe_test) <= MAX_RECIPE_LENGTH 

dataset_filtered = [recipe_text for recipe_text in dataset_stringified if filter_recipes_by_length(recipe_text)]

print('Dataset size BEFORE filtering: ', len(dataset_stringified))
print('Dataset size AFTER filtering: ', len(dataset_filtered))
print('Number of eliminated recipes: ', len(dataset_stringified) - len(dataset_filtered)) 

In [None]:
TOTAL_RECIPES_NUM = len(dataset_filtered)

print('MAX_RECIPE_LENGTH: ', MAX_RECIPE_LENGTH)
print('TOTAL_RECIPES_NUM: ', TOTAL_RECIPES_NUM)

# Creación del vocabulario

In [None]:
STOP_SIGN = '␣'

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,
    filters='',
    lower=False,
    split=''
)

# Stop word is not a part of recipes, but tokenizer must know about it as well.
tokenizer.fit_on_texts([STOP_SIGN])

tokenizer.fit_on_texts(dataset_filtered)

tokenizer.get_config()

In [None]:
# Se agrega uno al vocabulario ya que el índice 0 corresponde a un índice 
# reservado que no se asigna a ninguna palabra
VOCABULARY_SIZE = len(tokenizer.word_counts) + 1
print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

In [None]:
print(tokenizer.index_word[5])
print(tokenizer.index_word[8])
print(tokenizer.index_word[20])

print(tokenizer.word_index['r'])

In [None]:
# Esto permite ilustrar que caracteres están en las recetas en nuestro dataset, 
# los cuales se printean en un dataset
array_vocabulary = tokenizer.sequences_to_texts([[word_index] 
                                                 for word_index in range(VOCABULARY_SIZE)])
print([char for char in array_vocabulary])

In [None]:
# Ahora pasando un texto a índices
tokenizer.texts_to_sequences(['📗 yes'])

# Vectorizando el conjunto de datos

In [21]:
dataset_vectorized = tokenizer.texts_to_sequences(dataset_filtered)

print('Vectorized dataset size', len(dataset_vectorized)) 

Vectorized dataset size 100212


In [22]:
print(dataset_vectorized[0][:10], '...') 

[51, 1, 33, 10, 5, 23, 1, 35, 5, 5] ...


In [23]:
def recipe_sequence_to_string(recipe_sequence):
    recipe_stringified = tokenizer.sequences_to_texts([recipe_sequence])[0]
    print(recipe_stringified)

recipe_sequence_to_string(dataset_vectorized[0])

📗   S l o w   C o o k e r   C h i c k e n   a n d   D u m p l i n g s 
 
 🥕 
 
 •   4   s k i n l e s s ,   b o n e l e s s   c h i c k e n   b r e a s t   h a l v e s   
 •   2   t a b l e s p o o n s   b u t t e r   
 •   2   ( 1 0 . 7 5   o u n c e )   c a n s   c o n d e n s e d   c r e a m   o f   c h i c k e n   s o u p   
 •   1   o n i o n ,   f i n e l y   d i c e d   
 •   2   ( 1 0   o u n c e )   p a c k a g e s   r e f r i g e r a t e d   b i s c u i t   d o u g h ,   t o r n   i n t o   p i e c e s   
 
 📝 
 
 ▪ ︎   P l a c e   t h e   c h i c k e n ,   b u t t e r ,   s o u p ,   a n d   o n i o n   i n   a   s l o w   c o o k e r ,   a n d   f i l l   w i t h   e n o u g h   w a t e r   t o   c o v e r . 
 ▪ ︎   C o v e r ,   a n d   c o o k   f o r   5   t o   6   h o u r s   o n   H i g h .   A b o u t   3 0   m i n u t e s   b e f o r e   s e r v i n g ,   p l a c e   t h e   t o r n   b i s c u i t   d o u g h   i n   t h e   s l o w   c o o k e r .   C o o k   u n 

# Hacer que las recetas tengan el mismo largo (padding)

In [24]:
for recipe_index, recipe in enumerate(dataset_vectorized[:10]):
    print('Recipe #{} length: {}'.format(recipe_index + 1, len(recipe)))

Recipe #1 length: 546
Recipe #2 length: 401
Recipe #3 length: 671
Recipe #4 length: 736
Recipe #5 length: 1518
Recipe #6 length: 740
Recipe #7 length: 839
Recipe #8 length: 667
Recipe #9 length: 1264
Recipe #10 length: 854


In [25]:
dataset_vectorized_padded_without_stops = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized,
    padding='post',
    truncating='post',
    # We use -1 here and +1 in the next step to make sure
    # that all recipes will have at least 1 stops sign at the end,
    # since each sequence will be shifted and truncated afterwards
    # (to generate X and Y sequences).
    maxlen=MAX_RECIPE_LENGTH-1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0])

dataset_vectorized_padded = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized_padded_without_stops,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH+1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0])

for recipe_index, recipe in enumerate(dataset_vectorized_padded[:10]):
    print('Recipe #{} length: {}'.format(recipe_index, len(recipe)))

Recipe #0 length: 2001
Recipe #1 length: 2001
Recipe #2 length: 2001
Recipe #3 length: 2001
Recipe #4 length: 2001
Recipe #5 length: 2001
Recipe #6 length: 2001
Recipe #7 length: 2001
Recipe #8 length: 2001
Recipe #9 length: 2001


In [27]:
recipe_sequence_to_string(dataset_vectorized_padded[0])

📗   S l o w   C o o k e r   C h i c k e n   a n d   D u m p l i n g s 
 
 🥕 
 
 •   4   s k i n l e s s ,   b o n e l e s s   c h i c k e n   b r e a s t   h a l v e s   
 •   2   t a b l e s p o o n s   b u t t e r   
 •   2   ( 1 0 . 7 5   o u n c e )   c a n s   c o n d e n s e d   c r e a m   o f   c h i c k e n   s o u p   
 •   1   o n i o n ,   f i n e l y   d i c e d   
 •   2   ( 1 0   o u n c e )   p a c k a g e s   r e f r i g e r a t e d   b i s c u i t   d o u g h ,   t o r n   i n t o   p i e c e s   
 
 📝 
 
 ▪ ︎   P l a c e   t h e   c h i c k e n ,   b u t t e r ,   s o u p ,   a n d   o n i o n   i n   a   s l o w   c o o k e r ,   a n d   f i l l   w i t h   e n o u g h   w a t e r   t o   c o v e r . 
 ▪ ︎   C o v e r ,   a n d   c o o k   f o r   5   t o   6   h o u r s   o n   H i g h .   A b o u t   3 0   m i n u t e s   b e f o r e   s e r v i n g ,   p l a c e   t h e   t o r n   b i s c u i t   d o u g h   i n   t h e   s l o w   c o o k e r .   C o o k   u n 

# Transformar a `Tensorflow Dataset`

In [28]:
dataset = tf.data.Dataset.from_tensor_slices(dataset_vectorized_padded)
print(dataset)

<TensorSliceDataset shapes: (2001,), types: tf.int32>


In [29]:
for recipe in dataset.take(1):
    print('Raw recipe:\n', recipe.numpy(), '\n\n\n')
    print('Stringified recipe:\n')
    recipe_sequence_to_string(recipe.numpy())

Raw recipe:
 [ 51   1  33 ... 165 165 165] 



Stringified recipe:

📗   S l o w   C o o k e r   C h i c k e n   a n d   D u m p l i n g s 
 
 🥕 
 
 •   4   s k i n l e s s ,   b o n e l e s s   c h i c k e n   b r e a s t   h a l v e s   
 •   2   t a b l e s p o o n s   b u t t e r   
 •   2   ( 1 0 . 7 5   o u n c e )   c a n s   c o n d e n s e d   c r e a m   o f   c h i c k e n   s o u p   
 •   1   o n i o n ,   f i n e l y   d i c e d   
 •   2   ( 1 0   o u n c e )   p a c k a g e s   r e f r i g e r a t e d   b i s c u i t   d o u g h ,   t o r n   i n t o   p i e c e s   
 
 📝 
 
 ▪ ︎   P l a c e   t h e   c h i c k e n ,   b u t t e r ,   s o u p ,   a n d   o n i o n   i n   a   s l o w   c o o k e r ,   a n d   f i l l   w i t h   e n o u g h   w a t e r   t o   c o v e r . 
 ▪ ︎   C o v e r ,   a n d   c o o k   f o r   5   t o   6   h o u r s   o n   H i g h .   A b o u t   3 0   m i n u t e s   b e f o r e   s e r v i n g ,   p l a c e   t h e   t o r n   b i s c u i t 

# Obtener las entradas y salidas de este sistema

In [30]:
def split_input_target(recipe):
    input_text = recipe[:-1]
    target_text = recipe[1:]
    
    return input_text, target_text

dataset_targeted = dataset.map(split_input_target)
print(dataset_targeted)

<MapDataset shapes: ((2000,), (2000,)), types: (tf.int32, tf.int32)>


In [31]:
for input_example, target_example in dataset_targeted.take(1):
    print('Input sequence size:', repr(len(input_example.numpy())))
    print('Target sequence size:', repr(len(target_example.numpy())))
    print()
    
    input_stringified = tokenizer.sequences_to_texts([input_example.numpy()[:50]])[0]
    target_stringified = tokenizer.sequences_to_texts([target_example.numpy()[:50]])[0]
    
    print('Input:  ', repr(''.join(input_stringified)))
    print('Target: ', repr(''.join(target_stringified))) 

Input sequence size: 2000
Target sequence size: 2000

Input:   '📗   S l o w   C o o k e r   C h i c k e n   a n d   D u m p l i n g s \n \n 🥕 \n \n •   4   s k i n l e'
Target:  '  S l o w   C o o k e r   C h i c k e n   a n d   D u m p l i n g s \n \n 🥕 \n \n •   4   s k i n l e s'


In [32]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:10], target_example[:10])):
    print('Step {:2d}'.format(i + 1))
    print('  input: {} ({:s})'.format(input_idx, repr(tokenizer.sequences_to_texts([[input_idx.numpy()]])[0])))
    print('  expected output: {} ({:s})'.format(target_idx, repr(tokenizer.sequences_to_texts([[target_idx.numpy()]])[0])))

Step  1
  input: 51 ('📗')
  expected output: 1 (' ')
Step  2
  input: 1 (' ')
  expected output: 33 ('S')
Step  3
  input: 33 ('S')
  expected output: 10 ('l')
Step  4
  input: 10 ('l')
  expected output: 5 ('o')
Step  5
  input: 5 ('o')
  expected output: 23 ('w')
Step  6
  input: 23 ('w')
  expected output: 1 (' ')
Step  7
  input: 1 (' ')
  expected output: 35 ('C')
Step  8
  input: 35 ('C')
  expected output: 5 ('o')
Step  9
  input: 5 ('o')
  expected output: 5 ('o')
Step 10
  input: 5 ('o')
  expected output: 25 ('k')


# Separar la base de datos en batches

In [33]:
# Batch size.
BATCH_SIZE = 64

# Buffer size to shuffle the dataset (TF data is designed to work
# with possibly infinite sequences, so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in
# which it shuffles elements).
SHUFFLE_BUFFER_SIZE = 1000

dataset_train = dataset_targeted.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).repeat()

print(dataset_train)

<RepeatDataset shapes: ((64, 2000), (64, 2000)), types: (tf.int32, tf.int32)>


In [34]:
for input_text, target_text in dataset_train.take(1):
    print('1st batch: input_text:', input_text)
    print()
    print('1st batch: target_text:', target_text)

1st batch: input_text: tf.Tensor(
[[ 51   1  33 ... 165 165 165]
 [ 51   1  45 ... 165 165 165]
 [ 51   1  35 ... 165 165 165]
 ...
 [ 51   1  44 ... 165 165 165]
 [ 51   1  63 ... 165 165 165]
 [ 51   1  35 ... 165 165 165]], shape=(64, 2000), dtype=int32)

1st batch: target_text: tf.Tensor(
[[  1  33  10 ... 165 165 165]
 [  1  45   3 ... 165 165 165]
 [  1  35   3 ... 165 165 165]
 ...
 [  1  44  23 ... 165 165 165]
 [  1  63   7 ... 165 165 165]
 [  1  35   8 ... 165 165 165]], shape=(64, 2000), dtype=int32)
