# Preprocess Checklist

In this notebook we build the ingredient processing system using spaCy.

In [None]:
# Colab-only: mount Google Drive at /content/drive so the Drive-backed
# paths used later in the notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import os

import re
import spacy
import pickle
from collections import namedtuple

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
# Directory on the mounted Drive that holds the input pickle and every
# intermediate/output pickle written by this notebook.
CACHE_DIR = './drive/Shared drives/Capstone/tmp'
# exist_ok=True tolerates an already-existing directory; parents are NOT
# created, so the parent path must already exist (i.e. Drive is mounted).
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
# Input file: the recipe texts loaded in the next cell.
dataset_path = os.path.join(CACHE_DIR, 'emoji_text_recipes.pkl')

In [None]:
# Load the recipe texts, or stop the notebook with a pointer to the notebook
# that generates the file when it is missing.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - only load files
# from a trusted location.
if os.path.exists(dataset_path):
    recipes = pd.read_pickle(dataset_path)
else:
    raise SystemExit("Run preprocess_rnn_word.ipynb to generate data file before continuing")

In [None]:
# Display the loaded recipes (pandas abbreviates long Series in the output).
recipes

0         🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 ...
1         🍴 Awesome Slow Cooker Pot Roast\n\n🥑\n• 2 (10....
2         🍴 Brown Sugar Meatloaf\n\n🥑\n• 1/2 cup packed ...
3         🍴 Best Chocolate Chip Cookies\n\n🥑\n• 1 cup bu...
4         🍴 Homemade Mac and Cheese Casserole\n\n🥑\n• 8 ...
                                ...                        
125158    🍴 Cream Horns\n\n🥑\n• 1 sheet frozen puff past...
125159    🍴 Summer Corn Salad\n\n🥑\n• 4 ears fresh corn\...
125160    🍴 Zucchini Stuffed Tomatoes\n\n🥑\n• 4 large pl...
125162    🍴 Chocolate Cake with Armagnac Ice Cream\n\n🥑\...
125163    🍴 Crabby Bisque\n\n🥑\n• 3 (10.5-ounce) cans re...
Length: 105789, dtype: object

In [None]:
# Keep only the first 20000 recipes (positional slice) to make the dataset smaller.
# NOTE: this rebinds `recipes`; every later cell sees the truncated Series.
recipes = recipes[:20000]

In [None]:
# Show one full recipe text. NOTE: on this integer-indexed Series, [0] looks up
# the index LABEL 0, not the first position; the index has gaps after filtering.
recipes[0]

'🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 skinless, boneless chicken breast halves\n• 2 tablespoons butter\n• 2 (10.75 ounce) cans condensed cream of chicken soup\n• 1 onion, finely diced\n• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces\n\n🥣\n‣ Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\n‣ Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.'

# Process the recipes to get the ingredients!

The same cells also extract each recipe's title and split its instructions into individual steps.

In [None]:
class IngredientProcessor(object):
  '''
    Reduces free-text ingredient lines (e.g. "2 (10 ounce) cans condensed
    cream of chicken soup") to short canonical ingredient names.

    The word lists are defined once on the class; __init__ gives every
    instance its own copies, so instances (and their pickles) stay
    self-contained and can be tweaked without affecting each other.
  '''

  # Units of measure and size words that never name an ingredient.
  MEASURES = set(["tbsp", "tablespoon", "tablespoons",
            "tsp", "teaspoon", "teaspoons",
            "fl", "oz", "ounce", "ounces",
            "lb", "pound", "pounds",
            "cm", "centimeter", "centimeters", "centimetre", "centimetres",
            "inch", "inches",
            "can", "cans",
            "cup", "cups",
            "pint", "pints",
            "quart", "quarts",
            "gallon", "gallons",
            "ml", "milliliter", "millilitre", "milliliters", "millilitres",
            "l", "liter", "litre", "liters", "litres",
            "pkg", "package", "packages",
            "piece", "pieces",
            "slice", "slices",
            "small", "medium", "large",
            # "sliced", "diced", "minced", "chopped", "deveined",
            "extra"])

  # Head nouns that are too generic on their own ("sauce", "powder", ...):
  # when a phrase ends in one of these, the preceding word is kept as well.
  # Every entry needs its own comma - adjacent string literals are silently
  # concatenated ("peppers" "casing" would become "pepperscasing").
  CONTEXT_NEEDERS = set(["extract", "root", "sauce", "cream",
            "broth", "soup", "soda", "oil", "puree", "powder",
            "mix", "roast", "paste",
            "chip", "chips",
            "bean", "beans",
            "pepper", "peppers",
            "casing", "casings"])

  # Preparation words, qualifiers and colours that are stripped before parsing.
  ADJECTIVES = set([
            'canned', 'softened', 'diced', 'chopped', 'semisweet', 'thawed', 'frozen', 'minced', 'peeled', 'seeded', 'prepared',
            'melted', 'pitted', 'uncooked', 'cooked', 'squeezed', 'lean', 'boneless', 'ground', 'divided', 'refrigerated',
            'skinless', 'crushed', 'grated', 'trimmed', 'sifted', 'all-purpose', 'allpurpose', 'drained', 'mashed',
            'rinsed', 'shredded', 'hulled', 'dry', 'dried', 'deveined', 'packed', 'fresh', 'freshly', 'sliced', 'halved',
            'washed', 'sweetened', 'unsweetened', 'extract', 'vegetable', 'large', 'small', 'tiny', 'fine', 'finely', 'beaten',
            'bite-sized', 'bitesized', 'skinned', 'optional', 'toasted', 'lightly', 'loosely', 'juiced', 'cut', 'necessary', 'desired',
            'quartered', 'slightly', 'coarsely', 'thin', 'chilled', 'scrambled', 'soft', 'hard', 'short', 'medium', 'long',
            'white', 'red', 'orange', 'yellow', 'green', 'blue', 'brown', 'black'])

  # Remaining filler words that showed up in ingredient lines.
  OTHER_PROBLEM_WORDS = set(["room", "temperature", "deep", "frying", 'if', 'to', 'into', 'taste', 'more', 'each', 'about', 'removed'])

  # Everything that is removed from an ingredient line before parsing.
  ALL_PROBLEM_WORDS = MEASURES.union(ADJECTIVES, OTHER_PROBLEM_WORDS)

  def __init__(self, use_cache=True):
    '''
      use_cache: bool -- memoise cleanup_ingredient results per filtered string

      Copies the class-level word lists onto the instance and loads the
      spaCy pipeline used for noun-chunk extraction.
    '''
    self.MEASURES = set(IngredientProcessor.MEASURES)
    self.CONTEXT_NEEDERS = set(IngredientProcessor.CONTEXT_NEEDERS)
    self.ADJECTIVES = set(IngredientProcessor.ADJECTIVES)
    self.OTHER_PROBLEM_WORDS = set(IngredientProcessor.OTHER_PROBLEM_WORDS)
    self.ALL_PROBLEM_WORDS = self.MEASURES.union(self.ADJECTIVES, self.OTHER_PROBLEM_WORDS)
    # The parser is deliberately not in the disable list: noun_chunks needs it.
    self.nlp = spacy.load("en_core_web_sm", disable=["textcat", "ner", "entity_ruler", "sentencizer", "merge_noun_chunks", "merge_entities", "merge_subtokens"])
    self.use_cache = use_cache
    # Maps the filtered ingredient string -> list of deduced ingredient names.
    self.cache = {}

  def deduce_core_ingredient(self, ingr_phrase):
    '''
      ingr_phrase: str
      returns: str

      Attempts to deduce a "canonical form" of the ingredient
      contained in ingr_phrase: normally its last word, or its last two
      words when the last word is in CONTEXT_NEEDERS
      (e.g. "chili powder" rather than just "powder").
    '''
    words_in_phrase = ingr_phrase.split(" ")
    if len(words_in_phrase) >= 2 and words_in_phrase[-1].lower() in self.CONTEXT_NEEDERS:
      return " ".join(words_in_phrase[-2:])
    return words_in_phrase[-1]

  def cleanup_ingredient(self, ingredient):
    '''
      ingredient: str
      returns: List[str]

      Cleans up ingredient string and returns a list of ingredients
      in canonical form (possibly empty).

      When caching is enabled the returned list is the cached object
      itself, so callers should not mutate it.
    '''
    # Uses regex to remove parenthesised portions, digits and punctuation,
    # https://www.kite.com/python/answers/how-to-use-regular-expressions-to-remove-text-within-parentheses-in-python
    ingredient = re.sub(r"\([^()]*\)|[0-9]|/|\.|,|'|\"", "", ingredient).lower()

    # Filter out measure words, adjectives, filler words, and other problem words.
    # split()/join() also normalises whitespace, so no extra strip is needed.
    ingredient = " ".join(word for word in ingredient.split() if word not in self.ALL_PROBLEM_WORDS)

    # Remember the key used for the lookup: the fallback branch below derives
    # a shortened string, and the result must be stored under the SAME key
    # that later lookups will use.
    cache_key = ingredient
    if self.use_cache and cache_key in self.cache:
      return self.cache[cache_key]

    # Apply spacy to get the most important noun phrase (hopefully)
    ingr_doc = self.nlp(ingredient)
    noun_phrases = [chunk.text for chunk in ingr_doc.noun_chunks]

    if noun_phrases:
      ingr_list = [self.deduce_core_ingredient(phrase) for phrase in noun_phrases]
    else:
      # No noun chunk found: keep only what precedes the first ' - ',
      # ' and ' or ' or ', and don't emit empty-string ingredients.
      fallback = ingredient.split(' - ')[0].split(' and ')[0].split(' or ')[0]
      ingr_list = [fallback] if fallback else []

    if self.use_cache:
      self.cache[cache_key] = ingr_list

    return ingr_list

In [None]:
def flatten(lists):
  '''
    lists: Iterable[Iterable]
    returns: List

    Concatenates the items of every sub-iterable, in order,
    into one flat list (one level of nesting only).
  '''
  flat = []
  for sublist in lists:
    flat.extend(sublist)
  return flat

# Record returned by process_recipe: the title string, the list of cleaned
# ingredient names and the list of instruction steps.
ProcessedRecipe = namedtuple('ProcessedRecipe', ['title', 'ingredients', 'instructions'])

# Single module-level processor used by the functions below; its cache is
# therefore shared by every call made in this notebook.
IngrProc = IngredientProcessor()

def process_recipe(recipe):
  '''
    recipe: str -- one recipe text with the title after 🍴, the "•"-bulleted
            ingredients after 🥑 and the "‣"-bulleted instructions after 🥣
    returns: ProcessedRecipe(title, ingredients, instructions)

    Splits the recipe into its three sections, canonicalises every
    ingredient line with IngrProc and returns the parts together.
    Raises ValueError if the text does not split into exactly four
    pieces on the three section markers.
  '''
  _, title, ingredients, instructions = re.split("🍴|🥑|🥣", recipe)

  # Process title
  title = title.strip()

  # Process ingredients. Strip BEFORE testing for emptiness so that
  # whitespace-only fragments are dropped instead of surviving as ''.
  ingredient_lines = (line.strip() for line in ingredients.replace('\n', '').split("•"))
  cleaned_up_ingrs = flatten([IngrProc.cleanup_ingredient(line) for line in ingredient_lines if line])

  # Process instructions (same strip-then-filter order).
  instruction_steps = (step.strip() for step in instructions.replace('\n', '').split("‣"))
  cleaned_up_instrs = [step for step in instruction_steps if step]

  return ProcessedRecipe(title, cleaned_up_ingrs, cleaned_up_instrs)

def extract_title(recipe):
  '''
    recipe: str
    returns: str

    Returns the text between the 🍴 and 🥑 markers with surrounding
    whitespace removed. The unpacking raises ValueError unless the
    text splits into exactly four pieces on the section markers.
  '''
  _preamble, raw_title, _ingredients, _instructions = re.split("🍴|🥑|🥣", recipe)
  return raw_title.strip()

def extract_cleaned_ingredients(recipe):
  '''
    recipe: str
    returns: List[str]

    Takes the section between the 🥑 and 🥣 markers, splits it on the "•"
    bullets and returns the canonical ingredient names that IngrProc
    deduces for every non-empty line, flattened into one list.
  '''
  _, _, ingredients, _ = re.split("🍴|🥑|🥣", recipe)
  # Strip BEFORE testing for emptiness so whitespace-only fragments are
  # dropped rather than handed to the ingredient processor as ''.
  ingredient_lines = (line.strip() for line in ingredients.replace('\n', '').split("•"))
  cleaned_up_ingrs = flatten([IngrProc.cleanup_ingredient(line) for line in ingredient_lines if line])
  return cleaned_up_ingrs

def extract_cleaned_instructions(recipe):
  '''
    recipe: str
    returns: List[str]

    Takes the section after the 🥣 marker, removes newlines, splits it on
    the "‣" bullets and returns the non-empty steps with surrounding
    whitespace removed.
  '''
  _, _, _, instructions = re.split("🍴|🥑|🥣", recipe)
  # Strip BEFORE testing for emptiness so whitespace-only fragments are
  # dropped instead of ending up as '' steps.
  steps = (step.strip() for step in instructions.replace('\n', '').split("‣"))
  cleaned_up_instrs = [step for step in steps if step]
  return cleaned_up_instrs

### Silly testing and timing stuff.

In [None]:
# Collect the distinct ingredient names found in the first 200 recipes;
# the tqdm bar doubles as a rough timing of the ingredient processing.
ingr_set2 = set()

for recipe in tqdm(recipes[:200]):
  ingr_set2.update(extract_cleaned_ingredients(recipe))

len(ingr_set2)

100%|██████████| 200/200 [00:48<00:00,  4.12it/s]


291

In [None]:
# Inspect the distinct ingredient names collected from the first 200 recipes.
ingr_set2

{'-half',
 'allspice',
 'almond',
 'almonds',
 'anchovy paste',
 'apple',
 'apples',
 'avocados',
 'bacon',
 'baguette',
 'bakers',
 'baking powder',
 'baking soda',
 'bananas',
 'barbecue sauce',
 'barbeque sauce',
 'basil',
 'basil stems removed',
 'bay leaves',
 'beans',
 'beef',
 'beef broth',
 'beer',
 'bell pepper',
 'bell peppers',
 'bell peppers roughly',
 'bite-size',
 'bits',
 'blueberries',
 'boiling',
 'bouillon',
 'brand',
 'bread',
 'bread mix',
 'breast',
 'breasts',
 'broccoli',
 'broth',
 'brownie mix',
 'buffalo',
 'bunch',
 'buns',
 'butter',
 'buttermilk',
 'cabbage',
 'cake mix',
 'capers',
 'carrots',
 'casings',
 'cauliflower',
 'cayenne pepper',
 'celery',
 'cheese',
 'chicken',
 'chicken breast halves',
 'chicken breast meat',
 'chicken broth',
 'chicken soup',
 'chickens',
 'chile peppers',
 'chile-garlic sauce',
 'chiles',
 'chili beans',
 'chili peppers',
 'chili powder',
 'chili sauce',
 'chilies',
 'chipotle pepper',
 'chives',
 'chocolate',
 'chocolate ch

In [None]:
# Re-process the first 10 recipes: their ingredient strings were already
# seen in the 200-recipe run above, so this shows the speed of cache hits.
ingr_set3 = set()

for recipe in tqdm(recipes[:10]):
  ingr_set3.update(extract_cleaned_ingredients(recipe))

len(ingr_set3)

100%|██████████| 10/10 [00:00<00:00, 2367.92it/s]


52

In [None]:
# Inspect the processor's cache: filtered ingredient string -> list of deduced names.
IngrProc.cache

{'active yeast': ['yeast'],
 'allspice': ['allspice'],
 'almond': ['almond'],
 'almonds blanched and slivered': ['almonds'],
 'anchovy paste': ['anchovy paste'],
 'and cubed butternut squash': ['squash'],
 'and potatoes': ['potatoes'],
 'angel hair pasta': ['pasta'],
 'apples': ['apples'],
 'apples cored and': ['apples'],
 'arborio rice': ['rice'],
 'artichoke hearts and': ['hearts'],
 'assorted food coloring': ['food'],
 'avocados - and': ['avocados'],
 'bacon': ['bacon'],
 'bacon bits': ['bits'],
 'bacon in half': ['bacon', 'half'],
 'bag coleslaw mix': ['coleslaw mix'],
 'bag corn chips such as fritos®': ['corn chips', 'fritos'],
 'bakers semi-sweet baking chocolate': ['bakers', 'chocolate'],
 'baking potatoes': ['potatoes'],
 'baking powder': ['baking powder'],
 'baking soda': ['baking soda'],
 'balsamic syrup': ['syrup'],
 'balsamic vinegar': ['vinegar'],
 'bananas': ['bananas'],
 'basil': ['basil'],
 'basil leaves': ['leaves'],
 'basil stems removed': ['basil stems removed'],
 'b

In [None]:
# Run the full process_recipe pipeline over `recipes` and collect every
# distinct ingredient name it produces.
ingr_set = set()
for recipe in tqdm(recipes):
  ingr_set.update(process_recipe(recipe).ingredients)

len(ingr_set)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


215

In [None]:
# Inspect the distinct ingredient names produced by process_recipe.
ingr_set

# Practice

Apply the processing to a small slice of the recipes (700 of them) as a trial run before processing the full dataset.

In [None]:
# Positional slice: the 700 recipes at positions 300-999, used as a small trial set.
tiny_recipes = recipes[300:1000]

In [None]:
# Preview the trial slice.
tiny_recipes

302     🍴 Slow Cooker Texas Pulled Pork\n\n🥑\n• 1 teas...
303     🍴 Yellow Squash Casserole\n\n🥑\n• 4 cups slice...
304     🍴 Simple Turkey Chili\n\n🥑\n• 1 1/2 teaspoons ...
305     🍴 Creamy Chocolate Frosting\n\n🥑\n• 2 3/4 cups...
306     🍴 Orzo with Parmesan and Basil\n\n🥑\n• 2 table...
                              ...                        
1001    🍴 Chicken Wing Dip\n\n🥑\n• 2 (8 ounce) package...
1002    🍴 Black Bean Brownies\n\n🥑\n• 1 (15.5 ounce) c...
1003    🍴 Salmon with Brown Sugar Glaze\n\n🥑\n• 1/4 cu...
1004    🍴 Strawberry Angel Food Dessert\n\n🥑\n• 1 (10 ...
1005    🍴 Pineapple Chicken Tenders\n\n🥑\n• 1 cup pine...
Length: 700, dtype: object

In [None]:
# One-column DataFrame of raw recipe text; derived columns are added to it below.
df_tiny_recipes = pd.DataFrame({'recipe': tiny_recipes})

In [None]:
# Parse every trial recipe row by row into a ProcessedRecipe.
# The result is only displayed here, not assigned to a name.
df_tiny_recipes.apply(lambda row: process_recipe(row['recipe']), axis=1)

KeyboardInterrupt: ignored

In [None]:
# Compute the parsed recipes explicitly instead of reading IPython's `_`
# (last cell output): `_` depends on which cell happened to run last and
# does not hold this result when the previous cell was interrupted or the
# notebook is re-run from a fresh kernel.
shj = df_tiny_recipes.apply(lambda row: process_recipe(row['recipe']), axis=1)

In [None]:
# Title of each trial recipe, derived from the raw text column.
df_tiny_recipes['title'] = df_tiny_recipes['recipe'].apply(extract_title)

In [None]:
# List of canonical ingredient names for each trial recipe.
df_tiny_recipes['ingredients'] = df_tiny_recipes['recipe'].apply(extract_cleaned_ingredients)

In [None]:
# List of instruction steps for each trial recipe.
df_tiny_recipes['instructions'] = df_tiny_recipes['recipe'].apply(extract_cleaned_instructions)

In [None]:
# Preview the trial DataFrame with its derived columns.
df_tiny_recipes

Unnamed: 0,recipe
302,🍴 Slow Cooker Texas Pulled Pork\n\n🥑\n• 1 teas...
303,🍴 Yellow Squash Casserole\n\n🥑\n• 4 cups slice...
304,🍴 Simple Turkey Chili\n\n🥑\n• 1 1/2 teaspoons ...
305,🍴 Creamy Chocolate Frosting\n\n🥑\n• 2 3/4 cups...
306,🍴 Orzo with Parmesan and Basil\n\n🥑\n• 2 table...
...,...
1001,🍴 Chicken Wing Dip\n\n🥑\n• 2 (8 ounce) package...
1002,🍴 Black Bean Brownies\n\n🥑\n• 1 (15.5 ounce) c...
1003,🍴 Salmon with Brown Sugar Glaze\n\n🥑\n• 1/4 cu...
1004,🍴 Strawberry Angel Food Dessert\n\n🥑\n• 1 (10 ...


In [None]:
# Save the trial DataFrame next to the input data.
# NOTE: this rebinds dataset_path, which until now pointed at the input
# pickle ('emoji_text_recipes.pkl'); re-running the load cell after this
# point would read the trial file instead.
dataset_path = os.path.join(CACHE_DIR, 'tiny_cleaned_recipes.pkl')
df_tiny_recipes.to_pickle(dataset_path)

In [None]:
# Read the just-written pickle back - presumably a round-trip check; the
# result is not used again in the visible cells.
df_tiny_recipes_unpickled = pd.read_pickle(dataset_path)

In [None]:
# Same ingredient extraction as before, but with a tqdm progress bar
# (progress_apply is registered by tqdm.pandas() in the imports cell).
df_tiny_recipes['ingredients'] = df_tiny_recipes['recipe'].progress_apply(extract_cleaned_ingredients)

100%|██████████| 700/700 [02:32<00:00,  4.59it/s]


# Now actually process all the recipes

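This processes every recipe currently held in `recipes`: only the first 20,000 if the truncation cell near the top has been run, otherwise the whole dataset (the recorded run below covers 105,789 recipes).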

In [None]:
# One-column DataFrame holding the raw text of every recipe in `recipes`.
df_recipes = pd.DataFrame({'recipe': recipes})

In [None]:
# Number of recipes that the next cell will process.
len(df_recipes)

105789

In [None]:
# Derive the title / ingredients / instructions columns for every recipe,
# then persist both the DataFrame and the ingredient processor (with its cache).
# NOTE(review): the recorded output of this cell ends in a NameError whose
# cause is not visible here - confirm both pickles are written on a fresh run.

print("Extracting titles")
df_recipes['title'] = df_recipes['recipe'].progress_apply(extract_title)
print("Extracting and processing ingredients")
df_recipes['ingredients'] = df_recipes['recipe'].progress_apply(extract_cleaned_ingredients)
print("Extracting instructions")
df_recipes['instructions'] = df_recipes['recipe'].progress_apply(extract_cleaned_instructions)

# Save the processed recipes first, so they are on disk before the
# processor itself is pickled.
df_recipe_dataset_path = os.path.join(CACHE_DIR, 'ingr_cleaned_recipes_20210525.pkl')
df_recipes.to_pickle(df_recipe_dataset_path)

ingr_proc_dataset_path = os.path.join(CACHE_DIR, 'ingr_proc_20210525.pkl')
with open(ingr_proc_dataset_path, "wb") as ingr_proc_fn:
  pickle.dump(IngrProc, ingr_proc_fn)

  4%|▍         | 4206/105789 [00:00<00:02, 42055.43it/s]

Extracting titles


100%|██████████| 105789/105789 [00:02<00:00, 35585.55it/s]
  0%|          | 0/105789 [00:00<?, ?it/s]

Extracting and processing ingredients


100%|██████████| 105789/105789 [4:37:11<00:00,  6.36it/s]
  0%|          | 1/105789 [00:00<3:46:54,  7.77it/s]

Extracting instructions


100%|██████████| 105789/105789 [00:03<00:00, 27168.06it/s]


NameError: ignored

In [None]:
# df_recipe_dataset_path = os.path.join(CACHE_DIR, 'ingr_cleaned_recipes_20210525.pkl')
# df_recipes.to_pickle(df_recipe_dataset_path)

# ingr_proc_dataset_path = os.path.join(CACHE_DIR, 'ingr_proc_20210525.pkl')
# with open(ingr_proc_dataset_path, "wb") as ingr_proc_fn:
#   pickle.dump(IngrProc, ingr_proc_fn)