# What's in this notebook?
Preparing data for analysis - extracting the ingredients & including as features, getting features from the directions (such as oven temperature), etc.

In [223]:
import pickle
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook
import seaborn as sns

In [7]:
with open('../data/dataframe.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    data = pickle.load(f)

In [8]:
data

Unnamed: 0,directions,ingredients,link,nutrition,rating
0,[Preheat oven to 350 degrees F (175 degrees C)...,"[1 (9 inch) pie shell, 1 (8 ounce) package cre...",https://www.allrecipes.com/recipe/16052/paradi...,"{'calories': '361 calories;', 'fatContent': '1...",4.4
1,[Preheat oven to 425 degrees F (220 degrees C....,[1 recipe pastry for a 9 inch single crust pie...,https://www.allrecipes.com/recipe/12141/moms-p...,"{'calories': '345 calories;', 'fatContent': '1...",4.5
2,[Preheat oven to 425 degrees F (220 degrees C)...,"[1 (15 ounce) can pumpkin puree, 3 egg yolks, ...",https://www.allrecipes.com/recipe/230132/chef-...,"{'calories': '320 calories;', 'fatContent': '1...",4.8
3,[Preheat oven to 350 degrees F (175 degrees C)...,"[2 prepared pie crusts, 3 eggs, divided, 1 (8 ...",https://www.allrecipes.com/recipe/219025/mini-...,"{'calories': '138 calories;', 'fatContent': '8...",4.8
4,[Line one 9 inch pie pan with whole gingersnap...,"[1 cup pumpkin puree, 3 eggs, 1/2 cup white su...",https://www.allrecipes.com/recipe/9121/dads-pu...,"{'calories': '343 calories;', 'fatContent': '1...",4.4
5,[Preheat oven to 375 degrees F (190 degrees C)...,[1 recipe pastry for a 9 inch single crust pie...,https://www.allrecipes.com/recipe/13504/gourme...,"{'calories': '310 calories;', 'fatContent': '1...",4.0
6,[Preheat oven to 375 degrees F (190 degrees C)...,"[1 (9 inch) unbaked pie crust, 3/4 cup toffee ...",https://www.allrecipes.com/recipe/24198/pumpki...,"{'calories': '504 calories;', 'fatContent': '2...",4.5
7,[Preheat oven to 350 degrees F (175 degrees C)...,"[1 (9 inch) pie shell, 1 (8 ounce) package cre...",https://www.allrecipes.com/recipe/15947/old-fa...,"{'calories': '430 calories;', 'fatContent': '2...",4.6
8,"[Preheat oven to 425 degrees F. Whisk pumpkin,...","[1 (15 ounce) can pumpkin, 1 (14 ounce) can EA...",https://www.allrecipes.com/recipe/23439/perfec...,"{'calories': '379 calories;', 'fatContent': '1...",4.7
9,"[Preheat oven to 425 degrees F., Combine sugar...","[1 (9 inch) unbaked deep dish pie crust, 3/4 c...",https://www.allrecipes.com/recipe/22755/libbys...,"{'calories': '283 calories;', 'fatContent': '1...",4.8


In [92]:
# ingredients = set()
# for ingredient_list in data.ingredients:
#     ingredients.add
    
ingredients = [ingredient for ingredient_list in data.ingredients for ingredient in ingredient_list]

In [14]:
ingredients

['1 (9 inch) pie shell',
 '1 (8 ounce) package cream cheese, softened',
 '1/4 cup white sugar',
 '1/2 teaspoon vanilla extract',
 '1 egg, beaten',
 '1 1/4 cups pumpkin puree',
 '1 cup evaporated milk',
 '1/2 cup white sugar',
 '2 eggs, beaten',
 '1 teaspoon ground cinnamon',
 '1/4 teaspoon ground ginger',
 '1/4 teaspoon ground nutmeg',
 '1 pinch salt',
 '1/4 cup maple syrup',
 '1 recipe pastry for a 9 inch single crust pie',
 '3 eggs',
 '1 egg yolk',
 '1/2 cup white sugar',
 '1/2 cup packed brown sugar',
 '1 teaspoon salt',
 '1/2 teaspoon ground cinnamon',
 '1/2 teaspoon ground nutmeg',
 '1/2 teaspoon ground ginger',
 '1/4 teaspoon ground cloves',
 '1 1/2 cups milk',
 '1/2 cup heavy whipping cream',
 '2 cups pumpkin puree',
 '1 (15 ounce) can pumpkin puree',
 '3 egg yolks',
 '1 large egg',
 '1 (14 ounce) can sweetened condensed milk',
 '1 teaspoon ground cinnamon',
 '1/2 teaspoon ground ginger',
 '1/2 teaspoon fine salt',
 '1/4 teaspoon freshly grated nutmeg',
 '1/8 teaspoon Chinese 5-

Great! Now we have a whole list of ingredients from every single recipe! The next few things I want to do is get rid of some weird things, like how the ingredients are sometimes separated into different parts. For example, the scraped ingredients sometimes list a component of the recipe, like "walnut topping", as an ingredient, when it's really a title of a group of ingredients, so we'll get rid of those. Also, I want to get rid of the measurements and just get the ingredients themselves. 

To get rid of the measurements, I'll create a list of measurements, like 'teaspoon', I'll remove numbers, and I'll remove things in parenthases (which generally denote measurements like how many ounces should be in a can or the diameter of a pie crust). 

In [229]:
# will need to check for plural versions as well
measurements = ['teaspoon', 'cup', 'tablespoon', 'gram', 'ounce', 'pinch', 
                'canned', 'can', 'package', 'size', 'container', 'inch',
               'fluid ounce', 'packets', 'pack', 'jar', 'granuled', 'large',
               'sheets', 'recipe', 'medium', 'pint', 'solid', 'tub', 'slice',
               'envelope', 'deep dish', 'as needed'] 
preparations = ['packed', 'chopped', 'diced', 'softened', 'beaten', 
                'lightly', 'cold', 'room temperature', 'thawed', 'frozen',
               'unbaked', 'baked', 'finely', 'melted', 'cubed', 'seeded',
               'halved', 'divided', 'mashed', 'cooked', 'warmed', 'crushed',
               'prepared']
stopwords = ['for', 'at', 'the', ' a ', 'of', ' s ', ' ed', 'and']
brands = ['karo', 'spice island', 'carnation', 'eagle', 'planters', 
          'mccormick', 'jell o']

new_ingredients  = []

for ingredient in tqdm_notebook(ingredients):
    # get rid of ingredients that aren't actually ingredients, but titles
    if ':' in ingredient:
        pass
    else:
        # get rid of words and other characters specified above
        for punc_or_num in "',./1234567890%":
            ingredient = ingredient.replace(punc_or_num, '')
            
        # just for '-' replace with a space
        ingredient = ingredient.replace('-', ' ')
        
        for measurement in measurements:
            ingredient = re.sub(f"\\b{measurement}s?\\b", '', ingredient)

        for preparation in preparations:
            ingredient = re.sub(f"\\b{preparation}s?\\b", '', ingredient)

        for word in stopwords:
            ingredient = re.sub(f"\\b{word}s?\\b", '', ingredient)

        for brand in brands:
            ingredient = re.sub(f"^{brand}|\\b{brand}s?\\b", '', ingredient.lower())
        
        
        # get rid of stuff in the parentheticals
        ingredient = re.sub('\(.*\)', '', ingredient)
        
        # get rid of leading spaces or endings
        ingredient = re.sub('^ed', '', ingredient)
        ingredient = re.sub('^s | s\s?$', '', ingredient)
        ingredient = re.sub('^\s+|\s+$', '', ingredient)
        
        # replace egg with eggs for consistency
        ingredient = re.sub('egg', 'eggs', ingredient)
    
        if ingredient != '':
            new_ingredients.append(ingredient)
        

HBox(children=(IntProgress(value=0, max=1487), HTML(value='')))




In [230]:
new_ingredients

['pie shell',
 'cream cheese',
 'white sugar',
 'vanilla extract',
 'eggs',
 'pumpkin puree',
 'evaporated milk',
 'white sugar',
 'eggss',
 'ground cinnamon',
 'ground ginger',
 'ground nutmeg',
 'salt',
 'maple syrup',
 'pastry  a   single crust pie',
 'eggss',
 'eggs yolk',
 'white sugar',
 'brown sugar',
 'salt',
 'ground cinnamon',
 'ground nutmeg',
 'ground ginger',
 'ground cloves',
 'milk',
 'heavy whipping cream',
 'pumpkin puree',
 'pumpkin puree',
 'eggs yolks',
 'eggs',
 'sweetened condensed milk',
 'ground cinnamon',
 'ground ginger',
 'fine salt',
 'freshly grated nutmeg',
 'chinese  spice powder',
 'pie crust',
 'pie crusts',
 'eggss',
 'cream cheese',
 'white sugar',
 'pumpkin',
 'vanilla extract',
 'pumpkin pie spice',
 'pumpkin puree',
 'eggss',
 'white sugar',
 'milk',
 'salt',
 'pumpkin pie spice',
 'butter',
 'unflavored gelatin',
 'water',
 'white sugar',
 'gingersnap cookies',
 'pastry  a   single crust pie',
 'pecans',
 'pumpkin puree',
 'eggs',
 'sweetened cond

In [232]:
#pd.Series(new_ingredients).value_counts()[pd.Series(new_ingredients).value_counts() > 5].plot('bar')