In [160]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
from nltk import PorterStemmer
import json
import re
from bs4 import BeautifulSoup
import requests
import inflect
import itertools
p = inflect.engine()
%matplotlib inline

stop_words = set(stopwords.words('english'))

In [2]:
# Test dataframe with only yummly data

# cuisines = []
# flavors = []
# ingredients = []
# name = []
# source = []
# for filename in os.listdir('data/metadata27638'):
#     if filename.endswith('.json'):
#         data = json.load(open('data/metadata27638/'+filename))
#         cuisines.append(data['attributes']['cuisine'][0])
#         flavors.append(data['flavors'])
#         ingredientStr = ''
#         for ingred in data['ingredientLines']:
#             ingredientStr = ingredientStr + ingred
#         ingredients.append(ingredientStr)
#         name.append(data['name'])
#         source.append(data['source']['sourceDisplayName'])
        
# df = pd.DataFrame({'cuisine':cuisines,'name':name,'flavors':flavors,'ingredients':ingredients,'source':source})

In [3]:
# Reading in data and checking if name and ingredient lists exist

names = []
ingreds = []

for filename in os.listdir('data/recipe_box'):
    if filename.endswith('.json'):
        data = list(json.load(open('data/recipe_box/'+filename)).values())
        for recipe in data:
            if ('title' not in recipe.keys() or 'ingredients' not in recipe.keys()):
                continue
            names.append(recipe['title'])
            ingreds.append(recipe['ingredients'])

In [130]:
# Creating data frame with dish names and ingredients

df = pd.DataFrame({'name':names,'ingredients':ingreds})

In [131]:
df.shape

(124647, 2)

In [138]:
# Remove empty ingredient lists

df = df[df.astype('str')['ingredients']!='[]']
df.shape

(122971, 2)

In [6]:
# Example of ingredient list

df['ingredients'][10]

['3 to 4 cups chicken stock, preferably homemade, recipe follows',
 '1 quart Roasted Winter Vegetables, recipe follows',
 'Kosher salt and freshly ground black pepper',
 '3 (5-pound) roasting chickens',
 '3 large yellow onions, unpeeled, quartered',
 '6 carrots, unpeeled, halved',
 '4 celery stalks with leaves, cut in thirds',
 '4 parsnips, unpeeled, cut in 1/2, optional',
 '20 sprigs fresh parsley',
 '15 sprigs fresh thyme',
 '20 sprigs fresh dill',
 '1 head garlic, unpeeled, cut in 1/2 crosswise',
 '2 tablespoons kosher salt',
 '2 teaspoons whole black peppercorns',
 '1 pound carrots, peeled',
 '1 pound parsnips, peeled',
 '1 large sweet potato, peeled',
 '1 small butternut squash (about 2 pounds), peeled and seeded',
 '3 tablespoons good olive oil',
 '1 1/2 teaspoons kosher salt',
 '1/2 teaspoon freshly ground black pepper',
 '2 tablespoons chopped fresh flat-leaf parsley']

In [139]:
# Example of ingredient list with ADVERTISEMENT tags

df['ingredients'][100000]

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [176]:
testlist = df['ingredients'][100000]
testlist

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [177]:
# Scraping corpus of words related to measurements

page = requests.get('https://www.enchantedlearning.com/wordlist/measurement.shtml')
soup = BeautifulSoup(page.content, "html.parser")
measure_corpus = [tag.text for tag in soup.find_all('div',attrs={'class':'wordlist-item'})]
measure_corpus = measure_corpus + [text+'s' for text in measure_corpus] + ['taste','strip', 'strips', 'package' + 'packages' + 'satchet' + 'satchets' + 'sprigs','head','bunch','small','large','big','medium']

In [188]:
def clean_recipe(ingred_list):
    cleanedtext = []
    for ingred in ingred_list:
        # Ignore extra information
        matchtext = re.sub(r' \([^)]*\)', '', ingred)
        
        # Obtain all before first comma
        if re.compile('^(.+?)(?=,)').search(matchtext) is not None:
            matchtext = re.compile('^(.+?)(?=,)').search(matchtext).group(1)
            matchtext = matchtext.replace(' ADVERTISEMENT','')
            
        # Ignore advertisement tag
        elif (ingred == 'ADVERTISEMENT'):
            continue
        
        # If all text, make sure no advertisement tag
        else:
            matchtext = matchtext.replace(' ADVERTISEMENT','')
        
        # Tokenize ingredient list
        tokenized = word_tokenize(matchtext)
        
        # Remove words likely to be stop words or measurements
        removed_stop = [w for w in tokenized if not w in measure_corpus]
        removed_stop = [w for w in removed_stop if not w in stop_words]
        
        # Filter adjectives and nouns
        ingred_words = lambda pos: pos[:2] in ['JJ','NN','NNS']
        ingreds = [word.lower() for (word, pos) in pos_tag(removed_stop) if ingred_words(pos)]
        
        # Convert to singular
        ingreds = [p.singular_noun(word) if p.singular_noun(word) else word for word in ingreds]
        
        # Remove common ingredients 
        common = ['water','salt','pepper']
        cleanedtext.append(ingreds)
        cleanedtext = [[ing for ing in ingreds if not any(word in common for word in ingreds)] for ingreds in cleanedtext]
        
        # Remove additional descriptors for long ingredient names
        cleanedtext = [ingreds[-2:] if len(ingreds) > 2 else ingreds for ingreds in cleanedtext]
   
    return [(' ').join(item) for item in cleanedtext if len(item)>0]
        
#     return cleanedtext

# test_lst = [['unsalted', 'butter'],
#  ['all-purpose', 'flour'],
#  ['whole', 'milk'],
#  ['monterey', 'jack', 'cheese'],
#  ['poblano', 'chile'],
#  ['salt', 'black', 'pepper'],
#  ['cilantro', 'leaf'],
#  ['dijon', 'mustard'],
#  ['whole-grain', 'mustard'],
#  ['honey'],
#  ['warm', 'water'],
#  ['light', 'brown', 'muscovado', 'sugar'],
#  ['package', 'active', 'dry', 'yeast'],
#  ['unsalted', 'butter'],
#  ['kosher', 'salt'],
#  ['all-purpose', 'flour'],
#  ['vegetable', 'oil'],
#  ['water'],
#  ['baking', 'soda'],
#  ['whole', 'egg'],
#  ['coarse', 'sea', 'salt']]
# common = ['water','salt','pepper']
# common_filter = lambda lst: any(word in common for word in lst)

# test2 = [[ing for ing in ingreds if not any(word in common for word in ingreds)] for ingreds in test_lst]
# [(' ').join(item) for item in test2 if len(item)>0]

# [ingred for ingred in test_lst if not any(word in common for word in test_lst)]

In [192]:
clean_recipe(df['ingredients'][3420])

['cooking spray',
 'bread crumb',
 'olive oil',
 'turkey chicken',
 'pasta',
 'parmesan',
 'white cheddar',
 'cherry tomato',
 'broccoli',
 'asparagu']

In [180]:
#regex test
# re.compile('(?<=\d )(?!.*\d)(.+?)(?=[^a-zA-Z ])').search('1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT').group(1)

In [182]:
df['ingredients'][3440]

['3 tablespoons finely chopped red onions',
 '2 tablespoons fresh cilantro, finely chopped',
 '1 tablespoon olive oil',
 '3 plum tomatoes, seeded and finely diced',
 'One 15-ounce can black beans, drained, rinsed well anddrained again',
 '1 serrano chile, finely diced',
 'Juice of 1 lime',
 'Salt and freshly ground black pepper',
 '1 tablespoon fresh lime juice',
 '3 large egg yolks, lightly beaten',
 '1 1/2 sticks unsalted butter, melted until foamy',
 "1/4 cup to 1/2 cup your favorite steak sauce, such as Bobby Flay's Mesa Steak Sauce",
 '1 teaspoon kosher salt',
 '1/4 teaspoon freshly ground black pepper',
 '2 cloves garlic, smashed to a paste',
 '1 stick unsalted butter, softened',
 'Salt and freshly ground black pepper',
 '4 slices pain de mie, sliced 1-inch thick',
 'One 1-pound boneless ribeye, about 1 1/2 inches thick, cut into eight 1/2-inch-thick slices',
 "3 tablespoons spice rub, such as Bobby Flay's Steak Rub for Beef and Pork",
 'Kosher salt and freshly ground black peppe

In [159]:
crossjoin = [1,2,3,4,5]

In [163]:
list(itertools.combinations(['unsalted butter',
 'creole seasoning',
 'lemon',
 'chive',
 'lemon',
 'compound butter',
 'fresh skinles',
 'creole seasoning',
 'whole chive'],2))

[('unsalted butter', 'creole seasoning'),
 ('unsalted butter', 'lemon'),
 ('unsalted butter', 'chive'),
 ('unsalted butter', 'lemon'),
 ('unsalted butter', 'compound butter'),
 ('unsalted butter', 'fresh skinles'),
 ('unsalted butter', 'creole seasoning'),
 ('unsalted butter', 'whole chive'),
 ('creole seasoning', 'lemon'),
 ('creole seasoning', 'chive'),
 ('creole seasoning', 'lemon'),
 ('creole seasoning', 'compound butter'),
 ('creole seasoning', 'fresh skinles'),
 ('creole seasoning', 'creole seasoning'),
 ('creole seasoning', 'whole chive'),
 ('lemon', 'chive'),
 ('lemon', 'lemon'),
 ('lemon', 'compound butter'),
 ('lemon', 'fresh skinles'),
 ('lemon', 'creole seasoning'),
 ('lemon', 'whole chive'),
 ('chive', 'lemon'),
 ('chive', 'compound butter'),
 ('chive', 'fresh skinles'),
 ('chive', 'creole seasoning'),
 ('chive', 'whole chive'),
 ('lemon', 'compound butter'),
 ('lemon', 'fresh skinles'),
 ('lemon', 'creole seasoning'),
 ('lemon', 'whole chive'),
 ('compound butter', 'fresh