In [248]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
from nltk import PorterStemmer
import json
import re
from bs4 import BeautifulSoup
import requests
import inflect
import itertools
from tqdm import tqdm
p = inflect.engine()
%matplotlib inline
pd.options.display.max_rows = 999

stop_words = set(stopwords.words('english'))

In [2]:
# Test dataframe with only yummly data

# cuisines = []
# flavors = []
# ingredients = []
# name = []
# source = []
# for filename in os.listdir('data/metadata27638'):
#     if filename.endswith('.json'):
#         data = json.load(open('data/metadata27638/'+filename))
#         cuisines.append(data['attributes']['cuisine'][0])
#         flavors.append(data['flavors'])
#         ingredientStr = ''
#         for ingred in data['ingredientLines']:
#             ingredientStr = ingredientStr + ingred
#         ingredients.append(ingredientStr)
#         name.append(data['name'])
#         source.append(data['source']['sourceDisplayName'])
        
# df = pd.DataFrame({'cuisine':cuisines,'name':name,'flavors':flavors,'ingredients':ingredients,'source':source})

In [3]:
# Reading in data and checking if name and ingredient lists exist

names = []
ingreds = []

for filename in os.listdir('data/recipe_box'):
    if filename.endswith('.json'):
        data = list(json.load(open('data/recipe_box/'+filename)).values())
        for recipe in data:
            if ('title' not in recipe.keys() or 'ingredients' not in recipe.keys()):
                continue
            names.append(recipe['title'])
            ingreds.append(recipe['ingredients'])

In [130]:
# Creating data frame with dish names and ingredients

df = pd.DataFrame({'name':names,'ingredients':ingreds})

In [131]:
df.shape

(124647, 2)

In [138]:
# Remove empty ingredient lists

df = df[df.astype('str')['ingredients']!='[]']
df.shape

(122971, 2)

In [6]:
# Example of ingredient list

df['ingredients'][10]

['3 to 4 cups chicken stock, preferably homemade, recipe follows',
 '1 quart Roasted Winter Vegetables, recipe follows',
 'Kosher salt and freshly ground black pepper',
 '3 (5-pound) roasting chickens',
 '3 large yellow onions, unpeeled, quartered',
 '6 carrots, unpeeled, halved',
 '4 celery stalks with leaves, cut in thirds',
 '4 parsnips, unpeeled, cut in 1/2, optional',
 '20 sprigs fresh parsley',
 '15 sprigs fresh thyme',
 '20 sprigs fresh dill',
 '1 head garlic, unpeeled, cut in 1/2 crosswise',
 '2 tablespoons kosher salt',
 '2 teaspoons whole black peppercorns',
 '1 pound carrots, peeled',
 '1 pound parsnips, peeled',
 '1 large sweet potato, peeled',
 '1 small butternut squash (about 2 pounds), peeled and seeded',
 '3 tablespoons good olive oil',
 '1 1/2 teaspoons kosher salt',
 '1/2 teaspoon freshly ground black pepper',
 '2 tablespoons chopped fresh flat-leaf parsley']

In [139]:
# Example of ingredient list with ADVERTISEMENT tags

df['ingredients'][100000]

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [176]:
testlist = df['ingredients'][100000]
testlist

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [177]:
# Scraping corpus of words related to measurements

page = requests.get('https://www.enchantedlearning.com/wordlist/measurement.shtml')
soup = BeautifulSoup(page.content, "html.parser")
measure_corpus = [tag.text for tag in soup.find_all('div',attrs={'class':'wordlist-item'})]
measure_corpus = measure_corpus + [text+'s' for text in measure_corpus] + ['taste','strip', 'strips', 'package' + 'packages' + 'satchet' + 'satchets' + 'sprigs','head','bunch','small','large','big','medium']

In [188]:
def clean_recipe(ingred_list):
    cleanedtext = []
    for ingred in ingred_list:
        # Ignore extra information
        matchtext = re.sub(r' \([^)]*\)', '', ingred)
        
        # Obtain all before first comma
        if re.compile('^(.+?)(?=,)').search(matchtext) is not None:
            matchtext = re.compile('^(.+?)(?=,)').search(matchtext).group(1)
            matchtext = matchtext.replace(' ADVERTISEMENT','')
            
        # Ignore advertisement tag
        elif (ingred == 'ADVERTISEMENT'):
            continue
        
        # If all text, make sure no advertisement tag
        else:
            matchtext = matchtext.replace(' ADVERTISEMENT','')
        
        # Tokenize ingredient list
        tokenized = word_tokenize(matchtext)
        
        # Remove words likely to be stop words or measurements
        removed_stop = [w for w in tokenized if not w in measure_corpus]
        removed_stop = [w for w in removed_stop if not w in stop_words]
        
        # Filter adjectives and nouns
        ingred_words = lambda pos: pos[:2] in ['JJ','NN','NNS']
        ingreds = [word.lower() for (word, pos) in pos_tag(removed_stop) if ingred_words(pos)]
        
        # Convert to singular
        ingreds = [p.singular_noun(word) if p.singular_noun(word) else word for word in ingreds]
        
        # Remove common ingredients 
        common = ['water','salt','pepper']
        cleanedtext.append(ingreds)
        cleanedtext = [[ing for ing in ingreds if not any(word in common for word in ingreds)] for ingreds in cleanedtext]
        
        # Remove additional descriptors for long ingredient names
        cleanedtext = [ingreds[-2:] if len(ingreds) > 2 else ingreds for ingreds in cleanedtext]
   
    return [(' ').join(item) for item in cleanedtext if len(item)>0]

In [192]:
clean_recipe(df['ingredients'][3420])

['cooking spray',
 'bread crumb',
 'olive oil',
 'turkey chicken',
 'pasta',
 'parmesan',
 'white cheddar',
 'cherry tomato',
 'broccoli',
 'asparagu']

In [180]:
#regex test
# re.compile('(?<=\d )(?!.*\d)(.+?)(?=[^a-zA-Z ])').search('1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT').group(1)

In [236]:
df['ingredients'][2]

['1 1/2 cups dried black beans, picked over and rinsed',
 '8 cups water, plus 1/4 cup',
 '2 cloves garlic',
 '3 tablespoons vegetable oil',
 '1 large green bell pepper, finely chopped, reserving about 1 teaspoon for garnish',
 '1 small red onion, finely chopped, reserving about 1 teaspoon for garnish',
 '2 teaspoons ground cumin',
 '2 tablespoons cider vinegar',
 'Salt and pepper',
 '1 plum tomato, finely chopped',
 '1 small red chile, finely chopped',
 'Fresh cilantro leaves, for garnish',
 'Dollop sour cream, for garnish',
 'Serving Suggestion: Tortilla chips']

In [195]:
# Clean recipes across dataframe

df['ingreds_list'] = df['ingredients'].apply(clean_recipe)

In [198]:
df['ingreds_list'].head(20)

0     [celery, green onion, parsley, crabmeat, crack...
1     [skirt steak, applewood bacon, red onion, jala...
2     [black bean, clove garlic, vegetable oil, red ...
3     [ground chuck, tomato sauce, egg noodle, sour ...
4     [rice, 1/2-cup quinoa, white tuna, seaweed kel...
5     [olive oil, italian eggplant, hummu hummu, gra...
6     [saffron thread, sugar, orange zest, carrot, u...
7     [hot-dog bun, unsalted butter, lobster lobster...
8     [olive oil, onion, pine nut, white rice, tomat...
9     [cauliflower, clove garlic, unsalted butter, b...
10    [chicken stock, winter vegetable, roasting chi...
11    [chili sauce, rice vinegar, fish sauce, fresh ...
12                        [cheese product, green chily]
13    [london broil, olive oil, worcestershire sauce...
14    [duck, garlic clove, cinnamon stick, fat lard,...
15    [purpose flour, cornmeal, baking powder, sugar...
16    [boneles, egg, bread crumb, deli ham, wine sau...
17    [tortilla chip, neely bbq, pork, bbq sauce

In [202]:
list_ingreds = df['ingreds_list']

In [220]:
combinations = []

for lists in tqdm(list_ingreds):
    combinations = combinations + list(itertools.combinations(lists,2))
    
combinations_df = pd.DataFrame(combinations)

100%|██████████| 122971/122971 [11:20:42<00:00,  3.01it/s]     


In [387]:
# List of top 100 ingredients on both sides of pairings

combinations_df.columns = ['ingred1','ingred2']
top_ingred1 = combinations_df['ingred1'].value_counts()[:100]
top_ingred2 = combinations_df['ingred2'].value_counts()[:100]

In [388]:
# Create new list of ranked ingredients by combining both sides of pairing list

top_ingreds = pd.DataFrame(top_ingred1).join(pd.DataFrame(top_ingred2),how='outer').fillna(0)
top_ingreds['total'] = top_ingreds['ingred1'] + top_ingreds['ingred2']
top_ingreds = top_ingreds.sort_values(by="total", ascending=False)['total']
top_ingreds = top_ingreds.reset_index() 
top_ingreds.columns = ['ingredient','total']

In [389]:
top_ingreds_list = list(top_ingreds['ingredient'])

In [421]:
# Filter for combinations with top ingredients

filtered_combinations = []

for x in tqdm(combinations_df.itertuples()):
    if x[1] in top_ingreds_list and x[2] in top_ingreds_list:
        filtered_combinations.append(x[1:3])

filtered_combinations_df = pd.DataFrame(filtered_combinations, columns=['ingred1','ingred2'])

5839145it [00:13, 420861.00it/s]


In [423]:
filtered_combinations_df.shape

(1803444, 2)

In [444]:
# Get count of recipes each top ingredient appears in
# Later determines radius/color of network

top_ingreds_recipes = dict.fromkeys(top_ingreds_list, 0)

for ingred in tqdm(top_ingreds_list):
    for lst in df['ingreds_list']:
        if ingred in lst:
            top_ingreds_recipes[ingred] += 1

top_ingreds_recipes

100%|██████████| 118/118 [00:04<00:00, 24.17it/s]


{'all-purpose flour': 19562,
 'almond': 2182,
 'baking powder': 7949,
 'baking soda': 6250,
 'balsamic vinegar': 2800,
 'basil leaf': 2945,
 'bay leaf': 4656,
 'black peppercorn': 1633,
 'bread crumb': 2988,
 'brown sugar': 11073,
 'butter': 19296,
 'buttermilk': 2460,
 'cake flour': 1000,
 'canola oil': 3871,
 'carrot': 6529,
 'cheddar cheese': 3096,
 'chicken broth': 5640,
 'chicken stock': 3494,
 'chili powder': 2840,
 'chocolate chip': 2696,
 'chopped onion': 2469,
 'cider vinegar': 2731,
 'cilantro': 1410,
 'cilantro leaf': 2367,
 'clove garlic': 17952,
 'cocoa powder': 2536,
 'confectioner sugar': 4060,
 'cooking spray': 2424,
 'cornstarch': 3439,
 'cream': 1796,
 'cream cheese': 3663,
 'dijon mustard': 4194,
 'dried oregano': 2930,
 'dry yeast': 1488,
 'egg': 24435,
 'egg white': 3356,
 'egg yolk': 4649,
 'flat-leaf parsley': 2354,
 'flour': 2729,
 'fresh basil': 1981,
 'fresh chive': 1929,
 'fresh cilantro': 3376,
 'fresh ginger': 2706,
 'fresh parsley': 4060,
 'fresh rosemary'

In [457]:
top_ingreds_recipes_df = pd.DataFrame(list(zip(top_ingreds_recipes.keys(),top_ingreds_recipes.values())), columns = ['ingredient','recipes'])

top_ingreds = pd.merge(top_ingreds, top_ingreds_recipes_df)

In [462]:
top_ingreds['total'] = top_ingreds['total'].astype('int')

In [464]:
top_ingreds.head()

Unnamed: 0,ingredient,total,recipes
0,olive oil,377461,32429
1,sugar,288310,21456
2,egg,257308,24435
3,all-purpose flour,218468,19562
4,clove garlic,216540,17952


### Categories to rename or combine

sugar + white sugar
clove garlic + garlic clove + garlic
butter + unsalted butter + stick butter
chicken broth + chicken stock
parmesan cheese + parmesan
onion + yellow onion + chopped onion
stalk celery -> celery
remove 'ground'
all-purpose flour + purpose flour
fresh ginger + ginger
slice bacon -> bacon
bay leaf
cilantro leaf + fresh cilantro
parsley leaf + flat-leaf parsley + fresh parsley
basil leaf + fresh basil
mint leaf
thyme leaf + fresh thyme

In [468]:
# Function to combine rows with similar names into single row with aggregated column

def combine_row(df, column1, aggcolumn1, aggcolumn2, commonvals, newname):
    tot1 = 0
    tot2 = 0
    df = df.append(pd.DataFrame([[newname,0,0]], columns=df.columns),ignore_index=True)
    for val in commonvals:
        tot1 = tot1 + df[aggcolumn1][df[column1] == val].values
        tot2 = tot2 + df[aggcolumn2][df[column1] == val].values
        df = df[df[column1] != val]
    df.loc[df[column1] == newname, aggcolumn1] = tot1
    df.loc[df[column1] == newname, aggcolumn2] = tot2
    return df

In [469]:
# Combine similar ingredients and remove unecesssary text

top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['clove garlic','garlic clove','garlic'], 'garlic_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['sugar', 'white sugar'], 'sugar_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['butter','unsalted butter','stick butter'], 'butter_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['chicken broth', 'chicken stock'], 'chicken stock_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['onion', 'yellow onion', 'chopped onion'], 'onion_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['parmesan cheese', 'parmesan'], 'parmesan_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['all-purpose flour','purpose flour'], 'all-purpose flour_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['fresh ginger','ginger'], 'ginger_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['cilantro leaf','fresh cilantro'], 'cilantro')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['basil leaf','fresh basil'], 'basil')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['thyme leaf','fresh thyme'], 'thyme')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['parsley leaf', 'flat-leaf parsley', 'fresh parsley'], 'parsley')

top_ingreds['ingredient'] = top_ingreds['ingredient'].str.replace('stalk ', '')
top_ingreds['ingredient'] = top_ingreds['ingredient'].str.replace('ground ', '')
top_ingreds['ingredient'] = top_ingreds['ingredient'].str.replace('slice ', '')
top_ingreds['ingredient'] = top_ingreds['ingredient'].str.replace('_', '')

top_ingreds = top_ingreds.sort_values(by="total", ascending=False).reset_index().drop('index',axis=1)
top_ingreds

Unnamed: 0,ingredient,total,recipes
0,butter,421929,36975
1,garlic,384999,32539
2,sugar,383695,31492
3,olive oil,377461,32429
4,egg,257308,24435
5,all-purpose flour,232159,21384
6,onion,216398,20171
7,vanilla extract,142113,11957
8,vegetable oil,126008,11680
9,brown sugar,124889,11073


In [470]:
combinations_df.to_csv('all_combinations.csv')
top_ingreds.to_csv('cleaned_top_ingreds.csv')
filtered_combinations_df.to_csv('filtered_combinations.csv')

In [473]:
/

# Count number of overlapping edges
filtered_combinations_df.head(50)

Unnamed: 0,ingred1,ingred2
0,green onion,parsley
1,green onion,hot sauce
2,green onion,heavy cream
3,green onion,butter
4,parsley,hot sauce
5,parsley,heavy cream
6,parsley,butter
7,hot sauce,heavy cream
8,hot sauce,butter
9,heavy cream,butter
