In [248]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
from nltk import PorterStemmer
import json
import re
from bs4 import BeautifulSoup
import requests
import inflect
import itertools
from tqdm import tqdm
p = inflect.engine()
%matplotlib inline
pd.options.display.max_rows = 999

stop_words = set(stopwords.words('english'))

In [2]:
# Test dataframe with only yummly data

# cuisines = []
# flavors = []
# ingredients = []
# name = []
# source = []
# for filename in os.listdir('data/metadata27638'):
#     if filename.endswith('.json'):
#         data = json.load(open('data/metadata27638/'+filename))
#         cuisines.append(data['attributes']['cuisine'][0])
#         flavors.append(data['flavors'])
#         ingredientStr = ''
#         for ingred in data['ingredientLines']:
#             ingredientStr = ingredientStr + ingred
#         ingredients.append(ingredientStr)
#         name.append(data['name'])
#         source.append(data['source']['sourceDisplayName'])
        
# df = pd.DataFrame({'cuisine':cuisines,'name':name,'flavors':flavors,'ingredients':ingredients,'source':source})

In [3]:
# Reading in data and checking if name and ingredient lists exist

names = []
ingreds = []

for filename in os.listdir('data/recipe_box'):
    if filename.endswith('.json'):
        data = list(json.load(open('data/recipe_box/'+filename)).values())
        for recipe in data:
            if ('title' not in recipe.keys() or 'ingredients' not in recipe.keys()):
                continue
            names.append(recipe['title'])
            ingreds.append(recipe['ingredients'])

In [130]:
# Creating data frame with dish names and ingredients

df = pd.DataFrame({'name':names,'ingredients':ingreds})

In [131]:
df.shape

(124647, 2)

In [138]:
# Remove empty ingredient lists

df = df[df.astype('str')['ingredients']!='[]']
df.shape

(122971, 2)

In [720]:
# Example of ingredient list

df['ingredients'][100435]

['2 tablespoons extra-virgin olive oil ADVERTISEMENT',
 '2 yellow onions, diced ADVERTISEMENT',
 '5 cloves garlic, minced ADVERTISEMENT',
 '1 small sugar pumpkin, peeled, seeded, cut into 1/2-inch dice ADVERTISEMENT',
 '4 cups beef broth ADVERTISEMENT',
 '1 (16 ounce) can diced tomatoes, undrained ADVERTISEMENT',
 '1/2 teaspoon salt ADVERTISEMENT',
 '1 teaspoon ground black pepper ADVERTISEMENT',
 '1 bunch lacinato kale, stems removed, chopped ADVERTISEMENT',
 '1 pound cubed cooked ham ADVERTISEMENT',
 '1 (15 ounce) can black beans, rinsed and drained ADVERTISEMENT',
 '2 tablespoons sherry vinegar ADVERTISEMENT',
 '1 tablespoon thinly sliced sorrel ADVERTISEMENT',
 'ADVERTISEMENT']

In [719]:
clean_recipe(df['ingredients'][100435])

['olive oil',
 'yellow onion',
 'clove garlic',
 'sugar pumpkin',
 'beef broth',
 'tomato',
 'lacinato kale',
 'cubed ham',
 'black bean',
 'sherry vinegar',
 'sorrel']

In [139]:
# Example of ingredient list with ADVERTISEMENT tags

df['ingredients'][100000]

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [176]:
testlist = df['ingredients'][100000]
testlist

['1/2 small head green cabbage, cored and thinly sliced ADVERTISEMENT',
 '1/2 jicama, sliced into matchsticks ADVERTISEMENT',
 '1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT',
 '1/2 cup mayonnaise ADVERTISEMENT',
 '1/4 cup pineapple juice ADVERTISEMENT',
 '1 teaspoon white sugar ADVERTISEMENT',
 'hot sauce to taste ADVERTISEMENT',
 'salt and freshly ground black pepper to taste ADVERTISEMENT',
 '1/4 bunch chopped fresh cilantro ADVERTISEMENT',
 '1/3 ounce toasted corn bits (such as CornNuts ®), crushed ADVERTISEMENT',
 'ADVERTISEMENT']

In [177]:
# Scraping corpus of words related to measurements

page = requests.get('https://www.enchantedlearning.com/wordlist/measurement.shtml')
soup = BeautifulSoup(page.content, "html.parser")
measure_corpus = [tag.text for tag in soup.find_all('div',attrs={'class':'wordlist-item'})]
measure_corpus = measure_corpus + [text+'s' for text in measure_corpus] + ['taste','strip', 'strips', 'package' + 'packages' + 'satchet' + 'satchets' + 'sprigs','head','bunch','small','large','big','medium']

In [188]:
def clean_recipe(ingred_list):
    cleanedtext = []
    for ingred in ingred_list:
        # Ignore extra information
        matchtext = re.sub(r' \([^)]*\)', '', ingred)
        
        # Obtain all before first comma
        if re.compile('^(.+?)(?=,)').search(matchtext) is not None:
            matchtext = re.compile('^(.+?)(?=,)').search(matchtext).group(1)
            matchtext = matchtext.replace(' ADVERTISEMENT','')
            
        # Ignore advertisement tag
        elif (ingred == 'ADVERTISEMENT'):
            continue
        
        # If all text, make sure no advertisement tag
        else:
            matchtext = matchtext.replace(' ADVERTISEMENT','')
        
        # Tokenize ingredient list
        tokenized = word_tokenize(matchtext)
        
        # Remove words likely to be stop words or measurements
        removed_stop = [w for w in tokenized if not w in measure_corpus]
        removed_stop = [w for w in removed_stop if not w in stop_words]
        
        # Filter adjectives and nouns
        ingred_words = lambda pos: pos[:2] in ['JJ','NN','NNS']
        ingreds = [word.lower() for (word, pos) in pos_tag(removed_stop) if ingred_words(pos)]
        
        # Convert to singular
        ingreds = [p.singular_noun(word) if p.singular_noun(word) else word for word in ingreds]
        
        # Remove common ingredients 
        common = ['water','salt','pepper']
        cleanedtext.append(ingreds)
        cleanedtext = [[ing for ing in ingreds if not any(word in common for word in ingreds)] for ingreds in cleanedtext]
        
        # Remove additional descriptors for long ingredient names
        cleanedtext = [ingreds[-2:] if len(ingreds) > 2 else ingreds for ingreds in cleanedtext]
   
    return [(' ').join(item) for item in cleanedtext if len(item)>0]

In [705]:
clean_recipe(df['ingredients'][10])

['chicken stock',
 'winter vegetable',
 'roasting chicken',
 'yellow onion',
 'carrot',
 'stalk leaf',
 'parsnip',
 'fresh parsley',
 'fresh thyme',
 'fresh dill',
 'garlic',
 'black peppercorn',
 'carrot',
 'parsnip',
 'sweet potato',
 'butternut squash',
 'olive oil',
 'flat-leaf parsley']

In [180]:
#regex test
# re.compile('(?<=\d )(?!.*\d)(.+?)(?=[^a-zA-Z ])').search('1 large sweet apple (such as Fuji), sliced into matchsticks ADVERTISEMENT').group(1)

In [236]:
df['ingredients'][2]

['1 1/2 cups dried black beans, picked over and rinsed',
 '8 cups water, plus 1/4 cup',
 '2 cloves garlic',
 '3 tablespoons vegetable oil',
 '1 large green bell pepper, finely chopped, reserving about 1 teaspoon for garnish',
 '1 small red onion, finely chopped, reserving about 1 teaspoon for garnish',
 '2 teaspoons ground cumin',
 '2 tablespoons cider vinegar',
 'Salt and pepper',
 '1 plum tomato, finely chopped',
 '1 small red chile, finely chopped',
 'Fresh cilantro leaves, for garnish',
 'Dollop sour cream, for garnish',
 'Serving Suggestion: Tortilla chips']

In [195]:
# Clean recipes across dataframe

df['ingreds_list'] = df['ingredients'].apply(clean_recipe)

In [198]:
df['ingreds_list'].head(20)

0     [celery, green onion, parsley, crabmeat, crack...
1     [skirt steak, applewood bacon, red onion, jala...
2     [black bean, clove garlic, vegetable oil, red ...
3     [ground chuck, tomato sauce, egg noodle, sour ...
4     [rice, 1/2-cup quinoa, white tuna, seaweed kel...
5     [olive oil, italian eggplant, hummu hummu, gra...
6     [saffron thread, sugar, orange zest, carrot, u...
7     [hot-dog bun, unsalted butter, lobster lobster...
8     [olive oil, onion, pine nut, white rice, tomat...
9     [cauliflower, clove garlic, unsalted butter, b...
10    [chicken stock, winter vegetable, roasting chi...
11    [chili sauce, rice vinegar, fish sauce, fresh ...
12                        [cheese product, green chily]
13    [london broil, olive oil, worcestershire sauce...
14    [duck, garlic clove, cinnamon stick, fat lard,...
15    [purpose flour, cornmeal, baking powder, sugar...
16    [boneles, egg, bread crumb, deli ham, wine sau...
17    [tortilla chip, neely bbq, pork, bbq sauce

In [202]:
list_ingreds = df['ingreds_list']

In [220]:
combinations = []

for lists in tqdm(list_ingreds):
    combinations = combinations + list(itertools.combinations(lists,2))
    
combinations_df = pd.DataFrame(combinations)

100%|██████████| 122971/122971 [11:20:42<00:00,  3.01it/s]     


In [678]:
# List of top 200 ingredients on both sides of pairings

combinations_df.columns = ['ingred1','ingred2']
top_ingred1 = combinations_df['ingred1'].value_counts()[:200]
top_ingred2 = combinations_df['ingred2'].value_counts()[:200]

In [679]:
# Create new list of ranked ingredients by combining both sides of pairing list

top_ingreds = pd.DataFrame(top_ingred1).join(pd.DataFrame(top_ingred2),how='outer').fillna(0)
top_ingreds['total'] = top_ingreds['ingred1'] + top_ingreds['ingred2']
top_ingreds = top_ingreds.sort_values(by="total", ascending=False)['total']
top_ingreds = top_ingreds.reset_index() 
top_ingreds.columns = ['ingredient','total']

In [722]:
len(combinations_df)

5839145

In [680]:
top_ingreds_list = list(top_ingreds['ingredient'])

In [681]:
top_ingreds_list

['olive oil',
 'sugar',
 'egg',
 'all-purpose flour',
 'clove garlic',
 'butter',
 'unsalted butter',
 'onion',
 'vanilla extract',
 'vegetable oil',
 'brown sugar',
 'lemon juice',
 'milk',
 'white sugar',
 'garlic clove',
 'baking powder',
 'ground cinnamon',
 'carrot',
 'heavy cream',
 'garlic',
 'red onion',
 'baking soda',
 'bay leaf',
 'shallot',
 'wine vinegar',
 'honey',
 'chicken broth',
 'soy sauce',
 'tomato',
 'canola oil',
 'sour cream',
 'white wine',
 'ground cumin',
 'egg yolk',
 'lemon',
 'lime juice',
 'chicken stock',
 'dijon mustard',
 'confectioner sugar',
 'green onion',
 'scallion',
 'fresh parsley',
 'mayonnaise',
 'fresh thyme',
 'red bell',
 'paprika',
 'fresh cilantro',
 'cornstarch',
 'worcestershire sauce',
 'whole milk',
 'garlic powder',
 'egg white',
 'chili powder',
 'cream cheese',
 'parmesan cheese',
 'cilantro leaf',
 'dried oregano',
 'parsley leaf',
 'fresh ginger',
 'basil leaf',
 'flour',
 'parmesan',
 'orange juice',
 'ground nutmeg',
 'stick bu

In [682]:
# Filter for combinations with top ingredients

filtered_combinations = []

for x in tqdm(combinations_df.itertuples()):
    if x[1] in top_ingreds_list and x[2] in top_ingreds_list:
        filtered_combinations.append(x[1:3])

filtered_combinations_df = pd.DataFrame(filtered_combinations, columns=['ingred1','ingred2'])


0it [00:00, ?it/s][A
2955it [00:00, 7950.52it/s][A
8243it [00:00, 9707.03it/s][A
24762it [00:00, 13526.53it/s][A
45703it [00:00, 18803.08it/s][A
68706it [00:00, 25952.34it/s][A
88241it [00:01, 35077.58it/s][A
112407it [00:01, 47176.06it/s][A
137563it [00:01, 62380.45it/s][A
161651it [00:01, 80212.00it/s][A
183081it [00:01, 97418.09it/s][A
203934it [00:01, 114707.75it/s][A
228734it [00:01, 136758.15it/s][A
254225it [00:01, 158845.54it/s][A
279778it [00:01, 179183.42it/s][A
304101it [00:01, 194551.68it/s][A
329266it [00:02, 208761.26it/s][A
353530it [00:02, 216313.99it/s][A
377602it [00:02, 203106.00it/s][A
399786it [00:02, 203052.23it/s][A
423438it [00:02, 212053.48it/s][A
447799it [00:02, 220626.08it/s][A
470667it [00:02, 217245.33it/s][A
492967it [00:02, 204535.92it/s][A
513976it [00:02, 205300.59it/s][A
536128it [00:03, 209910.77it/s][A
557424it [00:03, 210814.60it/s][A
579287it [00:03, 213098.64it/s][A
600750it [00:03, 212408.91it/s][A
627399it [00:03, 

In [683]:
filtered_combinations_df.head()

Unnamed: 0,ingred1,ingred2
0,celery,green onion
1,celery,parsley
2,celery,dry mustard
3,celery,hot sauce
4,celery,heavy cream


In [684]:
# Get count of recipes each top ingredient appears in
# Later determines radius/color of network

top_ingreds_recipes = dict.fromkeys(top_ingreds_list, 0)

for ingred in tqdm(top_ingreds_list):
    for lst in df['ingreds_list']:
        if ingred in lst:
            top_ingreds_recipes[ingred] += 1

top_ingreds_recipes


  0%|          | 0/229 [00:00<?, ?it/s][A
  1%|          | 2/229 [00:00<00:16, 13.84it/s][A
  2%|▏         | 4/229 [00:00<00:16, 14.01it/s][A
  3%|▎         | 6/229 [00:00<00:15, 14.26it/s][A
  4%|▍         | 9/229 [00:00<00:13, 15.95it/s][A
  5%|▍         | 11/229 [00:00<00:13, 16.76it/s][A
  6%|▌         | 14/229 [00:00<00:11, 18.07it/s][A
  7%|▋         | 17/229 [00:00<00:11, 19.16it/s][A
  9%|▊         | 20/229 [00:01<00:10, 20.09it/s][A
 10%|▉         | 22/229 [00:01<00:10, 19.79it/s][A
 11%|█         | 25/229 [00:01<00:09, 20.43it/s][A
 12%|█▏        | 28/229 [00:01<00:09, 20.56it/s][A
 14%|█▎        | 31/229 [00:01<00:09, 20.96it/s][A
 15%|█▍        | 34/229 [00:01<00:09, 21.45it/s][A
 16%|█▌        | 37/229 [00:01<00:08, 21.65it/s][A
 17%|█▋        | 40/229 [00:01<00:08, 21.87it/s][A
 19%|█▉        | 43/229 [00:02<00:08, 21.24it/s][A
 20%|██        | 46/229 [00:02<00:08, 21.41it/s][A
 21%|██▏       | 49/229 [00:02<00:08, 21.53it/s][A
 23%|██▎       | 52/229 

{'all-purpose flour': 19562,
 'almond': 2182,
 'almond extract': 1064,
 'apple': 1182,
 'avocado': 1371,
 'bacon': 1152,
 'baking powder': 7949,
 'baking soda': 6250,
 'balsamic vinegar': 2800,
 'banana': 1231,
 'basil leaf': 2945,
 'bay leaf': 4656,
 'beef broth': 980,
 'bittersweet chocolate': 1192,
 'black bean': 1280,
 'black olive': 1121,
 'black peppercorn': 1633,
 'blue cheese': 770,
 'boneles': 904,
 'bread crumb': 2988,
 'brown sugar': 11073,
 'butter': 19296,
 'buttermilk': 2460,
 'butternut squash': 699,
 'cake flour': 1000,
 'cake mix': 825,
 'canola oil': 3871,
 'caper': 1440,
 'carrot': 6529,
 'cayenne': 828,
 'celery': 1277,
 'celery stalk': 885,
 'cheddar cheese': 3096,
 'cherry tomato': 1423,
 'chicken': 809,
 'chicken breast': 1212,
 'chicken broth': 5640,
 'chicken stock': 3494,
 'chile powder': 672,
 'chili powder': 2840,
 'chive': 1079,
 'chocolate chip': 2696,
 'chopped onion': 2469,
 'cider vinegar': 2731,
 'cilantro': 1410,
 'cilantro leaf': 2367,
 'cinnamon': 1

In [685]:
top_ingreds_recipes_df = pd.DataFrame(list(zip(top_ingreds_recipes.keys(),top_ingreds_recipes.values())), columns = ['ingredient','recipes'])

top_ingreds = pd.merge(top_ingreds, top_ingreds_recipes_df)

In [686]:
top_ingreds['total'] = top_ingreds['total'].astype('int')

In [687]:
top_ingreds

Unnamed: 0,ingredient,total,recipes
0,olive oil,377461,32429
1,sugar,288310,21456
2,egg,257308,24435
3,all-purpose flour,218468,19562
4,clove garlic,216540,17952
5,butter,205882,19296
6,unsalted butter,184737,14839
7,onion,169772,15351
8,vanilla extract,142113,11957
9,vegetable oil,126008,11680


### Categories to rename or combine

lime + lime wedge
lemon + lemon wedge
vegetable oil + oil
potato + russet potato
lemon zest + lemon peel
lemon juice + juice lemon
skinles + boneles + chicken breast
sugar + white sugar
clove garlic + garlic clove + garlic
butter + unsalted butter + stick butter
chicken broth + chicken stock
parmesan cheese + parmesan
onion + yellow onion + chopped onion + diced onion
stalk celery + celery + celery stalk + rib celery
all-purpose flour + purpose flour
fresh ginger + ginger
slice bacon + bacon
cilantro leaf + fresh cilantro + cilantro
parsley leaf + flat-leaf parsley + fresh parsley
basil leaf + fresh basil
mint leaf + fresh mint
thyme leaf + fresh thyme

In [688]:
# Function to combine rows with similar names into single row with aggregated column

def combine_row(df, column1, aggcolumn1, aggcolumn2, commonvals, newname):
    tot1 = 0
    tot2 = 0
    df = df.append(pd.DataFrame([[newname,0,0]], columns=df.columns),ignore_index=True)
    for val in commonvals:
        tot1 = tot1 + df[aggcolumn1][df[column1] == val].values
        tot2 = tot2 + df[aggcolumn2][df[column1] == val].values
        df = df[df[column1] != val]
    df.loc[df[column1] == newname, aggcolumn1] = tot1
    df.loc[df[column1] == newname, aggcolumn2] = tot2
    return df

In [689]:
# Combine similar ingredients and remove unecesssary text

top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['lime','lime wedge'], 'lime_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['lemon','lemon wedge'], 'lemon_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['vegetable oil','oil'], 'vegetable oil_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['potato','russet potato'], 'potato_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['lemon zest','lemon peel'], 'lemon zest_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['lemon juice','juice lemon'], 'lemon juice_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['skinles','boneles','chicken breast'], 'chicken breast_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['slice bacon','bacon'], 'bacon_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['stalk celery', 'celery', 'celery stalk', 'rib celery'], 'celery_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['clove garlic','garlic clove','garlic'], 'garlic_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['sugar', 'white sugar'], 'sugar_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['butter','unsalted butter','stick butter'], 'butter_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['chicken broth', 'chicken stock'], 'chicken stock_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['onion', 'yellow onion', 'chopped onion', 'diced onion'], 'onion_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['parmesan cheese', 'parmesan'], 'parmesan_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['all-purpose flour','purpose flour'], 'all-purpose flour_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['fresh ginger','ginger'], 'ginger_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['cilantro leaf','fresh cilantro','cilantro'], 'cilantro_')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['basil leaf','fresh basil'], 'basil')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['thyme leaf','fresh thyme'], 'thyme')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['parsley leaf', 'flat-leaf parsley', 'fresh parsley'], 'parsley')
top_ingreds = combine_row(top_ingreds, 'ingredient', 'total', 'recipes', ['mint leaf','fresh mint'], 'mint_')

top_ingreds['ingredient'] = top_ingreds['ingredient'].str.replace('_', '')

top_ingreds = top_ingreds.sort_values(by="total", ascending=False).reset_index().drop('index',axis=1)
top_ingreds

Unnamed: 0,ingredient,total,recipes
0,butter,421929,36975
1,garlic,384999,32539
2,sugar,383695,31492
3,olive oil,377461,32429
4,egg,257308,24435
5,all-purpose flour,239413,21384
6,onion,229482,20845
7,vanilla extract,142113,11957
8,vegetable oil,138582,12645
9,lemon juice,127903,12743


In [690]:
top_ingreds = top_ingreds.drop_duplicates()
top_ingreds

Unnamed: 0,ingredient,total,recipes
0,butter,421929,36975
1,garlic,384999,32539
2,sugar,383695,31492
3,olive oil,377461,32429
4,egg,257308,24435
5,all-purpose flour,239413,21384
6,onion,229482,20845
7,vanilla extract,142113,11957
8,vegetable oil,138582,12645
9,lemon juice,127903,12743


In [691]:
combinations_df.to_csv('all_combinations.csv')
top_ingreds.to_csv('cleaned_top_ingreds.csv')
filtered_combinations_df.to_csv('filtered_combinations.csv')

In [692]:
# Count number of overlapping edges
filtered_combinations_df['ingred1'] = filtered_combinations_df['ingred1'].str.replace('clove garlic', 'garlic').replace('garlic clove','garlic').replace('white sugar','sugar').replace('unsalted butter','butter').replace('stick butter','butter').replace('chicken broth','chicken stock').replace('parmesan cheese','parmesan').replace('yellow onion','onion').replace('chopped onion','onion').replace('stalk celery','celery').replace('purpose flour','all-purpose flour').replace('fresh ginger','ginger').replace('slice bacon','bacon').replace('cilantro leaf','cilantro').replace('fresh cilantro','cilantro').replace('parsley leaf','parsley').replace('flat-leaf parsley','parsley').replace('fresh parsley','parsley').replace('basil leaf','basil').replace('fresh basil','basil').replace('thyme leaf','thyme').replace('fresh thyme','thyme').replace('boneles','chicken breast').replace('skinles','chicken breast').replace('celery stalk','celery').replace('diced onion','onion').replace('fresh mint','mint').replace('juice lemon','lemon juice').replace('lemon peel','lemon zest').replace('lemon wedge','lemon').replace('lime wedge','lime').replace('russet potato','potato').replace('mint leaf','mint').replace('oil','vegetable oil').replace('rib celery','celery')
filtered_combinations_df['ingred2'] = filtered_combinations_df['ingred2'].str.replace('clove garlic', 'garlic').replace('garlic clove','garlic').replace('white sugar','sugar').replace('unsalted butter','butter').replace('stick butter','butter').replace('chicken broth','chicken stock').replace('parmesan cheese','parmesan').replace('yellow onion','onion').replace('chopped onion','onion').replace('stalk celery','celery').replace('purpose flour','all-purpose flour').replace('fresh ginger','ginger').replace('slice bacon','bacon').replace('cilantro leaf','cilantro').replace('fresh cilantro','cilantro').replace('parsley leaf','parsley').replace('flat-leaf parsley','parsley').replace('fresh parsley','parsley').replace('basil leaf','basil').replace('fresh basil','basil').replace('thyme leaf','thyme').replace('fresh thyme','thyme').replace('boneles','chicken breast').replace('skinles','chicken breast').replace('celery stalk','celery').replace('diced onion','onion').replace('fresh mint','mint').replace('juice lemon','lemon juice').replace('lemon peel','lemon zest').replace('lemon wedge','lemon').replace('lime wedge','lime').replace('russet potato','potato').replace('mint leaf','mint').replace('oil','vegetable oil').replace('rib celery','celery')
filtered_combinations_df.head()

Unnamed: 0,ingred1,ingred2
0,celery,green onion
1,celery,parsley
2,celery,dry mustard
3,celery,hot sauce
4,celery,heavy cream


In [721]:
# Check if list of unique ingredients agree

set(top_ingreds['ingredient']) - set(filtered_combinations_df['ingred1'])

196

In [559]:
def swap_ingred(df, tup):
    index = tup[0]
    temp = df.loc[index,'ingred1']
    df.at[index, 'ingred1'] = df.loc[index, 'ingred2']
    df.at[index, 'ingred2'] = temp
    return df

In [570]:
# check 6 and 71

# for tup1 in tqdm(filtered_combinations_df.itertuples()):
#     for tup2 in filtered_combinations_df.loc[filtered_combinations_df.index > tup1[0]].itertuples():
#         if (tup1[1] == tup2[2]) & (tup1[2] == tup2[1]):
#             filtered_combinations_df = swap_ingred(filtered_combinations_df, tup2)

35it [00:54,  1.55s/it]

KeyboardInterrupt: 

In [556]:
test_tups = filtered_combinations_df[0:20]
test_tups = test_tups.append(pd.DataFrame([['butter','parsley']], columns = ['ingred1','ingred2']))
test_tups = test_tups.reset_index().drop('index', axis=1)
test_tups

Unnamed: 0,ingred1,ingred2
0,green onion,parsley
1,green onion,hot sauce
2,green onion,heavy cream
3,green onion,butter
4,parsley,hot sauce
5,parsley,heavy cream
6,parsley,butter
7,hot sauce,heavy cream
8,hot sauce,butter
9,heavy cream,butter


In [567]:
for tup1 in test_tups.itertuples():
    for tup2 in test_tups.loc[test_tups.index > tup1[0]].itertuples():
        if (tup1[1] == tup2[2]) & (tup1[2] == tup2[1]):
            test_tups = swap_ingred(test_tups, tup2)

In [568]:
test_tups

Unnamed: 0,ingred1,ingred2
0,green onion,parsley
1,green onion,hot sauce
2,green onion,heavy cream
3,green onion,butter
4,parsley,hot sauce
5,parsley,heavy cream
6,parsley,butter
7,hot sauce,heavy cream
8,hot sauce,butter
9,heavy cream,butter


In [550]:
test_tups.loc[6,'ingred1'] == test_tups.loc[20,'ingred2']

True

In [551]:
test_tups.loc[6,'ingred2'] == test_tups.loc[20,'ingred1']

True

In [696]:
filtered_combinations_df['pairs'] = list(zip(filtered_combinations_df['ingred1'], filtered_combinations_df['ingred2']))
filtered_combinations_df.head()

Unnamed: 0,ingred1,ingred2,pairs
0,celery,green onion,"(celery, green onion)"
1,celery,parsley,"(celery, parsley)"
2,celery,dry mustard,"(celery, dry mustard)"
3,celery,hot sauce,"(celery, hot sauce)"
4,celery,heavy cream,"(celery, heavy cream)"


In [697]:
# Group by pairing and get counts each pair occurs

filtered_pairs_df = filtered_combinations_df.groupby('pairs').agg({'ingred1':['count','first'],
                                                                  'ingred2':'first'}).reset_index()
filtered_pairs_df.columns = filtered_pairs_df.columns.droplevel(1)
filtered_pairs_df.columns = ['pairs','counts','ingred1','ingred2']
filtered_pairs_df.head()

Unnamed: 0,pairs,counts,ingred1,ingred2
0,"(all-purpose flour, all-purpose flour)",1471,all-purpose flour,all-purpose flour
1,"(all-purpose flour, almond)",480,all-purpose flour,almond
2,"(all-purpose flour, almond extract)",405,all-purpose flour,almond extract
3,"(all-purpose flour, apple)",203,all-purpose flour,apple
4,"(all-purpose flour, avocado)",41,all-purpose flour,avocado


In [698]:
# Drop same ingredients

for tup in tqdm(filtered_pairs_df.itertuples()):
    if tup[1][0] == tup[1][1]:
        filtered_pairs_df = filtered_pairs_df.drop(tup[0])


0it [00:00, ?it/s][A
1710it [00:00, 15787.01it/s][A
3034it [00:00, 14797.95it/s][A
4237it [00:00, 13802.86it/s][A
6044it [00:00, 14715.47it/s][A
7441it [00:00, 14289.24it/s][A
8751it [00:00, 13823.73it/s][A
10625it [00:00, 14946.62it/s][A
12717it [00:00, 16310.22it/s][A
15053it [00:00, 17627.57it/s][A
16855it [00:01, 17696.96it/s][A
18886it [00:01, 18164.54it/s][A
21030it [00:01, 18892.07it/s][A
22934it [00:01, 18432.28it/s][A
24790it [00:01, 16872.57it/s][A
26512it [00:01, 14942.54it/s][A
28071it [00:01, 14406.02it/s][A
30003it [00:01, 15442.99it/s][A
32061it [00:01, 16564.48it/s][A
33929it [00:02, 16619.90it/s][A

In [699]:
filtered_pairs_df.shape

(33735, 4)

In [700]:
# Clean up same pairs but different ordered

for tup1 in tqdm(filtered_pairs_df.itertuples()):
    for tup2 in filtered_pairs_df.loc[filtered_pairs_df.index > tup1[0]].itertuples():
        if (tup1[3] == tup2[4]) and (tup1[4] == tup2[3]):
            filtered_pairs_df.loc[tup1[0], 'counts'] += filtered_pairs_df.loc[tup2[0], 'counts']
            filtered_pairs_df = filtered_pairs_df.drop(tup2[0])


0it [00:00, ?it/s][A
2it [00:00, 12.56it/s][A
4it [00:00, 13.56it/s][A
6it [00:00, 14.79it/s][A
9it [00:00, 16.44it/s][A
12it [00:00, 17.55it/s][A
15it [00:00, 18.55it/s][A
17it [00:00, 18.42it/s][A
19it [00:01, 18.57it/s][A
21it [00:01, 18.82it/s][A
23it [00:01, 18.78it/s][A
25it [00:01, 18.42it/s][A
27it [00:01, 18.62it/s][A
29it [00:01, 18.44it/s][A
31it [00:01, 18.32it/s][A
34it [00:01, 19.21it/s][A
36it [00:01, 19.23it/s][A
38it [00:02, 19.36it/s][A
40it [00:02, 18.54it/s][A
42it [00:02, 17.54it/s][A
45it [00:02, 18.63it/s][A
47it [00:02, 18.97it/s][A
49it [00:02, 19.11it/s][A
51it [00:02, 19.16it/s][A
53it [00:02, 19.25it/s][A
55it [00:02, 18.76it/s][A
57it [00:03, 18.91it/s][A
59it [00:03, 18.30it/s][A
61it [00:03, 18.10it/s][A
63it [00:03, 16.67it/s][A
65it [00:03, 14.41it/s][A
67it [00:03, 15.26it/s][A
69it [00:03, 16.02it/s][A
71it [00:03, 16.65it/s][A
73it [00:04, 17.33it/s][A
75it [00:04, 17.98it/s][A
77it [00:04, 17.84it/s][A
79it [00:

794it [00:38, 17.45it/s][A
796it [00:38, 17.48it/s][A
798it [00:39, 18.02it/s][A
800it [00:39, 18.18it/s][A
802it [00:39, 17.86it/s][A
804it [00:39, 17.96it/s][A
806it [00:39, 17.02it/s][A
809it [00:39, 18.37it/s][A
812it [00:39, 19.33it/s][A
814it [00:39, 17.52it/s][A
816it [00:40, 17.54it/s][A
818it [00:40, 16.91it/s][A
820it [00:40, 16.69it/s][A
822it [00:40, 16.17it/s][A
824it [00:40, 15.70it/s][A
826it [00:40, 16.45it/s][A
829it [00:40, 17.97it/s][A
832it [00:40, 19.10it/s][A
835it [00:41, 20.16it/s][A
838it [00:41, 19.41it/s][A
840it [00:41, 18.35it/s][A
843it [00:41, 19.54it/s][A
846it [00:41, 20.27it/s][A
849it [00:41, 20.26it/s][A
852it [00:41, 21.22it/s][A
855it [00:42, 18.94it/s][A
857it [00:42, 16.15it/s][A
859it [00:42, 16.61it/s][A
862it [00:42, 17.86it/s][A
865it [00:42, 19.02it/s][A
868it [00:42, 20.18it/s][A
871it [00:42, 18.88it/s][A
874it [00:43, 19.85it/s][A
877it [00:43, 19.63it/s][A
880it [00:43, 19.94it/s][A
883it [00:43, 20.76i

1629it [01:15, 23.08it/s][A
1632it [01:16, 23.10it/s][A
1635it [01:16, 23.16it/s][A
1638it [01:16, 23.14it/s][A
1641it [01:16, 23.03it/s][A
1644it [01:16, 23.29it/s][A
1647it [01:16, 23.40it/s][A
1650it [01:16, 23.29it/s][A
1653it [01:17, 23.41it/s][A
1656it [01:17, 23.59it/s][A
1659it [01:17, 23.87it/s][A
1662it [01:17, 23.52it/s][A
1665it [01:17, 23.38it/s][A
1668it [01:17, 23.35it/s][A
1671it [01:17, 23.46it/s][A
1674it [01:17, 22.37it/s][A
1677it [01:18, 22.37it/s][A
1680it [01:18, 23.31it/s][A
1683it [01:18, 23.20it/s][A
1686it [01:18, 22.97it/s][A
1689it [01:18, 23.47it/s][A
1692it [01:18, 23.66it/s][A
1696it [01:18, 25.48it/s][A
1699it [01:18, 26.26it/s][A
1702it [01:19, 26.00it/s][A
1705it [01:19, 25.23it/s][A
1708it [01:19, 24.38it/s][A
1711it [01:19, 24.04it/s][A
1714it [01:19, 24.08it/s][A
1717it [01:19, 23.89it/s][A
1720it [01:19, 24.05it/s][A
1723it [01:19, 23.83it/s][A
1726it [01:20, 23.66it/s][A
1729it [01:20, 23.47it/s][A
1732it [01:20,

2476it [01:51, 24.58it/s][A
2479it [01:51, 24.37it/s][A
2482it [01:51, 24.31it/s][A
2485it [01:51, 24.31it/s][A
2488it [01:51, 24.64it/s][A
2491it [01:51, 24.47it/s][A
2494it [01:52, 24.11it/s][A
2497it [01:52, 24.23it/s][A
2500it [01:52, 24.17it/s][A
2503it [01:52, 24.13it/s][A
2506it [01:52, 24.14it/s][A
2509it [01:52, 24.11it/s][A
2512it [01:52, 24.15it/s][A
2515it [01:52, 23.73it/s][A
2518it [01:53, 23.88it/s][A
2521it [01:53, 24.02it/s][A
2524it [01:53, 25.04it/s][A
2527it [01:53, 25.99it/s][A
2530it [01:53, 26.71it/s][A
2533it [01:53, 26.76it/s][A
2536it [01:53, 25.48it/s][A
2539it [01:53, 25.25it/s][A
2542it [01:53, 24.87it/s][A
2545it [01:54, 24.65it/s][A
2548it [01:54, 24.14it/s][A
2551it [01:54, 23.88it/s][A
2554it [01:54, 24.14it/s][A
2557it [01:54, 23.56it/s][A
2560it [01:54, 23.88it/s][A
2563it [01:54, 23.90it/s][A
2566it [01:54, 24.50it/s][A
2569it [01:55, 24.56it/s][A
2572it [01:55, 24.33it/s][A
2575it [01:55, 24.24it/s][A
2578it [01:55,

3335it [02:25, 24.58it/s][A
3338it [02:25, 24.57it/s][A
3341it [02:25, 24.69it/s][A
3344it [02:25, 24.78it/s][A
3347it [02:26, 24.95it/s][A
3350it [02:26, 24.70it/s][A
3353it [02:26, 24.79it/s][A
3356it [02:26, 24.50it/s][A
3359it [02:26, 24.77it/s][A
3362it [02:26, 24.68it/s][A
3365it [02:26, 24.90it/s][A
3368it [02:26, 25.03it/s][A
3371it [02:27, 25.11it/s][A
3374it [02:27, 24.96it/s][A
3377it [02:27, 24.85it/s][A
3380it [02:27, 24.97it/s][A
3383it [02:27, 25.36it/s][A
3387it [02:27, 27.23it/s][A
3391it [02:27, 28.65it/s][A
3395it [02:27, 29.77it/s][A
3399it [02:28, 30.86it/s][A
3403it [02:28, 31.22it/s][A
3407it [02:28, 29.35it/s][A
3410it [02:28, 28.07it/s][A
3413it [02:28, 27.22it/s][A
3416it [02:28, 26.32it/s][A
3419it [02:28, 25.85it/s][A
3422it [02:28, 25.35it/s][A
3425it [02:29, 25.03it/s][A
3428it [02:29, 24.84it/s][A
3431it [02:29, 24.76it/s][A
3434it [02:29, 24.67it/s][A
3437it [02:29, 24.74it/s][A
3440it [02:29, 24.39it/s][A
3443it [02:29,

4238it [02:59, 28.13it/s][A
4241it [02:59, 26.62it/s][A
4244it [02:59, 26.01it/s][A
4247it [02:59, 25.90it/s][A
4250it [02:59, 25.94it/s][A
4253it [02:59, 25.85it/s][A
4256it [02:59, 25.42it/s][A
4259it [02:59, 25.60it/s][A
4262it [03:00, 25.97it/s][A
4265it [03:00, 25.55it/s][A
4268it [03:00, 26.07it/s][A
4271it [03:00, 26.34it/s][A
4274it [03:00, 26.14it/s][A
4277it [03:00, 25.76it/s][A
4280it [03:00, 26.02it/s][A
4283it [03:00, 26.20it/s][A
4286it [03:01, 26.25it/s][A
4289it [03:01, 26.07it/s][A
4292it [03:01, 26.07it/s][A
4295it [03:01, 26.31it/s][A
4298it [03:01, 25.87it/s][A
4301it [03:01, 26.00it/s][A
4304it [03:01, 26.26it/s][A
4307it [03:01, 26.12it/s][A
4310it [03:01, 26.15it/s][A
4313it [03:02, 25.97it/s][A
4316it [03:02, 25.28it/s][A
4319it [03:02, 25.67it/s][A
4322it [03:02, 25.96it/s][A
4325it [03:02, 26.09it/s][A
4328it [03:02, 26.13it/s][A
4331it [03:02, 26.07it/s][A
4334it [03:02, 25.94it/s][A
4337it [03:02, 25.59it/s][A
4340it [03:03,

5128it [03:31, 28.59it/s][A
5131it [03:32, 26.01it/s][A
5134it [03:32, 23.51it/s][A
5137it [03:32, 21.83it/s][A
5140it [03:32, 19.68it/s][A
5143it [03:32, 17.39it/s][A
5145it [03:32, 17.25it/s][A
5147it [03:33, 14.79it/s][A
5149it [03:33, 14.93it/s][A
5151it [03:33, 13.82it/s][A
5153it [03:33, 14.77it/s][A
5155it [03:33, 15.26it/s][A
5157it [03:33, 15.17it/s][A
5159it [03:33, 15.07it/s][A
5161it [03:34, 15.27it/s][A
5163it [03:34, 15.15it/s][A
5165it [03:34, 16.22it/s][A
5167it [03:34, 15.82it/s][A
5169it [03:34, 13.96it/s][A
5171it [03:34, 13.76it/s][A
5173it [03:34, 13.35it/s][A
5175it [03:35, 14.80it/s][A
5178it [03:35, 16.41it/s][A
5181it [03:35, 17.59it/s][A
5183it [03:35, 18.15it/s][A
5185it [03:35, 18.56it/s][A
5188it [03:35, 19.61it/s][A
5191it [03:35, 20.63it/s][A
5194it [03:35, 21.41it/s][A
5197it [03:36, 21.90it/s][A
5200it [03:36, 20.75it/s][A
5203it [03:36, 19.22it/s][A
5205it [03:36, 19.20it/s][A
5207it [03:36, 18.42it/s][A
5209it [03:36,

5994it [04:06, 31.28it/s][A
5998it [04:06, 31.62it/s][A
6003it [04:06, 33.98it/s][A
6007it [04:06, 31.96it/s][A
6011it [04:06, 26.92it/s][A
6014it [04:06, 22.94it/s][A
6018it [04:06, 24.93it/s][A
6021it [04:07, 26.01it/s][A
6024it [04:07, 26.85it/s][A
6028it [04:07, 28.37it/s][A
6032it [04:07, 29.63it/s][A
6036it [04:07, 31.07it/s][A
6040it [04:07, 32.61it/s][A
6044it [04:07, 32.08it/s][A
6048it [04:07, 31.71it/s][A
6052it [04:08, 32.15it/s][A
6056it [04:08, 32.08it/s][A
6060it [04:08, 32.41it/s][A
6064it [04:08, 31.67it/s][A
6068it [04:08, 30.58it/s][A
6072it [04:08, 29.71it/s][A
6075it [04:08, 27.32it/s][A
6078it [04:08, 27.17it/s][A
6081it [04:09, 27.90it/s][A
6084it [04:09, 27.64it/s][A
6087it [04:09, 27.40it/s][A
6090it [04:09, 27.12it/s][A
6093it [04:09, 26.39it/s][A
6097it [04:09, 27.61it/s][A
6101it [04:09, 28.86it/s][A
6104it [04:09, 28.81it/s][A
6107it [04:09, 28.74it/s][A
6110it [04:10, 28.14it/s][A
6113it [04:10, 27.81it/s][A
6116it [04:10,

7033it [04:39, 37.78it/s][A
7037it [04:39, 36.83it/s][A
7041it [04:39, 37.12it/s][A
7045it [04:39, 34.79it/s][A
7049it [04:39, 32.76it/s][A
7053it [04:39, 30.89it/s][A
7057it [04:39, 31.19it/s][A
7061it [04:39, 30.71it/s][A
7065it [04:40, 30.48it/s][A
7069it [04:40, 30.80it/s][A
7073it [04:40, 31.56it/s][A
7077it [04:40, 31.33it/s][A
7081it [04:40, 31.12it/s][A
7085it [04:40, 30.83it/s][A
7089it [04:40, 30.52it/s][A
7093it [04:40, 30.38it/s][A
7097it [04:41, 30.80it/s][A
7101it [04:41, 31.45it/s][A
7105it [04:41, 31.88it/s][A
7109it [04:41, 31.33it/s][A
7113it [04:41, 30.93it/s][A
7117it [04:41, 31.09it/s][A
7121it [04:41, 30.91it/s][A
7125it [04:42, 30.95it/s][A
7129it [04:42, 30.82it/s][A
7133it [04:42, 29.93it/s][A
7137it [04:42, 29.52it/s][A
7141it [04:42, 30.14it/s][A
7145it [04:42, 30.21it/s][A
7149it [04:42, 30.64it/s][A
7153it [04:42, 30.21it/s][A
7157it [04:43, 30.26it/s][A
7161it [04:43, 30.13it/s][A
7165it [04:43, 29.98it/s][A
7169it [04:43,

8164it [05:13, 31.68it/s][A
8168it [05:13, 31.36it/s][A
8172it [05:13, 31.67it/s][A
8176it [05:13, 32.58it/s][A
8181it [05:14, 35.13it/s][A
8186it [05:14, 37.10it/s][A
8191it [05:14, 38.72it/s][A
8196it [05:14, 40.12it/s][A
8201it [05:14, 41.17it/s][A
8206it [05:14, 41.98it/s][A
8211it [05:14, 43.12it/s][A
8216it [05:14, 43.12it/s][A
8221it [05:14, 40.63it/s][A
8226it [05:15, 38.19it/s][A
8230it [05:15, 37.58it/s][A
8234it [05:15, 35.66it/s][A
8238it [05:15, 34.20it/s][A
8242it [05:15, 34.19it/s][A
8246it [05:15, 33.67it/s][A
8250it [05:15, 33.57it/s][A
8254it [05:15, 33.36it/s][A
8258it [05:16, 33.19it/s][A
8262it [05:16, 32.95it/s][A
8266it [05:16, 32.61it/s][A
8270it [05:16, 32.86it/s][A
8274it [05:16, 33.53it/s][A
8278it [05:16, 33.52it/s][A
8282it [05:16, 33.21it/s][A
8286it [05:16, 32.93it/s][A
8290it [05:17, 32.96it/s][A
8294it [05:17, 33.41it/s][A
8298it [05:17, 33.04it/s][A
8302it [05:17, 32.59it/s][A
8306it [05:17, 32.47it/s][A
8310it [05:17,

9370it [05:47, 36.44it/s][A
9375it [05:47, 38.25it/s][A
9380it [05:47, 40.48it/s][A
9385it [05:47, 42.05it/s][A
9390it [05:47, 43.71it/s][A
9395it [05:47, 44.64it/s][A
9400it [05:47, 45.39it/s][A
9405it [05:47, 46.16it/s][A
9410it [05:48, 46.88it/s][A
9415it [05:48, 46.95it/s][A
9420it [05:48, 46.47it/s][A
9425it [05:48, 42.28it/s][A
9430it [05:48, 39.95it/s][A
9435it [05:48, 38.38it/s][A
9439it [05:48, 37.81it/s][A
9443it [05:48, 36.29it/s][A
9447it [05:49, 35.89it/s][A
9451it [05:49, 35.33it/s][A
9455it [05:49, 34.96it/s][A
9459it [05:49, 34.78it/s][A
9463it [05:49, 34.29it/s][A
9467it [05:49, 34.38it/s][A
9471it [05:49, 34.54it/s][A
9475it [05:49, 34.38it/s][A
9479it [05:49, 34.51it/s][A
9483it [05:50, 34.41it/s][A
9487it [05:50, 34.53it/s][A
9491it [05:50, 34.39it/s][A
9495it [05:50, 34.27it/s][A
9499it [05:50, 34.11it/s][A
9503it [05:50, 33.62it/s][A
9507it [05:50, 34.71it/s][A
9511it [05:50, 34.68it/s][A
9515it [05:50, 35.15it/s][A
9519it [05:51,

10608it [06:18, 37.31it/s][A
10612it [06:18, 36.77it/s][A
10616it [06:18, 36.55it/s][A
10620it [06:19, 36.58it/s][A
10624it [06:19, 36.05it/s][A
10628it [06:19, 35.79it/s][A
10632it [06:19, 35.70it/s][A
10636it [06:19, 36.22it/s][A
10640it [06:19, 37.11it/s][A
10644it [06:19, 36.49it/s][A
10648it [06:19, 36.34it/s][A
10652it [06:19, 36.68it/s][A
10656it [06:20, 36.52it/s][A
10660it [06:20, 36.95it/s][A
10664it [06:20, 35.58it/s][A
10668it [06:20, 36.35it/s][A
10672it [06:20, 36.46it/s][A
10676it [06:20, 36.65it/s][A
10681it [06:20, 39.48it/s][A
10686it [06:20, 42.14it/s][A
10692it [06:20, 44.90it/s][A
10698it [06:21, 47.22it/s][A
10704it [06:21, 49.37it/s][A
10710it [06:21, 50.03it/s][A
10716it [06:21, 50.43it/s][A
10722it [06:21, 51.25it/s][A
10728it [06:21, 52.33it/s][A
10734it [06:21, 47.88it/s][A
10739it [06:21, 45.22it/s][A
10744it [06:22, 43.62it/s][A
10749it [06:22, 41.69it/s][A
10754it [06:22, 40.45it/s][A
10759it [06:22, 40.32it/s][A
10764it [0

12057it [06:50, 46.10it/s][A
12062it [06:50, 44.76it/s][A
12067it [06:50, 45.60it/s][A
12072it [06:50, 45.65it/s][A
12077it [06:51, 45.30it/s][A
12082it [06:51, 45.24it/s][A
12087it [06:51, 45.53it/s][A
12092it [06:51, 44.77it/s][A
12097it [06:51, 45.18it/s][A
12102it [06:51, 45.85it/s][A
12107it [06:51, 46.03it/s][A
12113it [06:51, 48.89it/s][A
12120it [06:51, 52.24it/s][A
12126it [06:52, 54.27it/s][A
12132it [06:52, 55.53it/s][A
12138it [06:52, 56.51it/s][A
12145it [06:52, 57.91it/s][A
12151it [06:52, 58.51it/s][A
12158it [06:52, 59.51it/s][A
12164it [06:52, 55.14it/s][A
12170it [06:52, 52.32it/s][A
12176it [06:52, 49.71it/s][A
12182it [06:53, 47.84it/s][A
12187it [06:53, 47.80it/s][A
12192it [06:53, 47.13it/s][A
12197it [06:53, 46.60it/s][A
12202it [06:53, 45.10it/s][A
12207it [06:53, 45.56it/s][A
12212it [06:53, 45.97it/s][A
12217it [06:53, 45.75it/s][A
12222it [06:53, 44.74it/s][A
12227it [06:54, 44.89it/s][A
12232it [06:54, 45.08it/s][A
12237it [0

13655it [07:21, 50.82it/s][A
13661it [07:21, 49.67it/s][A
13667it [07:21, 50.07it/s][A
13673it [07:21, 49.29it/s][A
13679it [07:22, 51.76it/s][A
13686it [07:22, 55.84it/s][A
13693it [07:22, 58.32it/s][A
13700it [07:22, 60.23it/s][A
13707it [07:22, 62.20it/s][A
13714it [07:22, 63.87it/s][A
13722it [07:22, 66.05it/s][A
13729it [07:22, 67.12it/s][A
13736it [07:22, 67.38it/s][A
13743it [07:22, 64.56it/s][A
13750it [07:23, 59.22it/s][A
13757it [07:23, 56.08it/s][A
13763it [07:23, 53.38it/s][A
13769it [07:23, 52.51it/s][A
13775it [07:23, 51.64it/s][A
13781it [07:23, 50.51it/s][A
13787it [07:23, 49.63it/s][A
13792it [07:23, 48.92it/s][A
13797it [07:24, 48.94it/s][A
13802it [07:24, 48.26it/s][A
13807it [07:24, 47.43it/s][A
13812it [07:24, 47.61it/s][A
13817it [07:24, 46.96it/s][A
13822it [07:24, 47.17it/s][A
13827it [07:24, 46.98it/s][A
13832it [07:24, 47.62it/s][A
13837it [07:24, 47.57it/s][A
13844it [07:25, 52.46it/s][A
13851it [07:25, 56.22it/s][A
13858it [0

15562it [07:52, 56.93it/s][A
15569it [07:53, 60.12it/s][A
15578it [07:53, 65.78it/s][A
15585it [07:53, 66.82it/s][A
15592it [07:53, 64.57it/s][A
15599it [07:53, 66.09it/s][A
15607it [07:53, 68.58it/s][A
15615it [07:53, 70.57it/s][A
15623it [07:53, 72.32it/s][A
15631it [07:53, 73.30it/s][A
15639it [07:53, 74.73it/s][A
15647it [07:54, 70.54it/s][A
15655it [07:54, 64.78it/s][A
15662it [07:54, 60.93it/s][A
15669it [07:54, 58.12it/s][A
15675it [07:54, 55.83it/s][A
15681it [07:54, 55.62it/s][A
15688it [07:54, 57.14it/s][A
15694it [07:54, 56.44it/s][A
15700it [07:55, 56.86it/s][A
15706it [07:55, 56.48it/s][A
15712it [07:55, 56.90it/s][A
15718it [07:55, 56.16it/s][A
15724it [07:55, 55.58it/s][A
15730it [07:55, 54.85it/s][A
15737it [07:55, 58.13it/s][A
15746it [07:55, 63.93it/s][A
15754it [07:55, 67.50it/s][A
15763it [07:56, 72.69it/s][A
15773it [07:56, 78.97it/s][A
15783it [07:56, 83.64it/s][A
15793it [07:56, 87.18it/s][A
15803it [07:56, 90.35it/s][A
15813it [0

17909it [08:24, 66.80it/s][A
17916it [08:24, 64.12it/s][A
17923it [08:24, 63.46it/s][A
17930it [08:24, 58.95it/s][A
17937it [08:24, 60.41it/s][A
17944it [08:24, 61.35it/s][A
17954it [08:24, 69.35it/s][A
17965it [08:24, 77.52it/s][A
17977it [08:24, 86.02it/s][A
17989it [08:25, 93.01it/s][A
18000it [08:25, 96.94it/s][A
18011it [08:25, 98.65it/s][A
18023it [08:25, 102.58it/s][A
18034it [08:25, 104.13it/s][A
18045it [08:25, 105.40it/s][A
18056it [08:25, 93.79it/s] [A
18066it [08:25, 83.31it/s][A
18075it [08:26, 76.56it/s][A
18084it [08:26, 73.80it/s][A
18092it [08:26, 70.68it/s][A
18100it [08:26, 69.38it/s][A
18108it [08:26, 68.38it/s][A
18115it [08:26, 66.83it/s][A
18122it [08:26, 66.22it/s][A
18129it [08:26, 65.76it/s][A
18136it [08:26, 65.23it/s][A
18146it [08:27, 72.47it/s][A
18158it [08:27, 81.17it/s][A
18169it [08:27, 87.06it/s][A
18180it [08:27, 92.02it/s][A
18191it [08:27, 95.99it/s][A
18203it [08:27, 100.53it/s][A
18214it [08:27, 102.97it/s][A
1822

20964it [08:55, 69.10it/s][A
20972it [08:56, 63.42it/s][A
20980it [08:56, 65.08it/s][A
20989it [08:56, 70.67it/s][A
20999it [08:56, 76.34it/s][A
21008it [08:56, 78.07it/s][A
21018it [08:56, 82.35it/s][A
21028it [08:56, 85.26it/s][A
21038it [08:56, 88.28it/s][A
21048it [08:56, 88.05it/s][A
21057it [08:57, 86.13it/s][A
21066it [08:57, 85.94it/s][A
21076it [08:57, 88.94it/s][A
21085it [08:57, 85.29it/s][A
21094it [08:57, 82.36it/s][A
21103it [08:57, 69.06it/s][A
21111it [08:57, 63.56it/s][A
21118it [08:57, 59.72it/s][A
21125it [08:58, 57.25it/s][A
21131it [08:58, 56.07it/s][A
21137it [08:58, 54.73it/s][A
21143it [08:58, 52.58it/s][A
21149it [08:58, 51.46it/s][A
21155it [08:58, 49.88it/s][A
21161it [08:58, 50.81it/s][A
21170it [08:58, 57.41it/s][A
21179it [08:59, 63.88it/s][A
21188it [08:59, 69.44it/s][A
21197it [08:59, 74.36it/s][A
21206it [08:59, 77.10it/s][A
21216it [08:59, 80.47it/s][A
21226it [08:59, 83.92it/s][A
21235it [08:59, 79.82it/s][A
21245it [0

25301it [09:27, 224.30it/s][A
25325it [09:28, 177.63it/s][A
25346it [09:28, 156.05it/s][A
25373it [09:28, 178.07it/s][A
25400it [09:28, 197.21it/s][A
25427it [09:28, 213.79it/s][A
25456it [09:28, 230.83it/s][A
25482it [09:28, 234.66it/s][A
25507it [09:28, 182.59it/s][A
25528it [09:29, 166.27it/s][A
25556it [09:29, 188.73it/s][A
25585it [09:29, 210.53it/s][A
25615it [09:29, 231.16it/s][A
25644it [09:29, 244.67it/s][A
25671it [09:29, 242.55it/s][A
25697it [09:29, 189.44it/s][A
25719it [09:30, 185.24it/s][A
25749it [09:30, 208.76it/s][A
25779it [09:30, 228.84it/s][A
25810it [09:30, 246.95it/s][A
25839it [09:30, 257.93it/s][A
25867it [09:30, 201.51it/s][A
25891it [09:30, 186.62it/s][A
25921it [09:30, 209.76it/s][A
25953it [09:30, 233.79it/s][A
25985it [09:31, 253.28it/s][A
26016it [09:31, 266.79it/s][A
26045it [09:31, 218.24it/s][A
26070it [09:31, 181.61it/s][A
26100it [09:31, 204.70it/s][A
26130it [09:31, 225.68it/s][A
26160it [09:31, 243.08it/s][A
26191it 

In [701]:
filtered_pairs_df.shape

(17833, 4)

In [702]:
filtered_pairs_df.head(200)

Unnamed: 0,pairs,counts,ingred1,ingred2
1,"(all-purpose flour, almond)",657,all-purpose flour,almond
2,"(all-purpose flour, almond extract)",634,all-purpose flour,almond extract
3,"(all-purpose flour, apple)",356,all-purpose flour,apple
4,"(all-purpose flour, avocado)",53,all-purpose flour,avocado
5,"(all-purpose flour, bacon)",411,all-purpose flour,bacon
6,"(all-purpose flour, baking powder)",6721,all-purpose flour,baking powder
7,"(all-purpose flour, baking soda)",5199,all-purpose flour,baking soda
8,"(all-purpose flour, balsamic vinegar)",141,all-purpose flour,balsamic vinegar
9,"(all-purpose flour, banana)",247,all-purpose flour,banana
10,"(all-purpose flour, basil)",466,all-purpose flour,basil


In [703]:
filtered_pairs_df.to_csv('filtered_pairs.csv')

In [704]:
len(df['ingreds_list'])

122971