In [1]:
import re
import sys
import json
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
with open('recipe_collection.json', 'r') as f:
    data = json.load(f)
len(data)

1827

In [3]:
labels = []
for record in data:
    labels.append(data[record]['label'])

In [4]:
ingredient_lists = []
for record in data:
    ingredient_lists.append(data[record]['ingredientLines'])

In [5]:
len(labels), len(ingredient_lists)

(1827, 1827)

In [233]:
scrubbed_ingredients = []
measurers = re.compile(r' (cups|cup|kg|tsp|sp|ounces|grams|tb|gram|ounce|mililiters|qt|tablespoon|tablespoons|teaspoon|tbs|tbsp|oz|can|large|small|pinch|bag|lb|ml|tub) ')
descriptors = re.compile('(slice[d]?|about|finely|chopped|new|all-purpose|all purpose|thinly|thin|plus|optionalcrushed|crumbles|ground|minced|powder|chopped|sliced)')
ammounts = re.compile(r'\d+(\/\d+|\.\d+)?') # TODO use these to weight vector averaging?
stopwords = re.compile(r' (or|the|other|of|to|on|for) ')
nonascii = re.compile(r'[^\x00-\x7F]+')
multiplespaces = re.compile(' +')

for ingredient_list in ingredient_lists:
    scrubbed_lines = []
    for line in ingredient_list:
        line = ' ' + line.lower() + ' '
        
        line = line.replace('*', ' ')
        line = line.replace(':', ' ')
        line = line.replace(',', ' ')
        line = line.replace('.', ' ')
        line = line.replace(')', ' ')
        line = line.replace('(', ' ')
        
        line = nonascii.sub('', line)
        line = measurers.sub('', line)
        line = descriptors.sub('', line)
        line = ammounts.sub('', line) 
        line = stopwords.sub('', line) 
        line = multiplespaces.sub(' ', line).strip()
        
        line = ' '.join( [w for w in line.split() if len(w)>1] )
        
        scrubbed_lines.append(line)
        
    scrubbed_ingredients.append(scrubbed_lines)

In [234]:
idx = 14
for i in range(len(ingredient_lists[idx])):
    print(f'{ingredient_lists[idx][i]:<60} | {scrubbed_ingredients[idx][i]}')

2 cups instant rice (such as Minute Rice)                    | instant rice such as minute rice
1/2 cup vanilla ice cream                                    | vanilla ice cream
1/2 teaspoon ground cinnamon, plus more for garnishing (optional) | cinnamon moregarnishing optional


In [235]:
from collections import Counter
vocab = Counter((line for ingredientlist in scrubbed_ingredients for line in ingredientlist))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

7457

In [236]:
sorted_vocab[:100]

[('salt', 447),
 ('olive oil', 280),
 ('eggs', 223),
 ('sugar', 207),
 ('flour', 187),
 ('water', 171),
 ('onion', 135),
 ('kosher salt', 119),
 ('butter', 115),
 ('vegetable oil', 92),
 ('cloves garlic', 85),
 ('freshly black pepper', 83),
 ('extra-virgin olive oil', 81),
 ('vanilla extract', 80),
 ('unsalted butter', 75),
 ('milk', 73),
 ('baking', 72),
 ('heavy cream', 69),
 ('egg', 68),
 ('pepper', 65),
 ('cumin', 64),
 ('garlic cloves', 63),
 ('clove garlic', 58),
 ('granulated sugar', 57),
 ('lemon juice', 56),
 ('black pepper', 54),
 ('baking soda', 53),
 ('soy sauce', 52),
 ('cinnamon', 51),
 ('fresh lemon juice', 51),
 ('garlic', 50),
 ('garlic clove', 48),
 ('scallions', 48),
 ('salt and pepper', 46),
 ('freshly pepper', 44),
 ('mayonnaise', 42),
 ('red onion', 41),
 ('fresh parsley', 39),
 ('cooking spray', 39),
 ('fresh cilantro', 38),
 ('whole milk', 38),
 ('kosher salt and freshly black pepper', 37),
 ('green onions', 37),
 ('sour cream', 35),
 ('chicken broth', 34),
 ('t

In [237]:
sorted_vocab[-100:]

[('tomato passata', 1),
 ('beaten egg brushing', 1),
 ('dark muscovado sugar', 1),
 ('saffron threads steeped in hot waterat least minutes', 1),
 ('the saffron custard', 1),
 ('the tomato sauce', 1),
 ('tomato pound', 1),
 ('seeded jalapeo pepper', 1),
 ('vegetable stock crumbled stock cube', 1),
 ('few basil leaves', 1),
 ('handfulbasil leaves optional', 1),
 ('tins plum tomato drained', 1),
 ('fresh garlic', 1),
 ('seeded peeled tomato', 1),
 ('low-sodium tomato juice', 1),
 ('basil leaves optional', 1),
 ('-inch-thick diagonally cut french bread baguette', 1),
 ('packetpizza dough mix made up', 1),
 ('capers rinsed and drained', 1),
 ('taleggio cheese', 1),
 ('-ounce part-skim mozzarella', 1),
 ('tintomato puree', 1),
 ('tinstomatoes', 1),
 ('tubmascarpone', 1),
 ('bunch fresh basil somegarnish', 1),
 ('tubsour cream', 1),
 ('roundedsalt', 1),
 ('sultana', 1),
 ('cooking apple', 1),
 ('light muscovado sugar', 1),
 ('litre jar spiced pickling vinegar', 1),
 ('-ounce part-skim mozzare

In [238]:
len(list(filter(lambda x: x[1] > 1, sorted_vocab)))

1418

In [258]:
flat_ingredients = []
for ilist in scrubbed_ingredients:
    doc = []
    for line in ilist:
        doc = doc + line.split(' ')
    flat_ingredients.append(doc)

In [259]:
scrubbed_ingredients[0]

['long-grain white rice', 'water', 'salt', 'olive oil']

In [260]:
flat_ingredients[0]

['long-grain', 'white', 'rice', 'water', 'salt', 'olive', 'oil']

In [261]:
vocab = Counter((word for doc in flat_ingredients for word in doc))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

4300

In [286]:
from gensim.models import Word2Vec, FastText

In [267]:
len(documents), len(labels)

(1827, 1827)

In [268]:
ft = FastText(
    flat_ingredients,
    size=32,
    window=5,
    min_count=3,
    workers=4
)

In [283]:
ft.wv.most_similar('butter')

[('buttermilk', 0.9999380111694336),
 ('buttercream', 0.9997366070747375),
 ('butternut', 0.9996468424797058),
 ('pure', 0.9994291067123413),
 ('butternonstick', 0.9993642568588257),
 ('granulated', 0.9993410706520081),
 ('sugarcake', 0.9992921948432922),
 ('sodacake', 0.9992570877075195),
 ('cake', 0.9992561936378479),
 ('mixture', 0.9992228746414185)]

In [282]:
ft.wv.most_similar('milk')

[('eggs', 0.9997900724411011),
 ('sweetened', 0.9997531175613403),
 ('bittersweet', 0.9997435212135315),
 ('icing', 0.999705970287323),
 ('quick-cooking', 0.9997024536132812),
 ('flour', 0.9996886253356934),
 ('cream', 0.9996626377105713),
 ('make', 0.9996609687805176),
 ('making', 0.9996545910835266),
 ('dusting', 0.9996429681777954)]

In [281]:
ft.wv.most_similar('tortilla')

[('whole', 0.9999223947525024),
 ('low-fatnonfat', 0.9999113082885742),
 ('whole-wheat', 0.9999065399169922),
 ('raspberries', 0.9999063014984131),
 ('strawberries', 0.9999051094055176),
 ('beaten', 0.9999011754989624),
 ('chorizo', 0.9998979568481445),
 ('cranberries', 0.9998958110809326),
 ('wheat', 0.9998924136161804),
 ('creamed', 0.9998872876167297)]

In [280]:
ft.wv.most_similar('sausage')

[('sausages', 0.9999262690544128),
 ('sage', 0.9998476505279541),
 ('breads', 0.9994004368782043),
 ('breasts', 0.9994001388549805),
 ('breast', 0.9993953704833984),
 ('toast', 0.9993199706077576),
 ('pork', 0.9993152618408203),
 ('breadcrumbs', 0.9993128776550293),
 ('bread', 0.9993095993995667),
 ('breakfast', 0.9993026256561279)]

In [284]:
ft.wv.most_similar('sugar')

[('granulated', 0.9997829794883728),
 ('sugarcake', 0.9997773170471191),
 ('egg', 0.9997565746307373),
 ('unsweetened', 0.9997162818908691),
 ('softened', 0.9996815323829651),
 ('sodacake', 0.9996802806854248),
 ('baking', 0.9996529221534729),
 ('chocolate', 0.9996036291122437),
 ('cake', 0.9995745420455933),
 ('cooking', 0.99956214427948)]

In [None]:
w2v = Word2Vec(
    
)

In [262]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(flat_ingredients)]
documents[0]

TaggedDocument(words=['long-grain', 'white', 'rice', 'water', 'salt', 'olive', 'oil'], tags=[0])

In [297]:
d2v = Doc2Vec(
    documents,
    vector_size=32,
    window=3,
    min_count=1,
    workers=4,
    epochs=40
)
d2v.train(documents, total_examples=d2v.corpus_count, epochs=model.epochs)

In [285]:
d2v.wv.most_similar(['butter'])

[('sticks', 0.9994559288024902),
 ('stick', 0.9993504285812378),
 ('unsalted', 0.9993353486061096),
 ('european-style', 0.9990919828414917),
 ('softened', 0.999055027961731),
 ('egg', 0.9990079402923584),
 ('melted', 0.9990049004554749),
 ('cookies', 0.9988588690757751),
 ('eggs', 0.998816967010498),
 ('heavy', 0.9987289905548096)]

In [301]:
model.infer_vector(documents[0].words)

array([ 0.0149958 ,  0.01081163, -0.01309573, -0.02105867, -0.00119628,
        0.00126277,  0.00197936, -0.00545138,  0.01015568, -0.00405203,
       -0.00777297, -0.01053615, -0.0094697 ,  0.00676797, -0.01012329,
        0.00471275,  0.00381594, -0.0008285 ,  0.01533036,  0.00118851,
       -0.00912933,  0.00848218, -0.02350868,  0.00286595,  0.00446338,
       -0.00521705,  0.00781466, -0.00470494,  0.00164694, -0.00220502,
        0.01477924,  0.00778789], dtype=float32)