In [1]:
import re
import sys
import json
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
with open('/Users/ed/Development/Healthi/HealthieBackend/resources/data/recipe_collection.json', 'r') as f:
    data = json.load(f)
len(data)

2224

In [3]:
labels = []
for record in data:
    labels.append(data[record]['label'])

In [4]:
ingredient_lists = []
uris = []

for record in data:
    uris.append(record)
    ingredient_lists.append(data[record]['ingredientLines'])

In [5]:
len(labels), len(ingredient_lists)

(2224, 2224)

In [6]:
scrubbed_ingredients = []
measurers = re.compile(r' (cups|cup|kg|tsp|sp|ounces|grams|tb|gram|ounce|mililiters|qt|tablespoon|tablespoons|teaspoon|tbs|tbsp|oz|can|large|small|pinch|bag|lb|ml|tub) ')
descriptors = re.compile('(slice[d]?|about|finely|chopped|new|all-purpose|all purpose|thinly|thin|plus|optionalcrushed|crumbles|ground|minced|powder|chopped|sliced)')
ammounts = re.compile(r'\d+(\/\d+|\.\d+)?') # TODO use these to weight vector averaging?
stopwords = re.compile(r' (or|the|other|of|to|on|for) ')
nonascii = re.compile(r'[^\x00-\x7F]+')
multiplespaces = re.compile(' +')

for ingredient_list in ingredient_lists:
    scrubbed_lines = []
    for line in ingredient_list:
        line = ' ' + line.lower() + ' '
        
        line = line.replace('*', ' ')
        line = line.replace(':', ' ')
        line = line.replace(',', ' ')
        line = line.replace('.', ' ')
        line = line.replace(')', ' ')
        line = line.replace('(', ' ')
        
        line = nonascii.sub('', line)
        line = measurers.sub('', line)
        line = descriptors.sub('', line)
        line = ammounts.sub('', line) 
        line = stopwords.sub('', line) 
        line = multiplespaces.sub(' ', line).strip()
        
        line = ' '.join( [w for w in line.split() if len(w)>1] )
        
        scrubbed_lines.append(line)
        
    scrubbed_ingredients.append(scrubbed_lines)

In [7]:
idx = 14
for i in range(len(ingredient_lists[idx])):
    print(f'{ingredient_lists[idx][i]:<60} | {scrubbed_ingredients[idx][i]}')

2 cups instant rice (such as Minute Rice)                    | instant rice such as minute rice
1/2 cup vanilla ice cream                                    | vanilla ice cream
1/2 teaspoon ground cinnamon, plus more for garnishing (optional) | cinnamon moregarnishing optional


In [8]:
from collections import Counter
vocab = Counter((line for ingredientlist in scrubbed_ingredients for line in ingredientlist))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

8742

In [9]:
sorted_vocab[:100]

[('salt', 539),
 ('eggs', 382),
 ('olive oil', 305),
 ('sugar', 285),
 ('flour', 249),
 ('water', 207),
 ('butter', 154),
 ('onion', 148),
 ('kosher salt', 140),
 ('milk', 134),
 ('cinnamon', 129),
 ('vanilla extract', 126),
 ('unsalted butter', 113),
 ('baking', 112),
 ('vegetable oil', 111),
 ('egg', 95),
 ('heavy cream', 93),
 ('baking soda', 93),
 ('freshly black pepper', 90),
 ('cloves garlic', 89),
 ('extra-virgin olive oil', 85),
 ('pepper', 75),
 ('granulated sugar', 72),
 ('cumin', 67),
 ('garlic cloves', 65),
 ('clove garlic', 65),
 ('lemon juice', 61),
 ('whole milk', 59),
 ('fresh lemon juice', 59),
 ('soy sauce', 58),
 ('black pepper', 57),
 ('garlic', 56),
 ('cooking spray', 54),
 ('garlic clove', 52),
 ('salt and pepper', 52),
 ('scallions', 52),
 ('teaspoons baking', 52),
 ('honey', 47),
 ('freshly pepper', 46),
 ('red onion', 46),
 ('buttermilk', 46),
 ('mayonnaise', 42),
 ('nutmeg', 42),
 ('egg whites', 42),
 ('green onions', 41),
 ('fresh parsley', 41),
 ('sour cream

In [10]:
sorted_vocab[-100:]

[('teaspoon cinnamon', 1),
 ("confectioners' sugar sprinkling", 1),
 ('-inch thick bread brioche challah pullman', 1),
 ('butter divided moreserving', 1),
 ('jam and maple syrupserving optional', 1),
 ('challah brioche white bread', 1),
 ('-ounce french bread loaf', 1),
 ('-ounce package frozen whole strawberries thawed', 1),
 ('garnish ed sugar', 1),
 ('matcha green tea', 1),
 ('each cinnamon and cardamom', 1),
 ('egg whites beaten', 1),
 ('french vanilla creamer', 1),
 ('thick-cut honey wheat bread', 1),
 ('salted butter pan-frying', 1),
 ('"-thick french bread', 1),
 ('-ounce package pecans', 1),
 ('multi-grain bread', 1),
 ('texas toastother -inch- white bread', 1),
 ('-ounce diagonally cut french bread inch thick', 1),
 ('loaf sturdy artisan-style bread', 1),
 ('low-fat milk %%', 1),
 ('whole grain cinnamon-raisin bread diagonally', 1),
 ('sugar-free maple syrup teaspoons strawberry jam', 1),
 ('egg beaters original', 1),
 ('pam original no-stick cooking spray', 1),
 ('healthy cho

In [11]:
len(list(filter(lambda x: x[1] > 1, sorted_vocab)))

1674

In [12]:
flat_ingredients = []
for ilist in scrubbed_ingredients:
    doc = []
    for line in ilist:
        doc = doc + line.split(' ')
    flat_ingredients.append(doc)

In [13]:
scrubbed_ingredients[0]

['long-grain white rice', 'water', 'salt', 'olive oil']

In [14]:
flat_ingredients[0]

['long-grain', 'white', 'rice', 'water', 'salt', 'olive', 'oil']

In [15]:
vocab = Counter((word for doc in flat_ingredients for word in doc))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

4852

In [16]:
from gensim.models import Word2Vec, FastText

In [17]:
ft = FastText(
    flat_ingredients,
    size=32,
    window=5,
    min_count=3,
    workers=4
)

In [18]:
ft.wv.most_similar('butter')

[('buttermilk', 0.9995287656784058),
 ('buttercooking', 0.9982272982597351),
 ('melted', 0.9981725215911865),
 ('unsalted', 0.9980701208114624),
 ('extract', 0.997821569442749),
 ('knobbutter', 0.997783899307251),
 ('-milk', 0.9975804090499878),
 ('buttercream', 0.9971997737884521),
 ("van's", 0.9971935749053955),
 ('sugar', 0.9969752430915833)]

In [19]:
ft.wv.most_similar('milk')

[('-milk', 0.9997034668922424),
 ('sugarcake', 0.9996215105056763),
 ('extractcake', 0.9994295239448547),
 ('buttercream', 0.9993956685066223),
 ('sodacake', 0.9991245269775391),
 ('butternut', 0.9990596771240234),
 ('granulated', 0.9989905953407288),
 ('cookie', 0.9989561438560486),
 ('butteroil', 0.9988692998886108),
 ('dark-brown', 0.9987518787384033)]

In [20]:
ft.wv.most_similar('tortilla')

[('ingredientsthe', 0.9998887181282043),
 ('strawberries', 0.9998645782470703),
 ('cranberries', 0.9998346567153931),
 ('family-size', 0.9998199939727783),
 ('head', 0.9998060464859009),
 ('the', 0.9998021125793457),
 ('fatty', 0.9997815489768982),
 ('ingredient', 0.99977707862854),
 ('brandy', 0.9997754096984863),
 ('apple', 0.999772310256958)]

In [21]:
ft.wv.most_similar('sausage')

[('sausages', 0.9998112320899963),
 ('sage', 0.9997497200965881),
 ('cabbage', 0.998917281627655),
 ('pork', 0.9987697005271912),
 ('through', 0.998754620552063),
 ('hard-cooked', 0.998735249042511),
 ('spicy', 0.9987351298332214),
 ('one', 0.9987209439277649),
 ('round', 0.9986944794654846),
 ('portions', 0.9986854791641235)]

In [22]:
ft.wv.most_similar('sugar')

[('sugarcake', 0.9993815422058105),
 ('buttercooking', 0.9992620348930359),
 ('-milk', 0.9991952180862427),
 ('extract', 0.9990784525871277),
 ('milk', 0.9986667633056641),
 ("van's", 0.9982786774635315),
 ('extractcake', 0.9980489015579224),
 ('sodacake', 0.9979407787322998),
 ('cookie', 0.9977536201477051),
 ('making', 0.9975316524505615)]

In [23]:
vectors = []
for flist in flat_ingredients:
    vectors.append(np.mean([ft.wv[ingredient] for ingredient in flist], axis=0))

In [24]:
assert len(uris) == len(vectors)
embeddings = { uris[i]:vectors[i] for i in range(len(vectors)) }

In [25]:
for recipe in data:
    data[recipe]['vec_repr'] = embeddings[recipe].tolist()

## food type labeling 

breakfast = 0
dinner = 1

In [26]:
type(labels)

list

In [32]:
# for i in range(0, len(uris), 30):
#     print(labels[i])
#     data[uris[i]]['food_type'] = input()

Rice
1
Morimoto's Sushi Rice
1
Eggy Fried Rice
1
Crisp Sushi-Rice Cakes
1
Chicken Marsala
1
Poached Chicken for Tacos
1
Chicken Marsala
1
California Pizza Kitchen's Jack-O-Lantern Pizza Recipe
1
Grape Pizza
1
Salami and Mushroom Pizza
1
Cakespy: Turn Leftover Christmas Cookies Into Croutons
1
Molasses Cookies With Marshmallows and Peaches
1
Cranberry Chocolate Marshmallow Cookies
1
American Hero Cookies
1
Black and Tan Brownies
1
Orange-Kissed Brownies
1
Peanut Butter Espresso Brownies
1
Salmon Spread
1
Potato & Salmon Grill
1
Orange-Jalapeno Salmon
1
Creamed Spinach
1
Spinach Cocktail
1
Spinach Spaetzle
1
Spanish Spinach Omelette
1
Vegetable Casserole
1
Poached Eggs


KeyboardInterrupt: 

In [33]:
food_type_labels = []
for i, recipe in enumerate(data):
    if 'food_type' in data[recipe]:
        food_type_labels.append(int(data[recipe]['food_type']))
    else:
        food_type_labels.append(-1)

food_type_labels = np.array(food_type_labels)

In [34]:
(food_type_labels == 1).sum()

86

In [35]:
(food_type_labels == 0).sum()

5

In [None]:
food_type_labels

In [36]:
# with open('/Users/ed/Development/Healthi/HealthieBackend/resources/data/recipe_collection.json', 'w') as fp:
#     json.dump(data, fp)