In [1]:
import re
import sys
import json
import numpy as np
import pandas as pd
from pprint import pprint

In [2]:
with open('/Users/ed/Development/Healthi/HealthieBackend/resources/data/recipe_collection.json', 'r') as f:
    data = json.load(f)
len(data)

2224

In [3]:
labels = []
for record in data:
    labels.append(data[record]['label'])

In [4]:
ingredient_lists = []
uris = []

for record in data:
    uris.append(record)
    ingredient_lists.append(data[record]['ingredientLines'])

In [5]:
len(labels), len(ingredient_lists)

(2224, 2224)

In [6]:
scrubbed_ingredients = []
measurers = re.compile(r' (cups|cup|kg|tsp|sp|ounces|grams|tb|gram|ounce|mililiters|qt|tablespoon|tablespoons|teaspoon|tbs|tbsp|oz|can|large|small|pinch|bag|lb|ml|tub) ')
descriptors = re.compile('(slice[d]?|about|finely|chopped|new|all-purpose|all purpose|thinly|thin|plus|optionalcrushed|crumbles|ground|minced|powder|chopped|sliced)')
ammounts = re.compile(r'\d+(\/\d+|\.\d+)?') # TODO use these to weight vector averaging?
stopwords = re.compile(r' (or|the|other|of|to|on|for) ')
nonascii = re.compile(r'[^\x00-\x7F]+')
multiplespaces = re.compile(' +')

for ingredient_list in ingredient_lists:
    scrubbed_lines = []
    for line in ingredient_list:
        line = ' ' + line.lower() + ' '
        
        line = line.replace('*', ' ')
        line = line.replace(':', ' ')
        line = line.replace(',', ' ')
        line = line.replace('.', ' ')
        line = line.replace(')', ' ')
        line = line.replace('(', ' ')
        
        line = nonascii.sub('', line)
        line = measurers.sub('', line)
        line = descriptors.sub('', line)
        line = ammounts.sub('', line) 
        line = stopwords.sub('', line) 
        line = multiplespaces.sub(' ', line).strip()
        
        line = ' '.join( [w for w in line.split() if len(w)>1] )
        
        scrubbed_lines.append(line)
        
    scrubbed_ingredients.append(scrubbed_lines)

In [7]:
idx = 14
for i in range(len(ingredient_lists[idx])):
    print(f'{ingredient_lists[idx][i]:<60} | {scrubbed_ingredients[idx][i]}')

2 cups instant rice (such as Minute Rice)                    | instant rice such as minute rice
1/2 cup vanilla ice cream                                    | vanilla ice cream
1/2 teaspoon ground cinnamon, plus more for garnishing (optional) | cinnamon moregarnishing optional


In [8]:
from collections import Counter
vocab = Counter((line for ingredientlist in scrubbed_ingredients for line in ingredientlist))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

8742

In [9]:
sorted_vocab[:100]

[('salt', 539),
 ('eggs', 382),
 ('olive oil', 305),
 ('sugar', 285),
 ('flour', 249),
 ('water', 207),
 ('butter', 154),
 ('onion', 148),
 ('kosher salt', 140),
 ('milk', 134),
 ('cinnamon', 129),
 ('vanilla extract', 126),
 ('unsalted butter', 113),
 ('baking', 112),
 ('vegetable oil', 111),
 ('egg', 95),
 ('heavy cream', 93),
 ('baking soda', 93),
 ('freshly black pepper', 90),
 ('cloves garlic', 89),
 ('extra-virgin olive oil', 85),
 ('pepper', 75),
 ('granulated sugar', 72),
 ('cumin', 67),
 ('garlic cloves', 65),
 ('clove garlic', 65),
 ('lemon juice', 61),
 ('whole milk', 59),
 ('fresh lemon juice', 59),
 ('soy sauce', 58),
 ('black pepper', 57),
 ('garlic', 56),
 ('cooking spray', 54),
 ('garlic clove', 52),
 ('salt and pepper', 52),
 ('scallions', 52),
 ('teaspoons baking', 52),
 ('honey', 47),
 ('freshly pepper', 46),
 ('red onion', 46),
 ('buttermilk', 46),
 ('mayonnaise', 42),
 ('nutmeg', 42),
 ('egg whites', 42),
 ('green onions', 41),
 ('fresh parsley', 41),
 ('sour cream

In [10]:
sorted_vocab[-100:]

[('teaspoon cinnamon', 1),
 ("confectioners' sugar sprinkling", 1),
 ('-inch thick bread brioche challah pullman', 1),
 ('butter divided moreserving', 1),
 ('jam and maple syrupserving optional', 1),
 ('challah brioche white bread', 1),
 ('-ounce french bread loaf', 1),
 ('-ounce package frozen whole strawberries thawed', 1),
 ('garnish ed sugar', 1),
 ('matcha green tea', 1),
 ('each cinnamon and cardamom', 1),
 ('egg whites beaten', 1),
 ('french vanilla creamer', 1),
 ('thick-cut honey wheat bread', 1),
 ('salted butter pan-frying', 1),
 ('"-thick french bread', 1),
 ('-ounce package pecans', 1),
 ('multi-grain bread', 1),
 ('texas toastother -inch- white bread', 1),
 ('-ounce diagonally cut french bread inch thick', 1),
 ('loaf sturdy artisan-style bread', 1),
 ('low-fat milk %%', 1),
 ('whole grain cinnamon-raisin bread diagonally', 1),
 ('sugar-free maple syrup teaspoons strawberry jam', 1),
 ('egg beaters original', 1),
 ('pam original no-stick cooking spray', 1),
 ('healthy cho

In [11]:
len(list(filter(lambda x: x[1] > 1, sorted_vocab)))

1674

In [12]:
flat_ingredients = []
for ilist in scrubbed_ingredients:
    doc = []
    for line in ilist:
        doc = doc + line.split(' ')
    flat_ingredients.append(doc)

In [13]:
scrubbed_ingredients[0]

['long-grain white rice', 'water', 'salt', 'olive oil']

In [14]:
flat_ingredients[0]

['long-grain', 'white', 'rice', 'water', 'salt', 'olive', 'oil']

In [15]:
vocab = Counter((word for doc in flat_ingredients for word in doc))
sorted_vocab = sorted(vocab.items(), key=lambda x:x[1], reverse=True)
len(sorted_vocab)

4852

In [16]:
from gensim.models import Word2Vec, FastText

In [17]:
ft = FastText(
    flat_ingredients,
    size=32,
    window=5,
    min_count=3,
    workers=4
)

In [18]:
ft.wv.most_similar('butter')

[('buttermilk', 0.9996306896209717),
 ('unsalted', 0.9987801313400269),
 ('buttercream', 0.99875807762146),
 ('buttercooking', 0.9986683130264282),
 ('egg', 0.9982967972755432),
 ('melted', 0.9980528354644775),
 ('sugarcake', 0.9980248212814331),
 ('extractcake', 0.9980005025863647),
 ('pure', 0.9979914426803589),
 ('knobbutter', 0.9979079961776733)]

In [19]:
ft.wv.most_similar('milk')

[('extractthe', 0.9994943737983704),
 ('cake', 0.999474287033081),
 ('-milk', 0.9994641542434692),
 ('granulated', 0.9993367195129395),
 ('flour', 0.9991613626480103),
 ('flourcake', 0.9991430044174194),
 ('cookies', 0.9990295767784119),
 ('butternonstick', 0.9989939332008362),
 ('light-brown', 0.9989777207374573),
 ('butternut', 0.9989717602729797)]

In [20]:
ft.wv.most_similar('tortilla')

[('strawberries', 0.9998306035995483),
 ('walnut', 0.9998068809509277),
 ('cranberries', 0.9997902512550354),
 ('fatty', 0.999786913394928),
 ('waffles', 0.9997825622558594),
 ('nutella', 0.9997705221176147),
 ('courgettes', 0.9997624158859253),
 ('brandy', 0.9997563362121582),
 ('corkscrew', 0.9997532963752747),
 ('raspberry', 0.9997437596321106)]

In [21]:
ft.wv.most_similar('sausage')

[('sage', 0.999760627746582),
 ('sausages', 0.9997080564498901),
 ('cabbage', 0.9991718530654907),
 ('precooked', 0.9990386962890625),
 ('sambal', 0.9990004301071167),
 ('enough', 0.9989926218986511),
 ('fully-cooked', 0.9989751577377319),
 ('roast', 0.9989744424819946),
 ('uncooked', 0.998968243598938),
 ('muffin', 0.9989447593688965)]

In [22]:
ft.wv.most_similar('sugar')

[('sugarcake', 0.9997738599777222),
 ('sodacake', 0.9992762207984924),
 ('extractcake', 0.9992440938949585),
 ('buttercooking', 0.9990527629852295),
 ('making', 0.9988678097724915),
 ('egg', 0.9988588690757751),
 ('cooking', 0.9988409876823425),
 ("van's", 0.9986138343811035),
 ('dark-brown', 0.9986035823822021),
 ('-milk', 0.9984099864959717)]

In [23]:
vectors = []
for flist in flat_ingredients:
    vectors.append(np.mean([ft.wv[ingredient] for ingredient in flist], axis=0))

In [24]:
assert len(uris) == len(vectors)
embeddings = { uris[i]:vectors[i] for i in range(len(vectors)) }

In [25]:
for recipe in data:
    data[recipe]['vec_repr'] = embeddings[recipe].tolist()

## food type labeling 

breakfast = 0
dinner = 1

In [26]:
type(labels)

list

In [42]:
for i in range(1600, len(uris), 10):
    print(labels[i])
    data[uris[i]]['food_type'] = input()

"Eat Your Vegetables!": Green Bean Salad with Red Onion and Tomato
1
Marinated Chickpea Salad with Radishes and Cucumber


KeyboardInterrupt: 

In [43]:
food_type_labels = []
for i, recipe in enumerate(data):
    if 'food_type' in data[recipe]:
        food_type_labels.append(int(data[recipe]['food_type']))
    else:
        food_type_labels.append(-1)

food_type_labels = np.array(food_type_labels)

In [44]:
(food_type_labels == 0).sum(), (food_type_labels == 1).sum()

(406, 130)

In [45]:
food_type_labels

array([ 1, -1, -1, ...,  0,  0,  0])

In [50]:
len(vectors), food_type_labels.shape

(2224, (2224,))

In [51]:
from sklearn.semi_supervised import LabelPropagation
label_prop_model = LabelPropagation()
label_prop_model.fit(vectors, food_type_labels)

LabelPropagation(gamma=20, kernel='rbf', max_iter=1000, n_jobs=None,
                 n_neighbors=7, tol=0.001)

In [52]:
new_labels = label_prop_model.predict(vectors)

In [54]:
(new_labels == 0).sum(), (new_labels == 1).sum()

(2216, 8)

In [55]:
for i in data:
    print(i)
    break

http://www.edamam.com/ontologies/edamam.owl#recipe_b1957a6a4025b25f6da6aef1fad452d4


In [32]:
# with open('/Users/ed/Development/Healthi/HealthieBackend/resources/data/recipe_collection.json', 'w') as fp:
#     json.dump(data, fp)