In [16]:
import json
import nltk
import inflect
import tqdm
import time
import requests 
from bs4 import BeautifulSoup as bs
import gensim
import urllib.request
import collections
engine = inflect.engine()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/luke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/luke/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
recipes = json.load(open("../data/1M/recipe1M_layers/layer1.json"))
det_ingr = json.load(open("../data/1M/det_ingr.json"))

In [3]:
def string_to_float(x) :
    try :
        x = x.split('/')
        if len(x) > 1 :            
            return (float(x[0])/float(x[1]))
        else :
            return float(x[0])
    except :
        raise ValueError("not possible to cast ", x, "to float")
    
def string_to_frac(x) :
    try :
        if '/' in x:
            return string_to_float(x)
        else :
            return(float(x[0]) / float(x[1:]))
    except :
        raise ValueError("not possible to cast ", x, "to float")
    
def singularize(x):
    ingr = engine.singular_noun(x)
    return x if (not ingr) else ingr

def fmt_unit(x) :
    return singularize(x.lower())

In [4]:
units = ["bushel", "cup", "dash", "drop", "fl." "oz", "g", "cc", "gram", "gallon", "glass",
    "kg", "liter", "ml", "ounce", "c.", "pinch", "pint", "pound", "lb", "quart",
    "scoop", "shot", "tablespoon", "teaspoon", "tsp", "tbsp"]

def extract_quantity(tags, recipe_index, ingredient_index) : 
    try :
        ingr  = det_ingr[recipe_index]['ingredients'][ingredient_index]['text']
        ingr_first_word = ingr.split(" ")[0]

        if ((len(tags) >= 2) and (tags[0][1] == 'CD')) :

            #nb (nb+ unit) ing
            if ((tags[1][0] == '(') and (len(tags)>6)):
                idx_par = tags.index((')', ')'))            
                quant_in = tags[2:idx_par-1]
                quant=0

                #nb nb 
                if len(quant_in) == 2 :
                    quant = (string_to_float(quant_in[0][0]) + string_to_frac(quant_in[1][0])) / 2
                elif len(quant_in) == 1:
                    quant = string_to_float(quant_in[0][0])

                else :
                    return None

                unit = fmt_unit(tags[idx_par-1][0])            
                return (quant, unit, ingr)


            #nb+ [unit] ing
            else :
                tag1_nb = tags[1][1] == 'CD'
                tag1_to = tags[1][0] == 'to'
                tag1_unit = fmt_unit(tags[1][0]) in units
                tag1_starts_minus = tags[1][0][0] == '-'

                #nb unit ing
                if (tag1_unit) :
                    return (string_to_float(tags[0][0]), fmt_unit(tags[1][0]), ingr)

                #nb nb ...
                elif tag1_nb : 
                    first_nb = string_to_float(tags[0][0]) + string_to_frac(tags[1][0])

                    #nb nb unit ing
                    if fmt_unit(tags[2][0]) in units :
                        return (first_nb, fmt_unit(tags[2][0]), ingr)

                    #nb nb to nb ...
                    elif tags[2][0] == 'to':

                        #nb nb to nb unit ing
                        if fmt_unit(tags[4][0]) in units :
                            return ((first_nb + string_to_float(tags[3][0])) / 2,  fmt_unit(tags[4][0]), ingr)

                        #nb nb to nb nb ...
                        elif tags[4][1] == 'CD' :

                            second_nb = string_to_float(tags[3][0]) + string_to_frac(tags[4][0])
                            average_qt = (first_nb + second_nb) / 2

                            #nb nb to nb nb unit ing
                            if fmt_unit(tags[5][0]) in units :
                                return (average_qt, fmt_unit(tags[5][0]), ingr)

                            #nb nb to nb nb ing
                            else :
                                return (average_qt, "", ingr)

                        #nb nb to nb ing
                        else :
                            return ((first_nb + string_to_float(tags[4][0])) / 2, "", ingr)

                    #nb nb ing
                    else :
                         return (first_nb, "", ingr)   



                #nb -nb ...
                elif tag1_starts_minus :
                    first_nb = (string_to_float(tags[0][0]) + string_to_float(tags[1][0][1:]))/2

                    #nb -nb unit ing
                    if fmt_unit(tags[2][0]) in units :
                        return (first_nb, fmt_unit(tags[2][0]), ingr)

                    #nb -nb ing
                    else :
                        return (first_nb, "", ingr)

                #nb to nb ...
                elif (tag1_to):
                    first_nb = string_to_float(tags[0][0])

                    #nb to nb nb ...
                    if (tags[3][1] == 'CD') :
                        second_nb = string_to_float(tags[2][0]) + string_to_frac([3][0])
                        avg_qt = (first_nb + second_nb) / 2

                        #nb to nb nb unit ing
                        if fmt_unit(tags[4][0]) in units :
                            return (avg_qt, fmt_unit(tags[4][0]), ingr)

                        #nb to nb nb ing
                        else :
                            return (avg_qt, "", ingr)

                    #nb to nb unit ing
                    elif (fmt_unit(tags[3][0]) in units) :
                        second_nb = string_to_float(tags[2][0])
                        avg_qt = (first_nb + second_nb) / 2
                        return (avg_qt, fmt_unit(tags[3][0]), ingr)

                    #nb to nb ing
                    else :
                        second_nb = string_to_float(tags[2][0])
                        avg_qt = (first_nb + second_nb) / 2
                        return (avg_qt, "", ingr)
                #nb ing 
                else :
                    return (string_to_float(tags[0][0]), "", ingr)
    except :
        return None

In [33]:
measurable_indices = []
all_extracted = []
unit_ing = collections.Counter()
measurables = 0
nb_to_try = 10000
sizes = ['large', 'medium', 'small']

for e, r in tqdm.tqdm_notebook(enumerate(recipes[:nb_to_try])) :
    
    extracted = []
    contains_immeasurable = False
    
    for i, ingredient in enumerate(r['ingredients']) :
        
        #remove sizes        
        ingredient['text'] = " ".join([c for c in ingredient['text'].split(" ") if (not c.lower().strip() in sizes)])
        
        #tag the ingredient definition
        tags = nltk.pos_tag(nltk.word_tokenize(ingredient['text']))
        
        ing_to_detect = " ".join([singularize(c) for c in det_ingr[e]['ingredients'][i]['text'].split(" ")])

        a = extract_quantity(tags, e, i)

        if a is not None and a[1] != "":
            extracted.append(a)

        else :
            if ((a is not None) \
                and (a[1] == "") \
                and (len(ingredient['text'].split(" ")) > 2) \
                and (len(ing_to_detect.split(" ")) > 0) \
                and (singularize(ingredient['text'].split(" ")[1]).strip() == ing_to_detect.split(" ")[0].strip())) :
                unit_ing.update([ing_to_detect])
            
            else :
                
                ing_is_salt = ((('salt', 'NN') in tags) or (('salt', 'NNP') in tags) or (('Salt', 'NN') in tags) or (('Salt', 'NNP') in tags)) 
                
                if not ing_is_salt :
                    contains_immeasurable = True
                elif ing_is_salt :                    
                    extracted.append((2.5, 'g', 'kosher salt'))            
            
    if not contains_immeasurable :
        all_extracted.append(extracted)
        measurable_indices.append(e)
        measurables+=1
            
            
print(str(measurables) + " recipes contained only mesurable ingredients => " + str(100*measurables/nb_to_try) + "%")
json.dump(measurable_indices, open("../generated/1m_measurable_indices.json", 'w'))
json.dump(all_extracted, open("../generated/1m_extracted_quantities.json", 'w'))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


2143 recipes contained only mesurable ingredients => 21.43%


## ingredients whose weight should be computed by hand

In [2]:
find_unit_ing = json.load(open("./../generated/find_unit_quantity.json"))
find_unit_ing = sorted([(k, find_unit_ing[k]) for k in find_unit_ing], key= lambda x : x[1], reverse=True)

### scrap quantities from some website

In [13]:
found = []

for ing in tqdm.tqdm_notebook(find_unit_ing[0:200]) :
    
    quantity = 0
    unit = ""
    
    try :
        
        url_sing = "https://hannaone.com/Recipe/weight" + str(ing[0]).replace(" ", "") +".html"
        url_plur = "https://hannaone.com/Recipe/weight" + str(ing[0]).replace(" ", "") +"s.html"
        
        try :
            req = urllib.request.Request(url_sing, headers={'User-Agent': 'Mozilla/5.0'})
            html = urllib.request.urlopen(req)
            res = bs(html.read(),"html5lib")
            trs = res.find("table").findAll("tr")
        except :
            try :
                req = urllib.request.Request(url_plur, headers={'User-Agent': 'Mozilla/5.0'})
                html = urllib.request.urlopen(req)
                res = bs(html.read(),"html5lib")
                trs = res.find("table").findAll("tr")
            except Exception as e:
                raise ValueError('The ingredient has no matching url')
        
        found_medium = False
        found_some = False
        to_append = []
        
        for tr in trs[1:] :
            td = tr.find("td")
            
            if td is not None :

                if (('large' in td.text.lower() and (not found_medium)) or (('medium' in td.text.lower()) and not found_medium)) :
                    tds = tr.findAll("td")

                    weight = tds[2].text.split("-")[-1].strip()

                    if 'g' in weight :
                        quantity  = float(weight[:-1])
                        unit = 'g'
                        to_append = (ing[0], (quantity, unit))
                        found_some = True

                    elif 'oz'in weight :
                        quantity  = float(weight[:-2])
                        unit = 'oz'
                        to_append = (ing[0], (quantity, unit))
                        found_some = True


                    if td.text.lower() == 'medium' :
                        found_medium = True  
                    
        if found_some :
            found.append(to_append)
            
            if found_medium :
                print(ing[0], ", medium : ", to_append)
            else :
                print(ing[0], ", large : ", to_append)



             
    except Exception as e: 
        pass

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

egg , medium :  ('egg', (44.0, 'g'))
onion , medium :  ('onion', (110.0, 'g'))
lemon , large :  ('lemon', (84.0, 'g'))
green onion , medium :  ('green onion', (15.0, 'g'))
tomato , large :  ('tomato', (20.0, 'g'))
orange , medium :  ('orange', (131.0, 'g'))
banana , medium :  ('banana', (118.0, 'g'))
potato , medium :  ('potato', (213.0, 'g'))
apple , medium :  ('apple', (182.0, 'g'))
cucumber , large :  ('cucumber', (35.0, 'g'))
radish , medium :  ('radish', (4.5, 'g'))
cantaloupe , medium :  ('cantaloupe', (552.0, 'g'))


### quantities found
We found 6% of the ingredients which is a poor result

In [15]:
len(found)

12

In [None]:
weights = [('egg', (44, 'g')),
 ('garlic clove', (3, 'g')),
 ('onion', (110, 'g')),
 ('lemon', (84, 'g')),
 ('bay leaf', 22146),
 ('carrot', 16888),
 ('green onion', (15, 'g')),
 ('egg yolk', 13813),
 ('egg white', 13262),
 ('red bell pepper', 10331),
 ('lime', 10260),
 ('tomato', (20, 'g')),
 ('scallion', 10112),
 ('shallot', 8168),
 ('celery rib', 7682),
 ('red onion', 7308),
 ('green bell pepper', 6646),
 ('orange', (131, 'g')),
 ('green pepper', 6230),
 ('banana', (118, 'g')),
 ('boneles skinles chicken breast', 5562),
 ('avocado', 5294),
 ('cinnamon stick', 4830),
 ('jalapeno pepper', 4723),
 ('chicken breast', 4661),
 ('red pepper', 4646),
 ('potato', (213, 'g')),
 ('apple', (182, 'g')),
 ('jalapeno', 3993),
 ('cucumber', (35, 'g')),
 ('zucchini', 3706),
 ('vanilla bean', 3436),
 ('leek', 3386),
 ('whole clove', 3214),
 ('flour tortilla', 3016),
 ('boneles skinles chicken breast half', 2905),
 ('yellow onion', 2894),
 ('plum tomato', 2889),
 ('bell pepper', 2801),
 ('corn tortilla', 2383),
 ('yellow bell pepper', 2097),
 ('clove', 1999),
 ('chicken bouillon cube', 1962),
 ('hamburger bun', 1834),
 ('spring onion', 1834),
 ('cherry tomato', 1811),
 ('mango', 1698),
 ('pork chop', 1661),
 ('rom tomato', 1653),
 ('white onion', 1630)]

### recipes with both quantities and usda id for all ingredients

In [6]:
usda_mappable_indices = []
usda_mappable = 0

for index in tqdm.tqdm_notebook(measurable_indices) :
    try :
        det_ingr[index]['valid'].index(False)
    except :
        usda_mappable += 1
        usda_mappable_indices.append(index)

print(str(usda_mappable) + " recipes contained only mappable ingredients => " + str(100*usda_mappable/len(measurable_indices)) + "%")


HBox(children=(IntProgress(value=0, max=2842), HTML(value='')))


2757 recipes contained only mappable ingredients => 97.009148486981%


### recipes with usda id for all ingredients

In [8]:
usda_no_quant_indices = []
usda_no_quant = 0

for index in tqdm.tqdm_notebook(range(len(recipes))) :
    try :
        det_ingr[index]['valid'].index(False)
    except :
        usda_no_quant += 1
        usda_no_quant_indices.append(index)

print(str(usda_no_quant) + " recipes contained mappable ingredients and no quant => " + str(100*usda_no_quant/len(recipes)) + "%")


HBox(children=(IntProgress(value=0, max=1029720), HTML(value='')))


869656 recipes contained mappable ingredients and no quant => 84.45558015771277%


### food embeddings

In [13]:
def clean_ing(ing) : return " ".join([singularize(w) for w in ing.split(" ")]).lower()

embeddings_train = []

for index in tqdm.tqdm_notebook(usda_no_quant_indices) :
    r = det_ingr[index]
    embeddings_train.append([clean_ing(ing['text']) for e, ing in enumerate(r['ingredients']) if r['valid'][e]])
    
#print(embeddings_train)

HBox(children=(IntProgress(value=0, max=869656), HTML(value='')))




Train Word2Vec model

In [17]:
start_time = time.time()
model = gensim.models.Word2Vec(embeddings_train, min_count=2)

time_after_creation = time.time()
print("creation of model :", time_after_creation - start_time, "sec.")

model.train(embeddings_train, total_examples=len(embeddings_train), epochs=10)

time_after_training = time.time()
print("model training :", time_after_training - time_after_creation, "sec.")

model.save('food_embeddings.bin')

time_after_saving = time.time()
print("model saving :", time_after_saving - time_after_training, "sec.")

creation of model :  22.808128356933594
model training :  50.59020781517029
model saving :  0.1352684497833252


Results investigation 

In [20]:
model.wv.most_similar("orange juice")

[('fresh orange juice', 0.7791991233825684),
 ('orange juice concentrate', 0.7678360939025879),
 ('frozen orange juice concentrate', 0.7533652782440186),
 ('unsweetened orange juice', 0.6837921142578125),
 ('apricot nectar', 0.549423336982727),
 ('blood orange juice', 0.5472801923751831),
 ('grapefruit peel', 0.5405464768409729),
 ('orange marmalade', 0.5188551545143127),
 ('frozen concentrated orange juice', 0.5104542374610901),
 ('mandarin orange juice', 0.4832901656627655)]

In [24]:
model.wv.similarity(w1="lamb", w2="beef")

0.5978874

In [26]:
 model.most_similar(positive=['rice', 'tomato'], negative=['pasta'], topn=1)

  """Entry point for launching an IPython kernel.


[('chopped tomato', 0.6313924193382263)]