In [1]:
import json
import nltk
import inflect
import tqdm
import collections
engine = inflect.engine()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/luke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/luke/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
recipes = json.load(open("../data/1M/recipe1M_layers/layer1.json"))
det_ingr = json.load(open("../data/1M/det_ingr.json"))

In [3]:
def string_to_float(x) :
    try :
        x = x.split('/')
        if len(x) > 1 :            
            return (float(x[0])/float(x[1]))
        else :
            return float(x[0])
    except :
        raise ValueError("not possible to cast ", x, "to float")
    
def string_to_frac(x) :
    try :
        if '/' in x:
            return string_to_float(x)
        else :
            return(float(x[0]) / float(x[1:]))
    except :
        raise ValueError("not possible to cast ", x, "to float")
    
def singularize(x):
    ingr = engine.singular_noun(x)
    return x if (not ingr) else ingr

def fmt_unit(x) :
    return singularize(x.lower())

In [104]:
units = ["bushel", "cup", "dash", "drop", "fl." "oz", "g", "gallon", "glass",
    "kg", "liter", "ml", "ounce", "c.", "pinch", "pint", "pound", "lb", "quart",
    "scoop", "shot", "tablespoon", "teaspoon", "tsp", "tbsp"]

def extract_quantity(x, recipe_index, ingredient_index) : 
    try :
        ingr  = det_ingr[recipe_index]['ingredients'][ingredient_index]['text']
        ingr_first_word = ingr.split(" ")[0]

        if ((len(tags) >= 2) and (tags[0][1] == 'CD')) :

            #nb (nb+ unit) ing
            if ((tags[1][0] == '(') and (len(tags)>6)):
                idx_par = tags.index((')', ')'))            
                quant_in = tags[2:idx_par-1]
                quant=0

                #nb nb 
                if len(quant_in) == 2 :
                    quant = (string_to_float(quant_in[0][0]) + string_to_frac(quant_in[1][0])) / 2
                elif len(quant_in) == 1:
                    quant = string_to_float(quant_in[0][0])

                else :
                    return None

                unit = fmt_unit(tags[idx_par-1][0])            
                return (quant, unit, ingr)


            #nb+ [unit] ing
            else :
                tag1_nb = tags[1][1] == 'CD'
                tag1_to = tags[1][0] == 'to'
                tag1_unit = fmt_unit(tags[1][0]) in units
                tag1_starts_minus = tags[1][0][0] == '-'

                #nb unit ing
                if (tag1_unit) :
                    return (string_to_float(tags[0][0]), fmt_unit(tags[1][0]), ingr)

                #nb nb ...
                elif tag1_nb : 
                    first_nb = string_to_float(tags[0][0]) + string_to_frac(tags[1][0])

                    #nb nb unit ing
                    if fmt_unit(tags[2][0]) in units :
                        return (first_nb, fmt_unit(tags[2][0]), ingr)

                    #nb nb to nb ...
                    elif tags[2][0] == 'to':

                        #nb nb to nb unit ing
                        if fmt_unit(tags[4][0]) in units :
                            return ((first_nb + string_to_float(tags[3][0])) / 2,  fmt_unit(tags[4][0]), ingr)

                        #nb nb to nb nb ...
                        elif tags[4][1] == 'CD' :

                            second_nb = string_to_float(tags[3][0]) + string_to_frac(tags[4][0])
                            average_qt = (first_nb + second_nb) / 2

                            #nb nb to nb nb unit ing
                            if fmt_unit(tags[5][0]) in units :
                                return (average_qt, fmt_unit(tags[5][0]), ingr)

                            #nb nb to nb nb ing
                            else :
                                return (average_qt, "", ingr)

                        #nb nb to nb ing
                        else :
                            return ((first_nb + string_to_float(tags[4][0])) / 2, "", ingr)

                    #nb nb ing
                    else :
                         return (first_nb, "", ingr)   



                #nb -nb ...
                elif tag1_starts_minus :
                    first_nb = (string_to_float(tags[0][0]) + string_to_float(tags[1][0][1:]))/2

                    #nb -nb unit ing
                    if fmt_unit(tags[2][0]) in units :
                        return (first_nb, fmt_unit(tags[2][0]), ingr)

                    #nb -nb ing
                    else :
                        return (first_nb, "", ingr)

                #nb to nb ...
                elif (tag1_to):
                    first_nb = string_to_float(tags[0][0])

                    #nb to nb nb ...
                    if (tags[3][1] == 'CD') :
                        second_nb = string_to_float(tags[2][0]) + string_to_frac([3][0])
                        avg_qt = (first_nb + second_nb) / 2

                        #nb to nb nb unit ing
                        if fmt_unit(tags[4][0]) in units :
                            return (avg_qt, fmt_unit(tags[4][0]), ingr)

                        #nb to nb nb ing
                        else :
                            return (avg_qt, "", ingr)

                    #nb to nb unit ing
                    elif (fmt_unit(tags[3][0]) in units) :
                        second_nb = string_to_float(tags[2][0])
                        avg_qt = (first_nb + second_nb) / 2
                        return (avg_qt, fmt_unit(tags[3][0]), ingr)

                    #nb to nb ing
                    else :
                        second_nb = string_to_float(tags[2][0])
                        avg_qt = (first_nb + second_nb) / 2
                        return (avg_qt, "", ingr)
                #nb ing 
                else :
                    return (string_to_float(tags[0][0]), "", ingr)
    except :
        return None

In [120]:
measurable_indices = []
all_extracted = []
unit_ing = collections.Counter()
measurables = 0
nb_to_try = 10000

for e, r in tqdm.tqdm_notebook(enumerate(recipes[:nb_to_try])) :
    
    extracted = []
    contains_immeasurable = False
    
    for i, ingredient in enumerate(r['ingredients']) :
        tags = nltk.pos_tag(nltk.word_tokenize(ingredient['text']))
        a = extract_quantity(tags, e, i)
        ing_to_detect = " ".join(singularize(c) for c in det_ingr[e]['ingredients'][i]['text'].split(" "))
        
        if a is not None and a[1] != "":
            extracted.append(a)
            
        else :
            if a is not None and a[1] == "" and singularize(ingredient['text'].split(" ")[1]) == ing_to_detect.split(" ")[0] :
                unit_ing.update([ing_to_detect])
            else :
                contains_immeasurable = True
            
    if not contains_immeasurable :
        all_extracted.append(extracted)
        measurable_indices.append(e)
        measurables+=1
            
            
print(str(measurables) + " recipes contained only mesurable ingredients => " + str(100*measurables/nb_to_try) + "%")


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


2242 recipes contained only mesurable ingredients => 22.42%


In [83]:
usda_mappable_indices = []
usda_mappable = 0

for index in tqdm.tqdm_notebook(measurable_indices) :
    try :
        det_ingr[index]['valid'].index(False)
    except :
        usda_mappable += 1
        usda_mappable_indices.append(index)

print(str(usda_mappable) + " recipes contained only mappable ingredients => " + str(100*usda_mappable/len(measurable_indices)) + "%")


HBox(children=(IntProgress(value=0, max=115807), HTML(value='')))

107977 recipes contained only mappable ingredients => 93.23875068001071%
