In [1]:
# Install the HTTP client that the recipe downloader further down uses to
# fetch recipe pages.
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
import recipe as r

def find_ingredient(name: str, ingredients: 'list[r.Ingredient]') -> list[int]:
    '''Finds the list of indices of the ingredients possibly being referenced.

    Candidates are ranked in three tiers, strongest first:

    1. Exact match on the ingredient name: returned immediately as the only
       result.
    2. ``name`` is a substring of the ingredient name: every such ingredient
       is returned, and weaker word-overlap matches are discarded.
    3. Word overlap: the number of words of ``name`` that occur (as
       substrings) in the ingredient name. Only ingredients sharing the best
       overlap are returned, and the overlap must cover at least half of the
       words.

    Args:
        name: Referenced ingredient text; one leading determiner ('the', 'a',
            'an') is ignored.
        ingredients: Objects exposing a ``name`` attribute.

    Returns:
        Indices into ``ingredients`` in ascending order; empty if ``name`` is
        blank or nothing matches.
    '''

    components = name.split()

    # Remove any determiner. The emptiness guard matters: indexing
    # components[0] on a blank name would raise IndexError.
    if components and components[0] in ('the', 'a', 'an'):
        del components[0]
        name = ' '.join(components)

    # Nothing left to look for (blank input or a lone determiner)
    if not components:
        return []

    substring_hits = []
    word_hits = []

    # A word-overlap match must cover at least half of the words
    best_overlap = len(components) / 2

    for i, ingredient in enumerate(ingredients):
        candidate = ingredient.name
        if name == candidate:
            # Return if an exact match
            return [i]
        if name in candidate:
            # Substring matches outrank every word-overlap match
            substring_hits.append(i)
        elif not substring_hits:
            # Count matching words; a strictly better score replaces the
            # current list and raises the bar, an equal score joins the list.
            overlap = sum(1 for wd in components if wd in candidate)
            if overlap > best_overlap:
                word_hits = [i]
                best_overlap = overlap
            elif overlap == best_overlap:
                word_hits.append(i)

    return substring_hits or word_hits

In [4]:
import re
import util as u

def parse_and_add_step(instr: str, recipe: r.Recipe) -> None:
    '''Parses an instruction into maybe more steps and adds to recipe.

    The instruction is split on '.', ';' and ', then' (each followed by
    whitespace). Every non-empty piece becomes one r.Step that is appended to
    recipe.steps after being filled with the methods, ingredients, tools,
    temperatures and times found by the SpaCy pipeline ``u.nlp``.

    Args:
        instr: Raw instruction text.
        recipe: Recipe to extend; ``recipe.steps`` is appended to in place.
    '''

    # Split recipe instruction into "sentences" for SpaCy parser
    steps = re.split(r'(?:\.|;|, then)\s+', instr)

    for text in steps:

        # Skip if empty
        if not text:
            continue

        # Form a nice sentence for the step text, and initialize step
        text = ''.join([text[0].upper(), text[1:]])
        if not text.endswith('.'):
            text = ''.join([text, '.'])
        # NOTE(review): a follow-up step is built from the previous step's
        # state object. Whether r.Step copies it is not visible here; confirm,
        # because state.remaining is mutated further down.
        if len(recipe.steps) > 0:
            step: r.Step = r.Step(text, recipe.steps[-1].state)
        else:
            step: r.Step = r.Step(text, r.IngredientState(recipe.ingredients))

        # Uncapitalize the first letter. This prevents SpaCy from reading the
        # first word as a proper noun (imperative sentences are less common in
        # its dataset).
        text = ''.join([text[0].lower(), text[1:]])

        # Based on the dependency + part of speech tagging, extract necessary
        # information.
        doc = u.nlp(text)
        for (i, token) in enumerate(doc):

            # If the parser interpreted an imperative sentence as an NP,
            # correct it: the first token of the clause (start of the text or
            # right after punctuation) becomes the root verb and the noun
            # becomes its direct object.
            if token.dep_ == 'ROOT' and token.head.pos_ == 'NOUN':
                for j in range(i,-1,-1):
                    if j == 0 or \
                        (j > 0 and doc[j-1].dep_ == 'punct'):
                        doc[j].dep_ = 'ROOT'
                        token.dep_ = 'dobj'
                        step.methods.append(doc[j].text)
                        break

            # Extract methods: the root, plus anything conjoined to a method
            # that was already recorded.
            if token.dep_ == 'ROOT':
                step.methods.append(token.text)
            elif token.dep_ == 'conj':
                if token.head.text in step.methods:
                    step.methods.append(token.text)

        # Extract ingredients, tools, and temps
        from_prev = ''
        for chunk in doc.noun_chunks:

            # If the root verb is mistakenly in the chunk, remove it. Guard
            # against an empty methods list before indexing it, and trim only
            # surrounding whitespace so the last character of the name is not
            # cut off.
            name = chunk.text
            if chunk.root.dep_ == 'ROOT' and step.methods \
                and step.methods[0] in name:
                name = name.partition(step.methods[0])[2].strip()
                if not name:
                    continue
            if from_prev:
                # Prepend the measure ("1/4 teaspoon") carried over from the
                # previous chunk.
                name = ' '.join([from_prev, name])
                from_prev = ''
            
            # If the noun chunk is not a dobj, pobj, conj, ROOT, or appos,
            # ignore it.
            if chunk.root.dep_ not in ['dobj', 'pobj', 'conj', 'ROOT', 'appos']:
                continue
            
            # Check the type of noun
            ntypes = u.NounType.from_str(chunk.root.text)
            if u.NounType.MEASURE in ntypes and \
                doc[chunk.start].ent_type_ == 'CARDINAL':
                # A bare quantity + unit: hold it for the next chunk
                from_prev = chunk.text
                continue
            if u.NounType.TOOL in ntypes:
                step.tools.append(chunk.text)
            elif u.NounType.TEMPERATURE in ntypes or \
                (chunk.end-chunk.start > 2 and doc[chunk.end-2].text == 'degrees'):
                step.temps.append(chunk.text.strip('().:,'))
            else:
                # Check if the referenced noun is an ingredient.
                ref_ingr = r.Ingredient.from_str(name)
                if ref_ingr.name:
                    ingr_inds = find_ingredient(ref_ingr.name, step.state.remaining)
                    # Assume ambiguous ingredient means inclusive
                    # offset compensates for entries deleted from remaining
                    # while walking the (ascending) index list.
                    offset = 0
                    for i in ingr_inds:
                        stocked = step.state.remaining[i-offset]
                        # Fall back to the stocked ingredient's values for
                        # anything the instruction text did not specify.
                        ingr = r.Ingredient(
                            ref_ingr.name if len(ingr_inds) == 1 \
                                else stocked.name,
                            ref_ingr.quantity if ref_ingr.quantity \
                                else stocked.quantity,
                            ref_ingr.unit if ref_ingr.unit \
                                else stocked.unit)
                        # Only do the bookkeeping when both amounts are known;
                        # an ingredient stocked without a quantity (e.g. "to
                        # taste") cannot be compared or decremented.
                        # NOTE(review): units are not reconciled before
                        # comparing quantities; confirm this is acceptable.
                        if ingr.quantity and stocked.quantity is not None:
                            if stocked.quantity <= ingr.quantity:
                                del step.state.remaining[i-offset]
                                offset += 1
                            else:
                                stocked.quantity -= ingr.quantity
                        step.ingredients.append(ingr)

        # Extract times
        for ent in doc.ents:
            if ent.label_ == 'TIME' or ent.label_ == 'DATE':
                step.times.append(ent.text)

        # Save step to recipe
        recipe.steps.append(step)

In [5]:
from html.parser import HTMLParser

class RecipeHTMLParser(HTMLParser):
    '''HTML parser that handles recipes'''

    def __init__(self, source: u.RecipeSource, convert_charrefs: bool = True) -> None:
        # Initialize class, setting recipe to empty
        self.source = source
        self.recipe = r.Recipe()
        self.current_tag = u.HTMLTag.UNKNOWN
        self.current_section = u.HTMLTag.UNKNOWN
        super().__init__(convert_charrefs=convert_charrefs)
    
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        # Save the current tag
        self.current_tag = u.HTMLTag.from_tag(self.source, tag, attrs)
        match self.current_tag:
            case u.HTMLTag.INGREDIENTS_LIST:
                self.current_section = u.HTMLTag.INGREDIENTS_LIST
            case u.HTMLTag.INGREDIENT:
                if self.current_section == u.HTMLTag.INGREDIENTS_LIST:
                    self.ingredient = r.Ingredient()
            case u.HTMLTag.STEPS_LIST:
                self.current_section = u.HTMLTag.STEPS_LIST
        return super().handle_starttag(tag, attrs)
    
    def handle_data(self, data: str) -> None:
        # Handle text between tags as appropriate
        match self.current_tag:
            case u.HTMLTag.TITLE:
                self.recipe.title = data.strip()
            case u.HTMLTag.OVERVIEW_LABEL:
                self.label = data.lower().strip(':,.! \n\t')
            case u.HTMLTag.OVERVIEW_TEXT:
                self.recipe.other[self.label] = data.strip()
            case u.HTMLTag.INGREDIENT_QUANTITY:
                if self.current_section == u.HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.quantity = u.str_to_fraction(data.strip())
            case u.HTMLTag.INGREDIENT_UNIT:
                if self.current_section == u.HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.unit = data.strip()
            case u.HTMLTag.INGREDIENT_NAME:
                if self.current_section == u.HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.name = data.strip()
                    self.recipe.ingredients.append(self.ingredient)
            case u.HTMLTag.STEP:
                if self.current_section == u.HTMLTag.STEPS_LIST:
                    parse_and_add_step(data.strip(), self.recipe)
        return super().handle_data(data)

    def handle_endtag(self, tag: str) -> None:
        # Reset tag
        self.current_tag = u.HTMLTag.UNKNOWN
        return super().handle_endtag(tag)

In [6]:
import re
import requests

def get_recipe_from_url(url: str, timeout: float = 30) -> r.Recipe | None:
    '''Downloads the page at a given URL and parses it into a recipe.

    Args:
        url: Recipe page address, with or without a scheme and 'www.' prefix.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The parsed recipe, or None if the URL's source is unsupported.
    '''

    # Find recipe source; return None if unsupported
    source = u.RecipeSource.from_url(url)
    if source == u.RecipeSource.UNKNOWN:
        return None

    # Normalize to 'https://www.<rest>'. Any scheme already present is removed
    # first, so 'http://www.x' or 'https://x' do not end up with a second
    # scheme glued in front of them.
    bare = re.sub(r'^[a-z][a-z0-9+.\-]*://', '', url.strip(), flags=re.IGNORECASE)
    if not re.match(r'www\.', bare, flags=re.IGNORECASE):
        bare = ''.join(['www.', bare])
    url = ''.join(['https://', bare])
    
    # Get the recipe from the page. The timeout keeps an unresponsive server
    # from blocking forever.
    with requests.get(url, timeout=timeout) as response:
        parser = RecipeHTMLParser(source)
        parser.feed(response.text)
        # Flush any text the parser is still buffering
        parser.close()
        return parser.recipe

In [7]:
# Fetch one recipe; swap in one of the commented URLs to try another page.
recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/19644/moussaka/")
# recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/218091/classic-and-simple-meat-lasagna/")
# recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/230238/gingerbread-men-cookies/")
print(recipe.title, '\n')

# Ingredient list: "<quantity> <unit> <name>", leaving out whatever is missing.
for ingr in recipe.ingredients:
    qty_text = f"{u.fraction_to_str(ingr.quantity)!s} " if ingr.quantity else ''
    unit_text = f"{ingr.unit!s} " if ingr.unit else ''
    print(qty_text, unit_text, ingr.name, sep='')
print()

# One line per parsed step with everything that was extracted from it.
for step in recipe.steps:
    used = [(item.name, item.quantity) for item in step.ingredients]
    print(step.text, step.methods, used, step.tools, step.times, step.temps)

Moussaka 

3 eggplants, peeled and cut lengthwise into 1/2 inch thick slices
salt to taste
1/4 cup olive oil
1 tablespoon butter
1 pound lean ground beef
2 onions, chopped
1 clove garlic, minced
ground black pepper to taste
2 tablespoons dried parsley
1/2 teaspoon fines herbs
1/4 teaspoon ground cinnamon
1/2 teaspoon ground nutmeg, divided
1 (8 ounce) can tomato sauce
1/2 cup red wine
1 egg, beaten
4 cups milk
1/2 cup butter
6 tablespoons all-purpose flour
ground white pepper, to taste
1 1/2 cups freshly grated Parmesan cheese

Lay eggplant slices on paper towels. ['lay'] [('eggplant slices', Fraction(3, 1))] [] [] []
Sprinkle lightly with salt. ['sprinkle'] [('salt', None)] [] [] []
Let sit for 30 minutes to draw out moisture. ['sit'] [] [] ['30 minutes'] []
Pat dry with paper towels. ['dry'] [] [] [] []
Warm olive oil in a skillet over high heat. ['warm'] [('warm olive oil', Fraction(1, 4))] ['a skillet'] [] ['high heat']
Fry eggplant until browned, 2 to 3 minutes per side. ['fry'] [

In [8]:
# Sentence to inspect; uncomment exactly one alternative to try another.
# text = "lay eggplant slices on paper towels"
text = "eggplant slices"
# text = "cook and stir until beef is browned, 8 to 10 minutes"
# text = "scald milk in a saucepan over medium heat"
# text = "meanwhile, place ground beef, garlic, oregano, garlic powder, salt, and black pepper in a large skillet over medium heat"
# text = "add lasagna noodles and cook for 10 minutes or until al dente"
# text = "season béchamel sauce with salt and white pepper"
# text = "serve hot and enjoy!"
# text = "bake in the preheated oven until edges of cookies are set and just begin to brown, 8 to 10 minutes"
# text = "pour béchamel sauce on top and sprinkle with remaining 1/4 teaspoon nutmeg"
# text = "cover with remaining eggplant and sprinkle another 1/2 cup cheese on top"
# text = "add parsley, fines herbs, cinnamon, and 1/4 teaspoon nutmeg"

doc = u.nlp(text)

# Per token: text, POS, dependency, head text, head POS, children, entity type
for token in doc:
    token_row = (token.text, token.pos_, token.dep_, token.head.text,
                 token.head.pos_, list(token.children), token.ent_type_)
    print(*token_row)
print()

# Per noun chunk: text, root text, root dependency, root's head, span bounds,
# root index
for chunk in doc.noun_chunks:
    chunk_row = (chunk.text, chunk.root.text, chunk.root.dep_,
                 chunk.root.head.text, chunk.start, chunk.end, chunk.root.i)
    print(*chunk_row)
print()

# Named entities and their labels
for ent in doc.ents:
    print(ent.text, ent.label_)

eggplant NOUN compound slices NOUN [] 
slices NOUN ROOT slices NOUN [eggplant] 

eggplant slices slices ROOT slices 0 2 1



In [9]:
# Parse a bare ingredient phrase and show quantity ('-' if none), unit, name.
ingr = r.Ingredient.from_str("eggplant slices")
if ingr.name:
    qty_display = u.fraction_to_str(ingr.quantity) if ingr.quantity else '-'
    print(qty_display, ingr.unit, ingr.name)

- None eggplant slices


In [11]:
from nltk.corpus import wordnet as wn

def _print_synset_paths(word: str, pos) -> None:
    '''Prints the WordNet synsets of ``word`` for ``pos`` and each synset's
    hypernym paths, followed by a blank line.'''
    synsets = wn.synsets(word, pos)
    print(synsets)
    for synset in synsets:
        print(synset.hypernym_paths())
    print()

def print_hypernym_paths(noun: str) -> None:
    '''Prints the noun synsets of ``noun`` and their hypernym paths.'''
    _print_synset_paths(noun, wn.NOUN)

def print_verb_paths(verb: str) -> None:
    '''Prints the verb synsets of ``verb`` and their hypernym paths.'''
    _print_synset_paths(verb, wn.VERB)

# Hypernym paths for a few cooking verbs
for verb in ("scald", "bake", "saute"):
    print_verb_paths(verb)

# Noun-type classification for a few sample words
for noun in ("season", "tin", "mixer", "men"):
    print(u.NounType.from_str(noun))

[Synset('blister.v.02'), Synset('scald.v.02'), Synset('scald.v.03'), Synset('scald.v.04')]
[[Synset('express.v.02'), Synset('state.v.01'), Synset('note.v.01'), Synset('comment.v.01'), Synset('knock.v.06'), Synset('attack.v.02'), Synset('blister.v.02')]]
[[Synset('change.v.01'), Synset('affect.v.01'), Synset('process.v.01'), Synset('scald.v.02')]]
[[Synset('change.v.01'), Synset('heat.v.01'), Synset('scald.v.03')]]
[[Synset('change.v.01'), Synset('damage.v.01'), Synset('burn.v.15'), Synset('scald.v.04')]]

[Synset('bake.v.01'), Synset('bake.v.02'), Synset('broil.v.02'), Synset('bake.v.04')]
[[Synset('change.v.02'), Synset('change_integrity.v.01'), Synset('cook.v.03'), Synset('bake.v.01')]]
[[Synset('make.v.03'), Synset('create_from_raw_material.v.01'), Synset('bake.v.02')]]
[[Synset('change.v.01'), Synset('heat.v.01'), Synset('broil.v.02')]]
[[Synset('be.v.01'), Synset('bake.v.04')]]

[Synset('saute.v.01')]
[[Synset('change.v.02'), Synset('change_integrity.v.01'), Synset('cook.v.03'), S