In [1]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
from fractions import Fraction
import unicodedata

def str_to_fraction(data: str):
    '''Parses a quantity string into a Fraction.

    The whitespace-separated parts of ``data`` are each parsed as a Fraction
    and added together, so "1 1/2" gives 3/2. Precomposed vulgar fractions
    ("½") and the Unicode fraction slash ("1⁄2") are accepted, including a
    vulgar fraction written directly after a whole number ("1½").

    Args:
        data: Quantity text, e.g. "2", "3/4", "1 ½" or "1½".

    Returns:
        The sum of all parts; ``Fraction(0)`` for an empty string.

    Raises:
        ValueError: If a part is not a valid number or fraction.
    '''
    fraction_slash = '\u2044'

    # NFKD turns '½' into '1⁄2'. Without a separator, "1½" would become
    # "11⁄2" and be read as eleven halves, so put a space in front of every
    # precomposed fraction character before normalizing. Only the left side
    # is padded so that characters meant to be followed by a denominator
    # (e.g. '⅟') keep working.
    pieces = []
    for ch in data:
        if ch != fraction_slash and \
                fraction_slash in unicodedata.normalize('NFKD', ch):
            pieces.append(' ')
        pieces.append(ch)

    table = str.maketrans({fraction_slash: '/'})
    normalized = unicodedata.normalize('NFKD', ''.join(pieces))

    total = Fraction()
    for val in normalized.translate(table).split():
        total += Fraction(val)
    return total

def fraction_to_str(frac: Fraction):
    '''Formats a Fraction for display, using a mixed number when possible.

    Whole values are shown as a plain integer ("2"), values whose numerator
    is at least the denominator as "whole remainder" ("1 1/2"), and
    everything else as the plain fraction ("1/4").
    '''
    num, den = frac.numerator, frac.denominator

    # Whole number: no fractional part to show.
    if den == 1:
        return str(num)

    # Numerator below the denominator: nothing to split off.
    if num < den:
        return str(frac)

    whole, rest = divmod(num, den)
    return f'{whole} {str(Fraction(rest, den))}'

In [4]:
class Ingredient:
    '''Struct holding ingredient information'''

    def __init__(self, name: str | None = None,
                 quantity: Fraction | None = None,
                 unit: str | None = None):
        self.name = name
        '''Name of the ingredient, e.g. salt'''
        self.quantity = quantity
        '''Quantity of the ingredient, e.g. 1/2'''
        self.unit = unit
        '''Unit of the ingredient, e.g. tsp'''

class IntermediateIngredient:
    '''Record for a combination of ingredients made along the way, e.g. dough.

    Attributes:
        name: Name of the intermediate ingredient, if one has been assigned
            (e.g. dough); starts out as None.
        ingredients: Original ingredients involved, e.g. [flour, eggs].
    '''

    def __init__(self, ingredients: list[Ingredient]):
        # No name is known at construction time; it is assigned later, if ever.
        self.name: str | None = None
        # The list is stored as passed in (it is not copied).
        self.ingredients = ingredients

class IngredientState:
    '''Record of the state of the ingredients at a given step.

    Attributes:
        remaining: Ingredients that have not been used yet.
        intermediate: Intermediate collections of ingredients.
        focus: Index of the currently referenced intermediate ingredient
            (defaults to -1).

    Both lists are shallow-copied on construction, so adding to or removing
    from a state never changes the lists that were passed in.
    '''

    # The signature annotations are forward references (strings), so they are
    # not evaluated when the class is defined.
    def __init__(self, remaining: 'list[Ingredient]',
                 intermediate: 'list[IntermediateIngredient] | None' = None,
                 focus: int = -1):
        self.remaining: list[Ingredient] = list(remaining)
        # None (rather than a shared ``[]`` default) stands for "no
        # intermediate ingredients yet".
        self.intermediate: list[IntermediateIngredient] = \
            [] if intermediate is None else list(intermediate)
        self.focus = focus

class Step:
    '''Record describing one step of a recipe.

    Attributes:
        text: Text associated with the step.
        ingredients: Ingredients used in this step.
        state: State of the ingredients at this step.
        tools: Tools mentioned in this step.
        methods: Methods mentioned in this step.
        times: Times mentioned in this step.
        temps: Temperatures / measures of "doneness" mentioned in this step.
    '''

    def __init__(self, text: str, init_state: IngredientState):
        self.text: str = text
        self.ingredients: list[Ingredient] = list()
        # The state object is stored as passed in; no copy is made here.
        self.state: IngredientState = init_state
        # Everything extracted from the text starts out empty.
        self.tools: list[str] = list()
        self.methods: list[str] = list()
        self.times: list[str] = list()
        self.temps: list[str] = list()
    
class Recipe:
    '''Record describing a whole recipe.

    Attributes:
        title: Title of the recipe.
        ingredients: Ingredients used in the recipe.
        tools: Tools used in the recipe.
        steps: Steps of the recipe, in order of insertion.
        other: Other miscellaneous recipe information, keyed by label.
    '''

    def __init__(self):
        # A new recipe is completely empty; callers fill it in afterwards.
        self.title: str = str()
        self.ingredients: list[Ingredient] = list()
        self.tools: list[str] = list()
        self.steps: list[Step] = list()
        self.other: dict[str, str] = dict()

In [20]:
def find_ingredient(name: str, ingredients: 'list[Ingredient]') -> list[int]:
    '''Finds the indices of the ingredients possibly being referenced.

    A leading determiner ("the", "a", "an") is dropped from ``name`` first.
    Candidates are then ranked, best first:

    1. An ingredient whose name equals ``name``: returned immediately on its
       own.
    2. Ingredients whose name contains ``name`` as a substring.
    3. Ingredients whose name contains the most words of ``name``, provided
       at least half of the words are found.

    Only the best-ranked candidates are returned; ties are all kept.

    Args:
        name: The noun phrase to look up.
        ingredients: Objects with a ``name`` attribute; entries whose name is
            None or empty are ignored.

    Returns:
        Ascending indices into ``ingredients``; empty if nothing matches or
        ``name`` has no words left after removing the determiner.
    '''
    components = name.split()

    # Remove any determiner
    if components and components[0].lower() in ('the', 'a', 'an'):
        del components[0]

    # If nothing is left to look for, return
    if not components:
        return []
    name = ' '.join(components)

    # A word-overlap candidate must match at least half of the words. A
    # substring match scores higher than any possible word overlap.
    best_score = len(components) / 2
    substring_score = len(components) + 1
    ingr_inds: list[int] = []

    for i, ingredient in enumerate(ingredients):
        ingr_name = ingredient.name
        if not ingr_name:
            continue

        if name == ingr_name:
            # Return if an exact match
            return [i]

        if name in ingr_name:
            score = substring_score
        else:
            score = sum(1 for wd in components if wd in ingr_name)

        if score > best_score:
            # Strictly better: forget weaker candidates and raise the bar
            best_score = score
            ingr_inds = [i]
        elif score == best_score:
            ingr_inds.append(i)

    return ingr_inds

In [None]:
import re
import spacy
nlp = spacy.load("en_core_web_md")

def parse_and_add_step(instr: str, recipe: Recipe) -> None:
    '''Parses an instruction into one or more steps and adds them to recipe.

    The instruction is split on ". ", "; " and ", then " boundaries. Each
    non-empty fragment becomes a Step that is filled in from the SpaCy parse
    produced by the module-level ``nlp`` pipeline:

    * methods: the root verb, plus verbs conjoined to a recorded method;
    * ingredients: noun chunks that are a dobj, pobj or conj and that
      ``find_ingredient`` matches against the still-unused ingredients;
    * times: entities labelled TIME or DATE.

    Every step gets its own IngredientState, seeded from the state left by
    the previous step (or from ``recipe.ingredients`` for the first step), so
    using an ingredient in one step does not alter the state recorded for
    earlier steps.

    Args:
        instr: Raw instruction text.
        recipe: Recipe to extend; steps are appended to ``recipe.steps``.
    '''

    # Split recipe instruction into "sentences" for SpaCy parser
    fragments = re.split(r'(?:\.|;|, then)\s+', instr)

    for text in fragments:

        # Skip if empty
        text = text.strip()
        if not text:
            continue

        # Form a nice sentence for the step text
        text = text[0].upper() + text[1:]
        if not text.endswith(('.', '!', '?')):
            text += '.'

        # Give the step its own state so that consuming ingredients here
        # does not rewrite the state stored on previous steps.
        if recipe.steps:
            prev = recipe.steps[-1].state
            state = IngredientState(prev.remaining, prev.intermediate,
                                    prev.focus)
        else:
            state = IngredientState(recipe.ingredients)
        step: Step = Step(text, state)

        # Uncapitalize the first letter. This prevents SpaCy from reading the
        # first word as a proper noun (imperative sentences are less common in
        # its dataset).
        doc = nlp(text[0].lower() + text[1:])

        # Indices of tokens relabelled as the root verb by the correction
        # below; used later to keep them out of ingredient names.
        promoted: set[int] = set()

        for i, token in enumerate(doc):

            # If the parser interpreted an imperative sentence as an NP,
            # correct it: the first token of the clause (start of the text or
            # right after punctuation) becomes the root and the noun its
            # direct object.
            if token.dep_ == 'ROOT' and token.head.pos_ == 'NOUN':
                for j in range(i, -1, -1):
                    if j == 0 or doc[j-1].dep_ == 'punct':
                        doc[j].dep_ = 'ROOT'
                        token.dep_ = 'dobj'
                        step.methods.append(doc[j].text)
                        promoted.add(j)
                        break

            # Extract methods
            if token.dep_ == 'ROOT':
                step.methods.append(token.text)
            elif token.dep_ == 'conj' and token.head.text in step.methods:
                step.methods.append(token.text)

        # Extract ingredients
        for chunk in doc.noun_chunks:

            # If the noun chunk is not a dobj, pobj, or conj, ignore it.
            if chunk.root.dep_ not in ('dobj', 'pobj', 'conj'):
                continue

            # If a promoted root verb is mistakenly in the chunk (in front of
            # its head noun), keep only the part after it.
            start = chunk.start
            for k in range(chunk.start, chunk.root.i):
                if k in promoted:
                    start = k + 1
            name = doc[start:chunk.end].text
            if not name:
                continue

            # Check if the referenced noun is an ingredient.
            # Assume ambiguous ingredient means inclusive.
            ingr_inds = find_ingredient(name, step.state.remaining)
            for idx in ingr_inds:
                used = step.state.remaining[idx]
                step.ingredients.append(
                    Ingredient(used.name, used.quantity, used.unit))
            # Delete from the back so earlier indices stay valid.
            for idx in sorted(ingr_inds, reverse=True):
                del step.state.remaining[idx]

        # Extract times
        step.times.extend(ent.text for ent in doc.ents
                          if ent.label_ in ('TIME', 'DATE'))

        # Save step to recipe
        recipe.steps.append(step)

In [7]:
from html.parser import HTMLParser
from enums import RecipeSource, HTMLTag

class RecipeHTMLParser(HTMLParser):
    '''HTML parser that handles recipes'''

    def __init__(self, source: RecipeSource, convert_charrefs: bool = True) -> None:
        # Initialize class, setting recipe to empty
        self.source = source
        self.recipe = Recipe()
        self.current_tag = HTMLTag.UNKNOWN
        self.current_section = HTMLTag.UNKNOWN
        super().__init__(convert_charrefs=convert_charrefs)
    
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        # Save the current tag
        self.current_tag = HTMLTag.from_tag(self.source, tag, attrs)
        match self.current_tag:
            case HTMLTag.INGREDIENTS_LIST:
                self.current_section = HTMLTag.INGREDIENTS_LIST
            case HTMLTag.INGREDIENT:
                if self.current_section == HTMLTag.INGREDIENTS_LIST:
                    self.ingredient = Ingredient()
            case HTMLTag.STEPS_LIST:
                self.current_section = HTMLTag.STEPS_LIST
        return super().handle_starttag(tag, attrs)
    
    def handle_data(self, data: str) -> None:
        # Handle text between tags as appropriate
        match self.current_tag:
            case HTMLTag.TITLE:
                self.recipe.title = data.strip()
            case HTMLTag.OVERVIEW_LABEL:
                self.label = data.lower().strip(':,.! \n\t')
            case HTMLTag.OVERVIEW_TEXT:
                self.recipe.other[self.label] = data.strip()
            case HTMLTag.INGREDIENT_QUANTITY:
                if self.current_section == HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.quantity = str_to_fraction(data.strip())
            case HTMLTag.INGREDIENT_UNIT:
                if self.current_section == HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.unit = data.strip()
            case HTMLTag.INGREDIENT_NAME:
                if self.current_section == HTMLTag.INGREDIENTS_LIST:
                    self.ingredient.name = data.strip()
                    self.recipe.ingredients.append(self.ingredient)
            case HTMLTag.STEP:
                if self.current_section == HTMLTag.STEPS_LIST:
                    parse_and_add_step(data.strip(), self.recipe)
        return super().handle_data(data)

    def handle_endtag(self, tag: str) -> None:
        # Reset tag
        self.current_tag = HTMLTag.UNKNOWN
        return super().handle_endtag(tag)

In [8]:
import re
import requests

def _normalize_recipe_url(url: str) -> str:
    '''Rewrites url to the form "https://www.<rest>".'''
    # Drop an existing http/https scheme first so that it is never prefixed
    # a second time (e.g. "https://www.https://...").
    rest = re.sub(r'^https?://', '', url.strip(), flags=re.IGNORECASE)
    if not rest.lower().startswith('www.'):
        rest = ''.join(['www.', rest])
    return ''.join(['https://', rest])


def get_recipe_from_url(url: str, timeout: float = 30.0) -> Recipe | None:
    '''Downloads the page at url and parses it into a Recipe.

    Args:
        url: Address of the recipe page; the scheme and the "www." prefix
            may be omitted.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed Recipe, or None if ``RecipeSource.from_url`` does not
        recognize the URL.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            a non-success HTTP status.
    '''

    # Find recipe source; return None if unsupported
    source = RecipeSource.from_url(url)
    if source == RecipeSource.UNKNOWN:
        return None

    # Add appropriate HTTPS tag if not there
    url = _normalize_recipe_url(url)

    # Get the recipe from the page
    with requests.get(url, timeout=timeout) as response:
        # Fail loudly instead of parsing an error page into an empty recipe
        response.raise_for_status()
        parser = RecipeHTMLParser(source)
        parser.feed(response.text)
        # Flush any text still buffered by the parser
        parser.close()
        return parser.recipe

In [21]:
# Fetch one recipe and print what was extracted from it. The commented-out
# lines are alternative recipes to try.
recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/19644/moussaka/")
# recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/218091/classic-and-simple-meat-lasagna/")
# recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/230238/gingerbread-men-cookies/")

# Title, followed by a blank line
print(recipe.title, '\n')

# One line per ingredient: "<quantity> <unit> <name>", leaving out whichever
# of quantity / unit is missing
for ingr in recipe.ingredients:
    line = ''
    if ingr.quantity:
        line += str(ingr.quantity) + ' '
    if ingr.unit:
        line += str(ingr.unit) + ' '
    print(line + str(ingr.name))
print()

# One line per step: text, methods, names of the ingredients used, times
for step in recipe.steps:
    used_names = [ingr.name for ingr in step.ingredients]
    print(step.text, step.methods, used_names, step.times)

Moussaka 

3 eggplants, peeled and cut lengthwise into 1/2 inch thick slices
salt to taste
1/4 cup olive oil
1 tablespoon butter
1 pound lean ground beef
2 onions, chopped
1 clove garlic, minced
ground black pepper to taste
2 tablespoons dried parsley
1/2 teaspoon fines herbs
1/4 teaspoon ground cinnamon
1/2 teaspoon ground nutmeg, divided
1 (8 ounce) can tomato sauce
1/2 cup red wine
1 egg, beaten
4 cups milk
1/2 cup butter
6 tablespoons all-purpose flour
ground white pepper, to taste
3/2 cups freshly grated Parmesan cheese

Lay eggplant slices on paper towels. ['lay'] ['eggplants, peeled and cut lengthwise into 1/2 inch thick slices'] []
Sprinkle lightly with salt. ['sprinkle'] ['salt to taste'] []
Let sit for 30 minutes to draw out moisture. ['sit'] [] ['30 minutes']
Pat dry with paper towels. ['dry'] [] []
Warm olive oil in a skillet over high heat. ['warm'] ['olive oil'] []
Fry eggplant until browned, 2 to 3 minutes per side. ['fry'] [] ['2 to 3 minutes']
Drain on paper towels. ['

In [22]:
import spacy
nlp = spacy.load("en_core_web_md")

# Scratch cell for inspecting how SpaCy parses a single instruction.
# Uncomment one of the alternatives below to try a different sentence.
# text = "lay eggplant slices on paper towels"
# text = "sprinkle lightly with salt."
# text = "cook and stir until beef is browned, 8 to 10 minutes"
# text = "stir in ground beef, onions, and garlic"
# text = "meanwhile, place ground beef, garlic, oregano, garlic powder, salt, and black pepper in a large skillet over medium heat"
# text = "add lasagna noodles and cook for 10 minutes or until al dente"
# text = "preheat the oven to 350 degrees F (175 degrees C)"
# text = "season béchamel sauce with salt and white pepper"
# text = "serve hot and enjoy!"
# text = "bake in the preheated oven until edges of cookies are set and just begin to brown, 8 to 10 minutes"
# text = "pour béchamel sauce on top and sprinkle with remaining 1/4 teaspoon nutmeg"
text = "add parsley, fines herbs, cinnamon, and 1/4 teaspoon nutmeg"

doc = nlp(text)

# Per token: text, part of speech, dependency label, head text, head part of
# speech, children, entity type
for token in doc:
    head = token.head
    print(token.text, token.pos_, token.dep_, head.text, head.pos_,
          list(token.children), token.ent_type_)
print()

# Per noun chunk: text, root text, root dependency label, text of root's head
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text)
print()

# Per named entity: text and label
for ent in doc.ents:
    print(ent.text, ent.label_)

add VERB ROOT add VERB [parsley, herbs] 
parsley NOUN dobj add VERB [,] 
, PUNCT punct parsley NOUN [] 
fines NOUN compound herbs NOUN [] 
herbs NOUN dobj add VERB [fines, ,, cinnamon] 
, PUNCT punct herbs NOUN [] 
cinnamon NOUN conj herbs NOUN [,, and, nutmeg] 
, PUNCT punct cinnamon NOUN [] 
and CCONJ cc cinnamon NOUN [] 
1/4 NUM nummod nutmeg NOUN [] CARDINAL
teaspoon NOUN compound nutmeg NOUN [] 
nutmeg NOUN conj cinnamon NOUN [1/4, teaspoon] 

parsley parsley dobj add
fines herbs herbs dobj add
cinnamon cinnamon conj herbs
1/4 teaspoon nutmeg nutmeg conj cinnamon

1/4 CARDINAL
