In [1]:
# Install the `requests` package with the %pip magic; `requests` is imported
# further down to download the recipe page.
%pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
!python3 -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
from fractions import Fraction
import unicodedata

def str_to_fraction(data: str):
    '''Parses a quantity string into an exact Fraction.

    The string is split on whitespace and every piece is converted with
    ``Fraction`` and added up, so "1 1/2" yields 3/2. Precomposed vulgar
    fractions (e.g. "½") and the Unicode fraction slash are accepted, with
    or without a space after a leading whole number ("1 ½" and "1½" both
    yield 3/2).

    Args:
        data: Text of the quantity.

    Returns:
        The sum of all whitespace-separated pieces; ``Fraction(0)`` when
        ``data`` contains no pieces.

    Raises:
        ValueError: If a piece is not something ``Fraction`` can parse.
    '''
    fraction_slash = '\u2044'

    # NFKD turns a precomposed fraction such as '½' into '1⁄2'. When it
    # directly follows a digit ("1½") the digits would fuse into "11⁄2", so a
    # space is inserted in front of every such character first. A literal
    # fraction slash typed between two digits is left alone.
    spaced = []
    for ch in data:
        if ch != fraction_slash and \
                fraction_slash in unicodedata.normalize('NFKD', ch):
            spaced.append(' ')
        spaced.append(ch)

    # Decompose, then map the fraction slash to the ASCII slash that
    # Fraction understands.
    table = str.maketrans({fraction_slash: '/'})
    pieces = unicodedata.normalize('NFKD', ''.join(spaced)).translate(table).split()

    total = Fraction()
    for val in pieces:
        total += Fraction(val)
    return total

def fraction_to_str(frac: Fraction):
    '''Formats a Fraction as a whole number, mixed number or plain fraction.

    A denominator of 1 gives just the numerator ("4"). A value whose
    numerator is at least its denominator gives "<whole> <remainder>"
    ("1 1/2"). Anything else is returned in Fraction's own string form
    ("1/3").
    '''
    numer, denom = frac.numerator, frac.denominator

    # Whole numbers carry no fractional part
    if denom == 1:
        return str(numer)

    # Numerator smaller than denominator: nothing to split off
    if numer < denom:
        return str(frac)

    # Split into the integer part and the remaining fraction
    whole, leftover = divmod(numer, denom)
    return f'{whole} {Fraction(leftover, denom)!s}'

In [4]:
class Ingredient:
    '''Struct holding ingredient information'''

    def __init__(self, name: str | None = None,
                 quantity: Fraction | None = None,
                 unit: str | None = None):
        self.name = name
        self.quantity = quantity
        self.unit = unit

class Step:
    '''Record for a single recipe step and the details extracted from it.

    Attributes:
        text: Sentence text of the step.
        ingredients: (Ingredient, int) pairs; starts empty.
            NOTE(review): the meaning of the int is not visible here.
        tools: Tool names; starts empty.
        methods: Method words; starts empty.
        times: Time expressions; starts empty.
    '''

    def __init__(self, text: str):
        # The sentence itself
        self.text: str = text

        # Extracted details: each instance gets its own fresh, empty lists
        self.ingredients: list[tuple[Ingredient, int]] = list()
        self.tools: list[str] = list()
        self.methods: list[str] = list()
        self.times: list[str] = list()
    
class Recipe:
    '''Record for a whole recipe.

    Attributes:
        title: Recipe title; starts as the empty string.
        ingredients: Ingredient records; starts empty.
        steps: Step records; starts empty.
        other: Extra string-to-string information; starts empty.
    '''

    def __init__(self):
        # A new recipe is blank; each instance owns its own containers
        self.title: str = str()
        self.ingredients: list[Ingredient] = list()
        self.steps: list[Step] = list()
        self.other: dict[str, str] = dict()

In [5]:
import re
import spacy
# Load the `en_core_web_md` spaCy pipeline once at module level; `nlp` is the
# object parse_and_add_step calls on every step sentence.
nlp = spacy.load("en_core_web_md")

def parse_and_add_step(instr: str, recipe: Recipe) -> None:
    '''Parses an instruction into maybe more steps and adds to recipe.'''

    # Split recipe instruction into "sentences" for SpaCy parser
    steps = re.split(r'(?:\.|;|, then)\s+', instr)

    for text in steps:

        # Skip if empty
        if not text:
            continue

        # Form a nice sentence for the step text, and initialize step
        text = ''.join([text[0].upper(), text[1:len(text)]])
        if not text.endswith('.'):
            text = ''.join([text, '.'])
        step = Step(text)

        # Uncapitalize the first letter. This prevents SpaCy from reading the
        # first word as a proper noun (imperative sentences are less common in
        # its dataset).
        text = ''.join([text[0].lower(), text[1:len(text)]])

        # Based on the dependency + part of speech tagging, extract necessary
        # information.
        doc = nlp(text)
        for (i, token) in enumerate(doc):

            # If the parser didn't interpret this as an imperative sentence,
            # correct it.
            if token.dep_ == 'ROOT' and token.head.pos_ == 'NOUN':
                for j in range(i,-1,-1):
                    if j == 0 or \
                        (j > 0 and doc[j-1].dep_ == 'punct'):
                        doc[j].dep_ = 'ROOT'
                        token.dep_ = 'dobj'
                        step.methods.append(doc[j].text)
                        break

            # Perform extraction
            dep = token.dep_
            match dep:
                case 'ROOT':
                    step.methods.append(token.text)
                case 'conj':
                    if token.head.pos_ == 'VERB':
                        step.methods.append(token.text)

        # Save step to recipe
        recipe.steps.append(step)

In [6]:
from html.parser import HTMLParser
from enums import RecipeSource, HTMLTag

class RecipeHTMLParser(HTMLParser):
    '''HTMLParser subclass that fills in a Recipe while a page is fed to it.

    Each start tag is classified with ``HTMLTag.from_tag`` for the given
    source. The text that follows is stored on ``self.recipe`` according to
    that classification and to the list section (ingredients or steps) most
    recently entered.

    NOTE(review): ``self.ingredient`` and ``self.label`` are first assigned
    while parsing, not in ``__init__``; data for a quantity/unit/name or an
    overview text arriving before they are assigned raises AttributeError.
    '''

    def __init__(self, source: RecipeSource, convert_charrefs: bool = True) -> None:
        # Parsing state starts blank: empty recipe, no known tag or section
        self.source = source
        self.recipe = Recipe()
        self.current_tag = self.current_section = HTMLTag.UNKNOWN
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        # Classify the tag and remember it for the data that follows
        kind = HTMLTag.from_tag(self.source, tag, attrs)
        self.current_tag = kind

        if kind == HTMLTag.INGREDIENTS_LIST:
            # Entering the ingredients list
            self.current_section = HTMLTag.INGREDIENTS_LIST
        elif kind == HTMLTag.INGREDIENT:
            # A new ingredient entry only counts inside the ingredients list
            if self.current_section == HTMLTag.INGREDIENTS_LIST:
                self.ingredient = Ingredient()
        elif kind == HTMLTag.STEPS_LIST:
            # Entering the steps list
            self.current_section = HTMLTag.STEPS_LIST

        return super().handle_starttag(tag, attrs)

    def handle_data(self, data: str) -> None:
        # Route the text according to the tag it belongs to
        kind = self.current_tag

        if kind == HTMLTag.TITLE:
            self.recipe.title = data.strip()
        elif kind == HTMLTag.OVERVIEW_LABEL:
            # Remembered as the key for the overview text that follows
            self.label = data.lower().strip(':,.! \n\t')
        elif kind == HTMLTag.OVERVIEW_TEXT:
            self.recipe.other[self.label] = data.strip()
        elif kind == HTMLTag.INGREDIENT_QUANTITY:
            if self.current_section == HTMLTag.INGREDIENTS_LIST:
                self.ingredient.quantity = str_to_fraction(data.strip())
        elif kind == HTMLTag.INGREDIENT_UNIT:
            if self.current_section == HTMLTag.INGREDIENTS_LIST:
                self.ingredient.unit = data.strip()
        elif kind == HTMLTag.INGREDIENT_NAME:
            # The name completes the entry, so it is stored on the recipe here
            if self.current_section == HTMLTag.INGREDIENTS_LIST:
                self.ingredient.name = data.strip()
                self.recipe.ingredients.append(self.ingredient)
        elif kind == HTMLTag.STEP:
            if self.current_section == HTMLTag.STEPS_LIST:
                parse_and_add_step(data.strip(), self.recipe)

        return super().handle_data(data)

    def handle_endtag(self, tag: str) -> None:
        # Any closing tag clears the current tag; the section is kept
        self.current_tag = HTMLTag.UNKNOWN
        return super().handle_endtag(tag)

In [7]:
import re
import requests

def get_recipe_from_url(url: str, timeout: float = 10.0) -> Recipe | None:
    '''Downloads a recipe page and parses it into a Recipe.

    Args:
        url: Address of the recipe page. A missing scheme is filled in with
            "https://" (plus "www." when the address does not start with
            it); an address that already has an http(s) scheme is used
            unchanged.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, passed straight to ``requests.get``.

    Returns:
        The parsed Recipe, or None when ``RecipeSource.from_url`` reports
        the source as unknown.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    '''

    # Find recipe source; return None if unsupported
    source = RecipeSource.from_url(url)
    if source == RecipeSource.UNKNOWN:
        return None

    # Add appropriate HTTPS prefix if no scheme is there. Any existing
    # http(s) scheme is respected, so "https://site.com/..." is no longer
    # turned into "https://www.https://site.com/...".
    if not re.match(r'https?://', url, re.IGNORECASE):
        if re.match(r'www\.', url):
            url = ''.join(['https://', url])
        else:
            url = ''.join(['https://www.', url])

    # Get the recipe from the page. Without a timeout the call could wait
    # forever on an unresponsive server.
    with requests.get(url, timeout=timeout) as response:
        parser = RecipeHTMLParser(source)
        parser.feed(response.text)
        return parser.recipe

In [8]:
# Demo: fetch one recipe, print its title, then print every parsed step
# together with the methods extracted from it.
# NOTE(review): get_recipe_from_url returns None for an unsupported source, in
# which case `recipe.title` below would raise AttributeError.
recipe = get_recipe_from_url("https://www.allrecipes.com/recipe/230238/gingerbread-men-cookies/")
print(recipe.title, '\n')
for step in recipe.steps:
    print(step.text, step.methods)

Best Gingerbread Men Cookies 

Mix flour, ginger, cinnamon, baking soda, nutmeg, and salt in large bowl. ['mix']
Beat butter and brown sugar in large bowl with electric mixer on medium speed until light and fluffy. ['beat']
Add molasses, egg, and vanilla. ['add']
Mix well. ['mix']
Gradually beat in flour mixture on low speed until well mixed. ['beat']
Press dough into a thick, flat disk. ['press']
Wrap in plastic wrap. ['wrap']
Refrigerate 4 hours or overnight. ['refrigerate']
Preheat the oven to 350 degrees F (175 degrees C). ['preheat']
Roll dough on a lightly floured surface to a thickness of 1/4 inch. ['roll']
Cut into gingerbread men shapes with a 5-inch cookie cutter. ['cut']
Place gingerbread men 1 inch apart on ungreased baking sheets. ['place']
Bake in the preheated oven until edges of cookies are set and just begin to brown, 8 to 10 minutes. ['bake', 'begin']
Cool on baking sheets for 1 to 2 minutes. ['cool']
Remove to wire racks to cool completely. ['remove']
Decorate cooled

In [9]:
import spacy
# Rebinds `nlp` to a freshly loaded `en_core_web_md` pipeline.
nlp = spacy.load("en_core_web_md")

# Sample step sentences for inspecting the parse. The single uncommented
# `text = ...` line is the sentence analyzed below; the commented ones are
# alternatives kept for experimentation.
# text = "lay eggplant slices on paper towels"
# text = "sprinkle lightly with salt."
# text = "cook and stir until beef is browned, 8 to 10 minutes"
# text = "stir in ground beef, onions, and garlic"
text = "meanwhile, place ground beef, garlic, oregano, garlic powder, salt, and black pepper in a large skillet over medium heat"
# text = "add lasagna noodles and cook for 10 minutes or until al dente"
# text = "preheat the oven to 350 degrees F (175 degrees C)"
# text = "season béchamel sauce with salt and white pepper"
# text = "serve hot and enjoy!"

doc = nlp(text)
# One line per token: text, dependency label, head text, head part of speech,
# and the token's children.
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])
print()
# One line per noun chunk: chunk text, root text, root dependency label, and
# the text of the root's head.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

meanwhile advmod beef NOUN []
, punct beef NOUN []
place compound beef NOUN []
ground compound beef NOUN []
beef ROOT beef NOUN [meanwhile, ,, place, ground, ,, garlic, in]
, punct beef NOUN []
garlic conj beef NOUN [,, oregano]
, punct garlic NOUN []
oregano conj garlic NOUN [,, powder]
, punct oregano NOUN []
garlic amod powder NOUN []
powder conj oregano NOUN [garlic, ,, salt]
, punct powder NOUN []
salt conj powder NOUN [,, and, pepper]
, punct salt NOUN []
and cc salt NOUN []
black amod pepper NOUN []
pepper conj salt NOUN [black]
in prep beef NOUN [skillet]
a det skillet NOUN []
large amod skillet NOUN []
skillet pobj in ADP [a, large, over]
over prep skillet NOUN [heat]
medium amod heat NOUN []
heat pobj over ADP [medium]

meanwhile, place ground beef beef ROOT beef
garlic garlic conj beef
oregano oregano conj garlic
garlic powder powder conj oregano
salt salt conj powder
black pepper pepper conj salt
a large skillet skillet pobj in
medium heat heat pobj over
