In [5]:
import nltk

In [6]:
TEST_INPUT = [
'6c',
'6tsp',
'6dingus cup / 3',
'6 cup',
'6 cups',
'6 cups <a href="https://cooking.nytimes.com/recipes/1021916-vegan-bolognese">vegan Bolognese</a>',
'1 cup flour',
'1 c flour',
'1.5c flour',
'1 1/2c flour',
'1 (5- to 6-ounce) can or jar tuna, drained and flaked, or 1 (13-ounce) can chickpeas or white beans, drained',
'1/2 pound fresh tuna, grilled or 6 1/2- to 7-ounce can albacore tuna, packed in water',
'Two 5-ounce (140g) cans tuna in olive oil, drained (or 10 ounces/280g shredded roast chicken meat)',
'2 (6-ounce) cans Italian tuna in water or oil, drained',
'1 (5-ounce) can tuna packed in olive oil, preferably Italian (see note)'
'2 4-pound Atlantic salmon (2 1/4 inches at thickest point), scaled and cleaned, gills removed, head and tail on, interior cavity well washed',
'For the filling:  ',
'1 packed cup cilantro, coarsely chopped',
'4 (6-ounce) mild white fish fillets (for example, cod, hake or blackfish)',
'1 (10- to 14-pound) turkey',
'1 (10- to 14- pound) turkey',
'1¼ cup/80 grams plus 2 teaspoons/5 grams mild honey',
'1 six-to-eight-pound, cleaned, whole salmon, preferably with head left on (see note)',
'1 5-to-6-pound, cleaned, whole salmon, preferably with head left on (see note)',
'1 5- to 6-pound, cleaned, whole salmon, preferably with head left on (see note)',
'1 (4-ounce) can smoked mussels',
'1 (1 1/2-pound) salmon fillet, skin-on or skinless',
'1 salmon about 4 1/2 pounds, boned with head and tail left on',
'4 whole fish, like sea bass or black bass, 1 to 1 1/2 pounds each',
'4 whole fish,, like sea bass or black bass, 1 to 1.5 pounds each',
'1 salmon or other firm fish, about 2 pounds, gutted and scaled, with the head left on',
'1 scallion, chopped, for serving',
'1 or 2 cup',
'5 to 7 handful',
'yada yada',
'chopped yada yada',
'1/4 cup/80 grams mild honey',
'1¼ cup/80 grams mild honey',
'1¼ cup (approx. 80 grams) mild honey',
'350g (approx. 1 1/2 cups) mild honey',
'unsalted, softened butter, softened: 9 tablespoons (4.5 ounces or 128 grams)'
]

In [16]:
import re

from parser import devulgarize, strip_html_tags
from parser import units_group
units = r'{}[sei]*$'.format(units_group)
gapless_units = r'(\d)({})\b'.format(units_group)

all_parens = r'^\([^\)]*\)$'
colon_anywhere = r'.*:.*'
DISREGARD_HEADERS = [
    r'accompaniments?:',
    r'equipment:',
    r'for .*:',
    r'garnish(es)?:',
    r'glass(ware)?:',
    r'grill heat:',
    r'ingredient info:',
    r'note:',
    r'serving suggestion(s)?:',
    r'special equipment:',
    r'test-kitchen tip:',
    r'type of fire:',
]

def disregard(text: str) -> bool:
    """ Return True if text:
    * ends in a colon ":" as this almost always means it's a directive
     * There are a few cases where a quantity is provided as part of this, but this is very rare, and more often than not the quantity is redundant with subsequent entries. We'll live with the misses here.
    * begins with certain other directives, which typically include a colon, e.g. "Equipment:...", "Accompaniment:...", "Ingredient info:..."
    * is entirely parenthetical, e.g. "(Essential oil complement: orange)"
    * Things that are marked optional TBD
        * I lean towards ignoring these, too
    """

    if not text: # empty
        return True
    if text.endswith(':'):
        return True
    if re.match(all_parens, text):
        return True

    if re.match(colon_anywhere, text):
        lowered = text.lower()
        # re.match only considers string start, which is what we want
        if any([re.match(pttrn, lowered) for pttrn in DISREGARD_HEADERS]):
            return True
    
    return False

def gap_units(text: str) -> str:
    return re.sub(gapless_units, r'\g<1> \g<2>', text)  # parenthesize

def preprocess(text: str) -> str:
    """
    Preprocess/standardize ingredient text in preparation for analysis:
        * Strip leading/trailing whitespace
        * Strip out any html tags
            * At some point, we need to decide the "correct" way to handle inlined recipe links and whether we want to do anything about that
        * Expand vulgar fractions, prepending a space, e.g. "1¼" -> "1 1/4" (nltk will take 1/4 as a number, huzzah!)
        * Put a space between spaceless units, e.g. "6c" -> "6 c"
        * Convert fractional numbers to decimals (?)
    """
    text = text.strip() # remove leading/trailing whitespace
    text = strip_html_tags(text)
    text = devulgarize(text)
    text = gap_units(text)

    return text

def tokenize(text:str) -> list:
    """ A wrapper on nltk.word_tokenize, except when
    we see things like "grams/3" in the nltk tokens (e.g. from an ingredient that read
    "45 grams/3 ounces of oil"), replace it with "grams", "/", "3" so we can use the
    slash as an indicator of an alternative measure.
    """
    # tokenize and tag the text usin nltk defaults
    tokens = nltk.word_tokenize(text)
    out = []
    for t in tokens:
        if '/' not in t:
            out += [t]
            continue

        parts = t.split('/')
        digits = len([el for el in parts if el.isdigit()])
        if digits == len(parts):
            out += [t]
            continue

        for i, p in enumerate(parts):
            if p: # don't include empty strings
                out += [p]
            if i < len(parts) - 1:
                out += ['/']

    return out

def tag(tokens: list) -> list:
    data = nltk.pos_tag(tokens)

    out = []
    for datum in data:
        if re.match(units, datum[0]):
            out += [(datum[0], 'UNIT')]
        else:
            out += [datum]

    return out


def parse(raw_text: str):
    """ Extract quantity and name information from an ingredient entry
    """
    text = preprocess(raw_text)
    if disregard(text):
        return None

    tokens = tokenize(text)

    tagged_data = tag(tokens)

    return tagged_data

In [17]:
# strip html tags, condense multiple whitespace -> text
# split into tokens on spaces, commas -> [(text, start, stop)]
# tag quantities and units
# print('{} ->\n {}'.format(TEST_INPUT[-5], parse(TEST_INPUT[-5])))
# print(gap_units('6c 5Cup whatever'))
for input in TEST_INPUT:
    # continue
    print('{} ->\n {}'.format(input, parse(input)))


6c ->
 [('6', 'CD'), ('c', 'UNIT')]
6tsp ->
 [('6', 'CD'), ('tsp', 'UNIT')]
6dingus cup / 3 ->
 [('6dingus', 'CD'), ('cup', 'UNIT'), ('/', 'CD'), ('3', 'CD')]
6 cup ->
 [('6', 'CD'), ('cup', 'UNIT')]
6 cups ->
 [('6', 'CD'), ('cups', 'UNIT')]
6 cups <a href="https://cooking.nytimes.com/recipes/1021916-vegan-bolognese">vegan Bolognese</a> ->
 [('6', 'CD'), ('cups', 'UNIT'), ('vegan', 'JJ'), ('Bolognese', 'JJ')]
1 cup flour ->
 [('1', 'CD'), ('cup', 'UNIT'), ('flour', 'VBP')]
1 c flour ->
 [('1', 'CD'), ('c', 'UNIT'), ('flour', 'VBP')]
1.5c flour ->
 [('1.5', 'CD'), ('c', 'UNIT'), ('flour', 'VBP')]
1 1/2c flour ->
 [('1', 'CD'), ('1/2', 'CD'), ('c', 'UNIT'), ('flour', 'VBP')]
1 (5- to 6-ounce) can or jar tuna, drained and flaked, or 1 (13-ounce) can chickpeas or white beans, drained ->
 [('1', 'CD'), ('(', '('), ('5-', 'JJ'), ('to', 'UNIT'), ('6-ounce', 'NN'), (')', ')'), ('can', 'UNIT'), ('or', 'CC'), ('jar', 'UNIT'), ('tuna', 'UNIT'), (',', ','), ('drained', 'VBN'), ('and', 'CC'), ('fl

In [None]:
from parser import amounts

amounts('1/4 cup/80 grams mild honey')

1/4 cup / 80 grams mild honey
1/4 cup / 80 grams honey
[{'unit': 'cup', 'qty': 0.25, 'qualifiers': [], 'per': None, 'plus': False}, {'unit': 'g', 'qty': 80.0, 'qualifiers': [], 'per': None, 'plus': False}]


{'qtys': [{'unit': 'cup',
   'qty': 0.25,
   'qualifiers': [],
   'per': None,
   'plus': False},
  {'unit': 'g', 'qty': 80.0, 'qualifiers': [], 'per': None, 'plus': False}],
 'names': ['honey'],
 'mods': [],
 'stripped_words': ['mild']}

In [None]:
ing = '1 ¼ 1/4 1.25 cup/80 grams mild honey, dusted with five-spice 3- to 4-ounce powder'
nltk.pos_tag(nltk.word_tokenize(ing))

[('1', 'CD'),
 ('¼', 'JJ'),
 ('1/4', 'CD'),
 ('1.25', 'CD'),
 ('cup/80', 'NN'),
 ('grams', 'NNS'),
 ('mild', 'VBP'),
 ('honey', 'NN'),
 (',', ','),
 ('dusted', 'VBN'),
 ('with', 'IN'),
 ('five-spice', 'JJ'),
 ('3-', 'JJ'),
 ('to', 'TO'),
 ('4-ounce', 'JJ'),
 ('powder', 'NN')]