In [15]:
import os
import re
import json
from ingredient_phrase_tagger.training import utils
from string import punctuation
import sklearn_crfsuite
from nltk.tokenize import *
import re
import json
from itertools import chain
import nltk
import pycrfsuite
import pickle as cPickle
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import subprocess
import numpy as np
from googletrans import Translator




# Sentence splitter used by parse_recipe_ingredients to break the scraped
# ingredient text into one sentence per ingredient line.
tokenizer = PunktSentenceTokenizer()

# Pickled model file, resolved relative to the current working directory.
filename = 'finalized_model.pkl'
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open(filename, 'rb') as model_file:
    loaded_model = cPickle.load(model_file)
# Tagger object exposed by the loaded model; parse_ingredient calls its
# .tag() method (presumably a fitted sklearn_crfsuite.CRF — TODO confirm).
tagger = loaded_model.tagger_

def get_ingredients_url(url):
    """Scrape a recipe page for its ingredient text, serving size and title.

    The CSS classes looked up here are the ones present on the
    bbc.co.uk/food recipe pages this notebook is run against.

    Args:
        url: Address of the recipe page to download.

    Returns:
        A tuple ``(ingredient, servingsize, recipe_title)``:
        ``ingredient`` is the text of every ingredient list item, each
        followed by a full stop and a newline, concatenated into one string;
        ``servingsize`` is the second space-separated word of the serving
        paragraph, reduced to its lower bound when it is a range such as
        ``'3-4'`` or ``'3–4'``; ``recipe_title`` is the heading text.

    Raises:
        AttributeError: If the serving paragraph or the heading is not
            found on the page.
        IndexError: If the serving paragraph has fewer than two words.
    """
    page = requests.get(str(url))
    soup = BeautifulSoup(page.content, 'html.parser')

    items = soup.find_all('li', class_="recipe-ingredients__list-item")
    # Terminate each item with '.' so the sentence tokenizer can split them.
    ingredient = ''.join(item.get_text() + '.\n' for item in items)

    serving_text = soup.find('p', class_="recipe-metadata__serving").get_text()
    servingsize = serving_text.split(' ')[1]
    recipe_title = soup.find('h1', class_='gel-trafalgar content-title__text').get_text()

    # Ranges on these pages are written with an en dash ('3–4'), which a
    # plain split('-') leaves intact; split on either dash and keep the
    # lower bound.
    servingsize = re.split('[-\u2013]', str(servingsize))[0]

    return ingredient, servingsize, recipe_title

def sent2labels(sent):
    """Return the last element of every word record in ``sent``."""
    labels = []
    for record in sent:
        labels.append(record[-1])
    return labels

def sent2features(sent):
    """Return every word record in ``sent`` with its last element dropped."""
    feature_rows = []
    for record in sent:
        feature_rows.append(record[:-1])
    return feature_rows

def sent2tokens(sent):
    """Return the first element of every word record in ``sent``."""
    tokens = []
    for record in sent:
        tokens.append(record[0])
    return tokens


def get_sentence_features(sent):
    """Build one feature row per token of an ingredient sentence.

    The sentence is passed through ``utils.cleanUnicodeFractions`` and then
    word-tokenized with NLTK. Each row starts with the token itself, followed
    by whatever ``utils.getFeatures`` yields for that token, its 1-based
    position and the full token list.
    """
    cleaned = utils.cleanUnicodeFractions(sent)
    tokens = nltk.word_tokenize(cleaned)
    return [
        [token, *utils.getFeatures(token, position, tokens)]
        for position, token in enumerate(tokens, start=1)
    ]

def format_ingredient_output(tagger_output, display=False):
    """Collapse ``(token, tag)`` pairs into a one-element list of dicts.

    Args:
        tagger_output: Iterable of ``(token, tag)`` pairs. A leading ``B-``
            or ``I-`` is removed from each tag and the rest is lower-cased.
        display: Accepted but never read by this function.

    Returns:
        An empty list when there are no pairs. Otherwise a list holding a
        single dict that maps each tag to its tokens joined with
        ``utils.smartJoin`` (tokens tagged ``unit`` are singularized first),
        plus an ``"input"`` entry rebuilt from the unmodified tokens.
    """
    grouped = {}  # tag -> tokens carrying that tag, in order of appearance
    runs = []     # (tag, [tokens]) for each stretch of consecutive same-tag tokens

    for token, tag in tagger_output:
        # turn B-NAME/123 back into "name"
        tag = re.sub(r'^[BI]\-', "", tag).lower()

        # Extend the current run while the tag repeats, otherwise open a new
        # one; the runs are used to rebuild the raw phrase below.
        if runs and runs[-1][0] == tag:
            runs[-1][1].append(token)
        else:
            runs.append((tag, [token]))

        bucket = grouped.setdefault(tag, [])

        # HACK: If this token is a unit, singularize it so Scoop accepts it.
        if tag == "unit":
            token = utils.singularize(token)

        bucket.append(token)

    if not grouped:
        return []

    ingredient = {tag: utils.smartJoin(tokens) for tag, tokens in grouped.items()}

    # Add the raw ingredient phrase
    ingredient["input"] = utils.smartJoin(
        [" ".join(tokens) for _, tokens in runs])

    return [ingredient]

def parse_ingredient(sent):
    """Tag one ingredient sentence and return its formatted parse.

    Features are built with ``get_sentence_features``, tagged with the
    module-level ``tagger`` and formatted by ``format_ingredient_output``.
    When the result is non-empty its first entry always ends up with a
    ``'name'`` key: the existing name (or an empty string) with leading and
    trailing full stops stripped.
    """
    features = get_sentence_features(sent)
    predicted_tags = tagger.tag(features)
    token_tag_pairs = zip(sent2tokens(features), predicted_tags)
    result = format_ingredient_output(token_tag_pairs)

    if not result:
        return result

    first = result[0]
    first['name'] = first.get('name', '').strip('.')
    return result



def parse_recipe_ingredients(ingredient_list):
    """Parse a block of ingredient text into a quantity/unit/name table.

    The text is split into sentences with the module-level ``tokenizer``;
    each sentence is cleaned, tagged by ``parse_ingredient`` and reduced to
    exactly one ``(qty, unit, name)`` row, so the three columns always stay
    aligned.

    Args:
        ingredient_list: All ingredient lines as a single string.

    Returns:
        A tuple ``(final_df, parsed_ingredient)``. ``final_df`` is a
        DataFrame with columns ``qty``, ``unit`` and ``name`` in which rows
        with a missing name or unit are dropped, teaspoon rows are rewritten
        to ``0`` ``gram`` and missing quantities become ``1``.
        ``parsed_ingredient`` is the parse of the last sentence that
        produced one (an empty list when none did).
    """
    strip_chars = '!"#$%&\'())*+:;<=>?@[\\]^_`{|}~'
    # Plain substring rewrites, applied in this order.
    replacements = (
        ('can', ''),
        ('package', ''),
        ('container', ''),
        ('eggs eggs', 'eggs'),
        ('⅓', '.33'),
        ('½', '.5'),
        ('¼', '.25'),
        ('¾', '.75'),
        ('tsp', 'teaspoon'),
        ('tbsp', 'tablespoon'),
        ('large', ''),
        ('medium', ''),
        ('small', ''),
        ('kg', '000g'),
        ('aubergine', 'eggplant'),
    )

    names = []
    qtys = []
    units = []
    # Pre-bound so the return statement works even when nothing is parsed.
    parsed_ingredient = []

    sentences = [sent.strip('\n') for sent in tokenizer.tokenize(ingredient_list)]

    for sent in sentences:
        # cleaning for common issues
        for char in strip_chars:
            sent = sent.replace(char, '')
        for old, new in replacements:
            sent = sent.replace(old, new)

        has_gram_quantity = re.search(r"\dg", sent) is not None
        if has_gram_quantity:
            # Expand the "g" that follows a digit. A plain
            # replace("g", "gram", 1) rewrote the first "g" anywhere in the
            # sentence, e.g. turning "garlic" into "gramarlic".
            sent = re.sub(r"(\d)g", r"\1gram", sent, count=1)

        parsed = parse_ingredient(sent)

        print(parsed)
        if not parsed:
            # Nothing left to tag after cleaning; skip instead of indexing
            # into an empty list.
            continue
        parsed_ingredient = parsed
        fields = parsed[0]
        raw_input = fields['input']

        if 'name' in fields:
            # Keep the trailing digit-free part of the name. The pattern can
            # match the empty string, so the search always succeeds.
            names.append(re.search(r"[^\d]*$", fields['name']).group(0))
        else:
            names.append(np.nan)

        if 'gram' in raw_input:
            units.append('gram')
        elif 'milliliters' in raw_input:
            units.append('ml')
        elif 'unit' in fields:
            units.append(fields['unit'])
        else:
            units.append('unit')

        # Exactly one qty is appended per sentence. When the gram pattern is
        # not found in the rebuilt phrase, fall through to the other sources
        # rather than skipping the append (which shifted every later row).
        # "kg" was already rewritten to "000g" above, so no separate kilogram
        # branch is needed.
        gram_match = re.search(r"\d+(?=\s*g)", raw_input) if has_gram_quantity else None
        if gram_match is not None:
            qtys.append(gram_match.group(0))
        elif 'qty' in fields:
            qtys.append(fields['qty'])
        else:
            try:
                qtys.append(float(raw_input[:3]))
            except ValueError:
                qtys.append(np.nan)

    final_df = pd.DataFrame(list(zip(qtys, units, names)), columns=['qty', 'unit', 'name'])

    final_df = final_df[final_df['name'].notna()]
    final_df = final_df[final_df['unit'].notna()]

    final_df.loc[final_df['unit'] == 'teaspoon', 'qty'] = 0
    final_df.loc[final_df['unit'] == 'teaspoon', 'unit'] = 'gram'
    final_df.loc[final_df['qty'].astype(str) == 'nan', 'qty'] = 1

    return final_df, parsed_ingredient


In [16]:
# Download one recipe page; test_list holds the tuple returned by
# get_ingredients_url (ingredient text, serving size, recipe title).
test_list = get_ingredients_url('https://www.bbc.co.uk/food/recipes/sweet_and_sour_slaw_07245')

In [23]:
def parse_recipe_ingredients(ingredient_list):
    """Parse a block of ingredient text into per-name unit and quantity maps.

    The text is split into sentences with the module-level ``tokenizer``;
    each sentence is cleaned and tagged by ``parse_ingredient``. Results are
    keyed by the parsed ingredient name, so a name that occurs in several
    sentences keeps only the values of its last occurrence.

    Args:
        ingredient_list: All ingredient lines as a single string.

    Returns:
        A tuple ``(units, qtys)`` of dicts keyed by ingredient name. Every
        key present in ``units`` is also present in ``qtys``.
    """
    strip_chars = '!"#$%&\'())*+:;<=>?@[\\]^_`{|}~'
    # Plain substring rewrites, applied in this order.
    replacements = (
        ('can', ''),
        ('package', ''),
        ('container', ''),
        ('eggs eggs', 'eggs'),
        ('⅓', '.33'),
        ('½', '.5'),
        ('¼', '.25'),
        ('¾', '.75'),
        ('tsp', 'teaspoon'),
        ('tbsp', 'tablespoon'),
        ('large', ''),
        ('medium', ''),
        ('small', ''),
        ('kg', '000g'),
        ('aubergine', 'eggplant'),
    )

    qtys = {}
    units = {}

    sentences = [sent.strip('\n') for sent in tokenizer.tokenize(ingredient_list)]

    for sent in sentences:
        # cleaning for common issues
        for char in strip_chars:
            sent = sent.replace(char, '')
        for old, new in replacements:
            sent = sent.replace(old, new)

        has_gram_quantity = re.search(r"\dg", sent) is not None
        if has_gram_quantity:
            # Expand the "g" that follows a digit. A plain
            # replace("g", "gram", 1) rewrote the first "g" anywhere in the
            # sentence, e.g. turning "garlic" into "gramarlic".
            sent = re.sub(r"(\d)g", r"\1gram", sent, count=1)

        parsed = parse_ingredient(sent)
        if not parsed:
            # Nothing left to tag after cleaning; skip instead of indexing
            # into an empty list.
            continue
        fields = parsed[0]
        raw_input = fields['input']

        if 'name' in fields:
            # Keep the trailing digit-free part of the name. The pattern can
            # match the empty string, so the search always succeeds.
            name = re.search(r"[^\d]*$", fields['name']).group(0)
        else:
            name = np.nan

        if 'gram' in raw_input:
            units[name] = 'gram'
        elif 'milliliters' in raw_input:
            units[name] = 'ml'
        elif 'unit' in fields:
            units[name] = fields['unit']
        else:
            units[name] = 'unit'

        # Always record a quantity for the name. When the gram pattern is not
        # found in the rebuilt phrase, fall through to the other sources
        # rather than leaving the key missing (callers index qtys by the
        # keys of units).
        gram_match = re.search(r"\d+(?=\s*g)", raw_input) if has_gram_quantity else None
        if gram_match is not None:
            qtys[name] = gram_match.group(0)
        elif 'qty' in fields:
            qtys[name] = fields['qty']
        else:
            try:
                qtys[name] = float(raw_input[:3])
            except ValueError:
                qtys[name] = np.nan

    return units, qtys

    #final_df = pd.DataFrame(list(zip(qtys, units, names)), columns = ['qty', 'unit', 'name'])

    #final_df = final_df[final_df['name'].notna()]
    #final_df = final_df[final_df['unit'].notna()]

    #final_df.loc[final_df['unit'] == 'teaspoon', 'qty'] = 0
    #final_df.loc[final_df['unit'] == 'teaspoon', 'unit'] = 'gram'
    #final_df.loc[final_df['qty'].astype(str) == 'nan', 'qty'] = 1

    #return final_df


def parse_ingredient(sent):
    """Run the tagger over one ingredient sentence and format the result.

    Returns whatever ``format_ingredient_output`` produces. If that list is
    non-empty, its first dict gets a ``'name'`` entry holding the existing
    name (empty string when absent) with surrounding full stops removed.
    """
    feature_rows = get_sentence_features(sent)
    labels = tagger.tag(feature_rows)
    parsed = format_ingredient_output(zip(sent2tokens(feature_rows), labels))

    if parsed:
        head = parsed[0]
        raw_name = head.get('name', '')
        head['name'] = raw_name.strip('.')

    return parsed

def get_sentence_features(sent):
    """Return the feature rows for a sentence, one row per token.

    Tokens come from NLTK's word tokenizer applied to the output of
    ``utils.cleanUnicodeFractions``. A row is the token followed by the
    values ``utils.getFeatures`` yields for that token, its 1-based index
    and the complete token list.
    """
    words = nltk.word_tokenize(utils.cleanUnicodeFractions(sent))

    rows = []
    for index, word in enumerate(words, start=1):
        rows.append([word] + list(utils.getFeatures(word, index, words)))
    return rows

# Sentence tokenizer that parse_recipe_ingredients uses to split the
# ingredient text into one sentence per ingredient.
tokenizer = PunktSentenceTokenizer()

In [25]:
# Parse the scraped ingredient text (first element of test_list) into the
# two dictionaries returned by parse_recipe_ingredients.
unit, qty = parse_recipe_ingredients(test_list[0])

In [28]:
# Inspect the parsed ingredient names, which are the keys of the unit dict.
unit.keys()

dict_keys(['cabbage approx ', '', 'fat onions', 'red peppers', 'yellow pepper', 'orange pepper', 'red chilli', 'approx ', 'coriander ', 'pineapple juice', 'limes', 'sea salt flakes', 'sesame oil ', 'maple syrup '])

In [30]:
# Flatten the per-ingredient dictionaries into three parallel lists.
units = []
qtys = []
names = []
for item in unit:
    # Skip the entry whose parsed name is empty. The previous test compared
    # unit.keys() with '', which is always unequal, so nothing was filtered.
    if item != '':
        names.append(item)
        # A name can be absent from qty when no quantity was recorded for
        # it; fall back to NaN instead of raising KeyError.
        qtys.append(qty.get(item, np.nan))
        units.append(unit[item])

In [225]:
# Build one row per ingredient from the parallel qtys/units/names lists.
final_df = pd.DataFrame(list(zip(qtys, units, names)), columns = ['qty', 'unit', 'raw_ingredient'])



In [609]:
# NOTE(review): calculate is not defined in this part of the notebook —
# presumably it comes from scraper_parser; confirm. The recorded run below
# ended with a ValueError on the range text '3–4'.
a, df, b, c = calculate('https://www.bbc.co.uk/food/recipes/chana_daal_with_tarka_62765')

    qty        unit                        name
0   250        gram                oz chana dal
1     1       clove                     garlic 
2     0        gram          red chilli powder 
3     0        gram                   turmeric 
4     1        unit             cinnamon stick 
5     1        unit                       onion
6   1–2  tablespoon                       ghee 
8   1–2  tablespoon                       ghee 
9     2       clove                      garlic
10    0        gram                cumin seeds 
11    0        gram              mustard seeds 
12    2        unit               red chillies 
13    1        unit                curry leaves
14    2  tablespoon            tamarind chutney
15  2.5        unit        in piece root ginger
16    1     handful  coriander dill and chervil
17    0        gram               chaat masala 
['Peas', 'Onions & Leeks', 'Chilli and pepper', 'Biscuit', 'Cinnamon', 'Onions', 'Green lentils', 'Green lentils', 'Onions & Leeks', 'Ot

ValueError: invalid literal for int() with base 10: '3–4'

In [616]:
from scraper_parser import *

In [624]:
def get_ingredients_url(url):
    """Scrape a recipe page for its ingredient text, serving size and title.

    The CSS classes looked up here are the ones present on the
    bbc.co.uk/food recipe pages this notebook is run against.

    Args:
        url: Address of the recipe page to download.

    Returns:
        A tuple ``(ingredient, servingsize, recipe_title)``:
        ``ingredient`` is the text of every ingredient list item, each
        followed by a full stop and a newline, concatenated into one string;
        ``servingsize`` is a float — the number itself, or the mean of the
        two bounds for a range such as ``'3–4'`` or ``'10-12'`` — or the
        unchanged text when it is not numeric; ``recipe_title`` is the
        heading text.

    Raises:
        AttributeError: If the serving paragraph or the heading is not
            found on the page.
        IndexError: If the serving paragraph has fewer than two words.
    """
    page = requests.get(str(url))
    soup = BeautifulSoup(page.content, 'html.parser')

    items = soup.find_all('li', class_="recipe-ingredients__list-item")
    # Terminate each item with '.' so the sentence tokenizer can split them.
    ingredient = ''.join(item.get_text() + '.\n' for item in items)

    serving_text = soup.find('p', class_="recipe-metadata__serving").get_text()
    servingsize = serving_text.split(' ')[1]
    recipe_title = soup.find('h1', class_='gel-trafalgar content-title__text').get_text()

    try:
        # Average the numeric bounds, split on a hyphen or an en dash.
        # Using the first and last *characters* broke multi-digit values
        # ('10-12' became (1 + 2) / 2).
        bounds = [float(part) for part in re.split('[-\u2013]', servingsize)]
        servingsize = (bounds[0] + bounds[-1]) / 2
    except ValueError:
        # Not numeric: hand back the raw text unchanged.
        pass

    return ingredient, servingsize, recipe_title

In [625]:
# Display the (ingredient text, serving size, recipe title) tuple scraped
# for the chana dal recipe.
get_ingredients_url('https://www.bbc.co.uk/food/recipes/chana_daal_with_tarka_62765')

('250g/9oz chana dal, soaked overnight or for at least 2 hours.\n1 garlic clove.\n1 tsp red chilli powder.\n½ tsp ground turmeric.\n1 cinnamon stick.\n1 onion, thinly sliced.\n1–2 tbsp ghee .\nsalt, to taste.\n1–2 tbsp ghee .\n2 garlic cloves, thinly sliced.\n1 tsp cumin seeds.\n1 tsp mustard seeds.\n2 long dried red chillies.\n5–6 curry leaves (fresh if possible).\n2 tbsp tamarind chutney (available in Asian shops).\n2.5cm/1in piece root ginger, peeled and very thinly sliced.\nhandful mixed fresh coriander, dill and chervil, finely chopped.\n1 tsp chaat masala.\n',
 3.5,
 'Chana dal with tarka and tamarind ')

In [621]:
cd greenr

/Users/georgesdelrieu/code/g-delrieu/greenr/greenr


In [620]:
ls

Homemade CRF.ipynb              matching_objects.pk
MANIFEST.in                     matching_objects2.pk
Makefile                        mongo_pwd.pkl
OUR CRF MODEL THAT WORKS.ipynb  [1m[36mnotebooks[m[m/
Procfile                        [1m[36mparsing_tools[m[m/
README.md                       [1m[36mraw_data[m[m/
Test mongo.ipynb                requirements.txt
[1m[36mbuild[m[m/                          [1m[36mscripts[m[m/
[1m[36mdata[m[m/                           setup.py
[1m[36mdist[m[m/                           setup.sh
error.gif                       style.css
finalized_model.pkl             tagger.pkl
[1m[36mgreenr[m[m/                         test
[1m[36mgreenr.egg-info[m[m/                [1m[36mtests[m[m/
loading.gif                     train_file
