In [1]:
import re
import json
import pandas as pd 
import numpy as np
from nltk.stem.porter import PorterStemmer
import string
from nltk import word_tokenize, pos_tag
import spacy
import os
import random
from sentence_transformers import SentenceTransformer, CrossEncoder
from ranker import TransformerRanker, CrossEncoderRanker, Classifier
from preprocessor import *
from mapper import Mapper
from display_products import DisplayProducts
import joblib
from nltk.corpus import stopwords
from pathlib import Path
stop_words = stopwords.words('english')
from spacy.training import Example
from spacy.scorer import Scorer
from spacy.matcher import Matcher
from pathlib import Path
PATH = Path("")
data_path = PATH.home()/'data/Target/data/'
model_path = PATH.home()/'data/Target/models/group4'
recipe_path = data_path/"layer1.json"
nlp = spacy.load("en_core_web_sm")

In [2]:

with open(recipe_path) as json_data:
    recipe = json.load(json_data)

In [3]:
# Set input file directory

# Get grocery product hierarchy information
group4 = pd.read_csv(os.path.join(data_path, 
                                   'group4_header.csv'),
                      sep=',', 
                      low_memory=False)

# Get scraped information for the above products
products = pd.read_csv(os.path.join(data_path,
                                    'products_group4.csv'))

# Merge scraped information into the hierarchy table
group4 = pd.merge(group4, products, 
                   how = 'left', on = 'tcin')

# Preprocess the table
group4 = preprocess_df(group4)

In [4]:
tool_indicator_regex = '(skillet|casserole|crockpot|steamer|ladle|dish|pot|sheet|tablespoon|processor|spoon|plate|whisk|griddle|mixer|grinder|bowl|oven|saucepan|foil|mortar|pestle|pitcher|bag|cup|blender|cooker|knife|glass|brush|colander|pan|fork)'
method_indicator_regex = '(boil|boiling|bake|sliced|stir|beat|roast|roasted|fry|rinse|saute|drain|strain|sift|beat|chop|slice|sliced|grate|grill|cut)'

def recipe_load(n):
    recipe_instr=[]
#     for i in range(s,n):
    title = recipe[n]['title']
    id = recipe[n]['id']
        
    for lis in recipe[n]['instructions']:
        for key, val in lis.items():   
            rem = re.sub("[\(\[].*?[\)\]]", "", val)
            if rem !='':
                recipe_instr.append(rem)
    return ' '.join(recipe_instr)

In [5]:
model = None

In [6]:
if model is not None:
    nlp1 = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp1 = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [7]:
# create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp1.pipe_names:
    ner = nlp1.create_pipe('ner')
    nlp1.add_pipe('ner', last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp1.get_pipe('ner')

In [8]:
def find_tools(recipe):
    """
    looks for any and all cooking tools apparent in the instruction text by using the tool_indicator_regex
    variable
    """
    cooking_tools = []
    start=[]
    cooking_tool_ent=[]
#     for word in set(instruction_words):
    if len(re.findall(tool_indicator_regex, recipe, flags=re.I))>0:
        match =  list(set(re.findall(tool_indicator_regex, recipe , flags=re.I)))
        for word in match:
#             print(word)
            word_n = ' ' + word
            recipe = recipe + ' '
            for match in re.finditer(word_n, recipe, flags=re.I): #to find all occurences of a word in the recipe
                if recipe[match.end()].isalpha()==False: #to remove the substrings like pan in panini
                    cooking_tools.append((match.start()+1, match.end(), word))

    
    
    matcher = Matcher(nlp.vocab)
    pattern = [{"LOWER": "dutch"}, {"LOWER": "oven"}]
    matcher.add("DutchOven", [pattern])
    pattern = [{"LOWER": "electric"}, {"LOWER": "oven"}]
    matcher.add("ElectricOven", [pattern])
    pattern = [{"LOWER": "electric"}, {"LOWER": "skillet"}]
    matcher.add("ElectricSkillet", [pattern])
    pattern = [{"LOWER": "nonstick"}, {"LOWER": "skillet"}]
    matcher.add("NonstickSkillet", [pattern])
    pattern = [{"LOWER": "nonstick"}, {"LOWER": "pan"}]
    matcher.add("NonstickPan", [pattern])
    pattern = [{"LOWER": "muffin"}, {"LOWER": "liners"}]
    matcher.add("MuffinLiners", [pattern])
    pattern = [{"LOWER": "parchment"}, {"LOWER": "paper"}]
    matcher.add("ParchmentPaper", [pattern])
    pattern = [{"LOWER": "food"}, {"LOWER": "processor"}]
    matcher.add("ParchmentPaper", [pattern])
    pattern = [{"LOWER": "loaf"}, {"LOWER": "pan"}]
    matcher.add("LoafPan", [pattern])
    pattern = [{"LOWER": "loaf"}, {"LOWER": "pans"}]
    matcher.add("LoafPans", [pattern])
    pattern = [{"LOWER": "baking"}, {"LOWER": "sheet"}]
    matcher.add("BakingSheet", [pattern])
    pattern = [{"LOWER": "frying"}, {"LOWER": "pan"}]
    matcher.add("FryingPan", [pattern])
    pattern = [{"LOWER": "electric"}, {"LOWER": "kettle"}]
    matcher.add("ElectricKettle", [pattern])
    doc = nlp(recipe)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        cooking_tools.append((span.start_char, span.end_char, span.text))
    
    for i in range(len(cooking_tools)):
        c=0
        for j in range(len(cooking_tools)):
            if i!=j and cooking_tools[i][1]==cooking_tools[j][1]:
                c+=1
                if cooking_tools[i][0]<cooking_tools[j][0]: cooking_tool_ent.append((cooking_tools[i][0],cooking_tools[i][1], 'GADGET'))
        if c==0:
            cooking_tool_ent.append((cooking_tools[i][0],cooking_tools[i][1], 'GADGET'))
    return cooking_tool_ent
   

In [9]:
def find_methods(recipe):
    """
    looks for any and all cooking methods apparent in the instruction text by using the method_indicator_regex
    variable
    """
    cooking_methods = []
    start=[]
#         for word in set(instruction_words):
    if len(re.findall(method_indicator_regex, recipe, flags=re.I))>0:
        match =  list(set(re.findall(method_indicator_regex, recipe , flags=re.I)))
        for word in match:
            word_n = ' ' + word+' '
            recipe = recipe + ' '
            for match in re.finditer(word_n, recipe):
                if recipe[match.end()-1].isalpha()==False:
                    cooking_methods.append((match.start()+1, match.end()-1, word))
#                 print(cooking_methods)
    cooking_meth_ent = [(i[0],i[1], 'METHOD') for i in cooking_methods]
    return cooking_meth_ent

In [10]:
n_train = np.random.choice(len(recipe) , 500 , replace = False)
n_valid = np.random.choice(len(recipe) , 100 , replace = False)
data_train, data_valid=[], []
reciple_in=[]
for num, n in enumerate([n_train, n_valid]):
    for i in n:
        reciple_in.append(i)
        recipe_loaded = recipe_load(i)
        recipe_lower = recipe_loaded.lower()
        recipe_trans = recipe_lower.replace('\n', ' ')
        recipe_trans = recipe_trans.replace(',', ' ')
        recipe_trans = recipe_trans.replace('-', ' ')
        recipe_trans = recipe_trans.replace('(', ' ')
        recipe_trans = recipe_trans.replace(')', ' ')
        recipe_trans = recipe_trans.replace('.', ' ')
        recipe_trans = recipe_trans.replace('@', ' ')
        recipe_trans = recipe_trans.replace(';', ' ')
        cooking_tools = find_tools(recipe_trans)
        cooking_methods = find_methods(recipe_trans)
        cook = cooking_tools + cooking_methods
        if num==0:
            data_train.append((recipe_trans, {'entities': cook}))
        else: 
            data_valid.append((recipe_trans, {'entities': cook}))
    

In [11]:
add_ents = ['GADGET','METHOD']

for ent in add_ents:
    ner.add_label(ent)

In [12]:
def evaluate(ner_model, valid_data):
    examples = []
    scorer = Scorer()
    for text, annotations in valid_data:
        doc = ner_model.make_doc(text)
        example = Example.from_dict(doc, annotations)
        example.predicted = ner_model(str(example.predicted))
        examples.append(example)
    return scorer.score(examples)

#  ents_p, the recall as ents_r and the F1 score as ents_f.

In [41]:
# data_train

In [44]:
# get names of other pipes to disable them during training
n_iter = 6
other_pipes = [pipe for pipe in nlp1.pipe_names if pipe != 'ner']
with nlp1.disable_pipes(*other_pipes):  # only train NER
    if model is None:
        optimizer = nlp1.begin_training()
    else:
        optimizer = nlp1.resume_training()
    for itn in range(n_iter):
        random.shuffle(data_train)
        losses = {}
        for batch in spacy.util.minibatch(data_train, size=50):
            for text, annotations in batch:
                try:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp1.update(
                        [example],
                         drop = 0.25, # dropout 
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as error:
                    continue
        eval_result = evaluate(nlp1, data_valid)
        print("Iteration ",itn+1, " Loss: ", losses, "Valid F1 score: ", eval_result['ents_f'])
print("Final loss: ", losses)

Iteration  1  Loss:  {'ner': 2394.257412535428} Valid F1 score:  0.9738610903659447
Iteration  2  Loss:  {'ner': 194.90777646193067} Valid F1 score:  0.994772218073189
Iteration  3  Loss:  {'ner': 106.28244375321871} Valid F1 score:  0.9925705794947993
Iteration  4  Loss:  {'ner': 93.13931543095876} Valid F1 score:  0.9947877885331348
Iteration  5  Loss:  {'ner': 60.734136390351274} Valid F1 score:  0.9970193740685545
Iteration  6  Loss:  {'ner': 75.94186795346273} Valid F1 score:  0.9962714392244595
Final loss:  {'ner': 75.94186795346273}


In [45]:
output_dir=Path(model_path/"NER_7")

if not output_dir.exists():
        output_dir.mkdir()
nlp1.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to /Users/chahaksethi/Desktop/Target/models/NER_7


In [40]:
a = 'heat the oven to 400f. cook bacon in a large skillet or dutch oven until crisp. remove bacon from skillet; reserve drippings. brown the chicken in hot bacon drippings, turning to brown all sides; drain. place chicken in a 2 1/2 to 3-quart casserole dish;sprinkle with bacon. combine onions, mushrooms, 2 tablespoons parsley, thyme and garlic in same skillet as before. cook over medium heat until thoroughly heated, stirring occasionally. stir in flour. gradually stir in wine. cook until mixture boils and thickens stirring constantly. pour over chicken and bacon in casserole. cover, bake at 400f for 40-50 minutes or until the chicken is fork tender and juices run clear. or if using a meat thermometer chicken is done when it holds at 180f for ten seconds. sprinkle with parsley.'
print(a[98: 105])
print(a[350: 357])
a = a.replace('\n', ' ')
a = a.replace(',', ' ')
a = a.replace('-', ' ')
a = a.replace('(', ' ')
a = a.replace(')', ' ')
a = a.replace('.', ' ')
a = a.replace('@', ' ')
a = a.replace(';', ' ')
a = a.lower()
instruction_words = word_tokenize(a)
ins_stop_wrds_rm = [word for word in instruction_words if word not in stop_words]
find_tools(a)


# b ="Preheat oven to 325 degrees  You will need two muffin pans that hold 12 muffins each  Lightly spray with cooking spray  In a large mixing bowl  beat together the butter and sugar  Add the vanilla and eggs one at a time  beating well after each addition  Combine the flour and baking powder together in a bowl and add to the butter  sugar  and egg mixture  Mix on low speed until well blended the batter will be very thick  Divide the batter evenly among the 24 cups  This will fill them about half way  For the topping  mix together the flour  brown sugar  and cinnamon in a bowl  Add the hard butter and cut it in to the dry ingredients until it resembles coarse crumbs you can use your hands for this  rubbing it between your fingertips  Don't make it too fine  you want it to have some body to it  Note: I use my hand held pastry blender tool  Put about 1 tablespoon of the streusel topping over each cake using up all of the topping;you will not see much of the batter  Bake in a preheated oven on two shelves for about 18 minutes  switching shelves after the first 10 minutes  They are done when a toothpick inserted in the middle comes out clean  Do not overbake;start checking them at about 16 minutes  Cool 5 minutes in the pans then remove to a cooling rack "
# print(b[275:283])
# print(b[219:228])
# print(b[1159:1169])
# print(b[973:979])

# instruction_words = word_tokenize(b)
# ins_stop_wrds_rm = [word for word in instruction_words if word not in stop_words]
# find_methods(ins_stop_wrds_rm, b)

skillet
skillet


[(9, 13, 'GADGET'),
 (62, 66, 'GADGET'),
 (249, 253, 'GADGET'),
 (45, 52, 'GADGET'),
 (98, 105, 'GADGET'),
 (350, 357, 'GADGET')]

In [73]:
import spacy
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "dutch"}, {"LOWER": "oven"}]
matcher.add("DutchOven", [pattern])
pattern = [{"LOWER": "electric"}, {"LOWER": "oven"}]
matcher.add("ElectricOven", [pattern])
pattern = [{"LOWER": "electric"}, {"LOWER": "skillet"}]
matcher.add("ElectricSkillet", [pattern])
pattern = [{"LOWER": "nonstick"}, {"LOWER": "skillet"}]
matcher.add("NonstickSkillet", [pattern])
pattern = [{"LOWER": "nonstick"}, {"LOWER": "pan"}]
matcher.add("NonstickPan", [pattern])

doc = nlp(a)
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    cooking_tools.append((span.start_char, span.end_char, span.text))

[(4552378691801231685, 13, 15), (4552378691801231685, 154, 156)]
<class 'spacy.tokens.span.Span'>
4552378691801231685 DutchOven 13 15 56 66 dutch oven
<class 'spacy.tokens.span.Span'>
4552378691801231685 DutchOven 154 156 783 793 dutch oven


In [75]:
a[783:793]

'dutch oven'