In [35]:
import json
import re
import pandas as pd 
import numpy as np
from nltk.stem.porter import PorterStemmer
import string
from nltk import word_tokenize, pos_tag
import spacy
from collections import Counter
import os
import pickle

In [32]:
from sentence_transformers import SentenceTransformer, CrossEncoder
from ranker import TransformerRanker, CrossEncoderRanker
from preprocessor import *
from mapper import Mapper
from display_products import DisplayProducts
import joblib

In [3]:
filepath = "/Users/chahaksethi/Desktop/Target/data/1m_recipe/recipe1M_layers/layer1.json"
with open(filepath) as json_data:
    recipe = json.load(json_data)

In [9]:
# Set input file directory

# ip_file_dir = "../Data/Target Data/"
ip_file_dir = "/Users/chahaksethi/Desktop/Target/target/target_recipe_project/data/"
# Get grocery product hierarchy information
group10 = pd.read_csv(os.path.join(ip_file_dir, 
                                   'group4_header.csv'),
                      sep=',', 
                      low_memory=False)

# Get scraped information for the above products
products = pd.read_csv(os.path.join(ip_file_dir,
                                    'scraped/products_group4.csv'))

# Merge scraped information into the hierarchy table
group10 = pd.merge(group10, products, 
                   how = 'left', on = 'tcin')

# Preprocess the table
group10 = preprocess_df(group10)

In [11]:
bi_encoder_name = "multi-qa-MiniLM-L6-cos-v1"
cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-4-v2"
k=3

In [36]:
def get_embeddings(path):
#Load sentences & embeddings from disc
    with open(path, "rb") as fIn:
        stored_data = pickle.load(fIn)
        stored_embeddings = stored_data['embeddings']
    return stored_embeddings
    
def get_tcin_sentence_map(path):
#Load sentences & embeddings from disc
    with open(path, "rb") as fIn:
        stored_data = pickle.load(fIn)
        df = pd.DataFrame()
        df['tcin'] = stored_data['ids']
        df['sentence'] = stored_data['sentences']
    return df

bi_encoder = SentenceTransformer(bi_encoder_name)

# Get list of preprocessed product titles
product_titles = group10['title'].str.lower().values

l1_ranker = TransformerRanker(model=bi_encoder, product_ids=group10['tcin'], max_rank=3)
with open('/Users/chahaksethi/Desktop/Target/target/target_recipe_project/data/embeddings/hier_embeddings3.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_ids = stored_data['ids']
    stored_sentences = stored_data['sentences']
    stored_embeddings = stored_data['embeddings']
df = pd.DataFrame()
df['id'] = stored_data['ids']
df['sentence'] = stored_data['sentences']
l1_ranker.load_embeddings(stored_embeddings)

cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', max_length=512)
l2_ranker = CrossEncoderRanker(bi_model=l1_ranker, 
                                cross_model=cross_encoder_model, 
                                tcin_sentence_map=get_tcin_sentence_map('/Users/chahaksethi/Desktop/Target/target/target_recipe_project/data/embeddings/hier_embeddings3.pkl'),
                                cross_rank=k,
                                bi_rank=30)

pm = Mapper(group10)
dp = DisplayProducts(ranker=l2_ranker, mapper=pm)

In [17]:
tool_indicator_regex = '(pan|skillet|dish|pot|sheet|grate|whisk|griddle|bowl|oven|saucepan|foil|mortar|pestle|pitcher|bag|cup|stick|blender|paper|knife|glass|brush|colander)'
method_indicator_regex = '(boil|bake|baking|stir|roast|fry|rinse|drain|sift|beat|fold|chop|slice|saute|grate|grill|cut)'



In [18]:
def recipe_load(s,n):
    recipe_instr=[]
    for i in range(s,n):
        title = recipe[i]['title']
        id = recipe[i]['id']
        
        for lis in recipe[i]['instructions']:
            for key, val in lis.items():   
                rem = re.sub("[\(\[].*?[\)\]]", "", val)
                if rem !='':
                    recipe_instr.append(rem)
    return ' '.join(recipe_instr)


In [20]:
def find_tools(instruction_words):
    """
    looks for any and all cooking tools apparent in the instruction text by using the tool_indicator_regex
    variable
    """
    cooking_tools = []
    for word in instruction_words:
        if re.search(tool_indicator_regex, word, flags=re.I):
#             print(word)
            cooking_tools.append(word)
        wordset = set(cooking_tools)
    return [item for item in wordset if item.istitle() or item.title() not in wordset]

def find_methods(instruction_words):
        """
        looks for any and all cooking methods apparent in the instruction text by using the method_indicator_regex
        variable
        """
        cooking_methods = []
        for word in instruction_words:
            if re.search(method_indicator_regex, word, flags=re.I):
                cooking_methods.append(word)
            if re.search('preheat', word, re.I):
                cooking_methods.append('bake')

        wordset = set(cooking_methods)
        return [item for item in wordset if item.istitle() or item.title() not in wordset]


In [21]:
instruction_words = word_tokenize(recipe_load(11000,11001))
cooking_tools = find_tools(instruction_words)
cooking_methods = find_methods(instruction_words)

In [22]:
cooking_tools

['dish', 'oven', 'bowl', 'colander', 'pot', 'oven-proof']

In [23]:
cooking_methods

['Chop', 'drained', 'boiling', 'Drain', 'Bake']

In [40]:
from sentence_transformers import util

class TransformerRanker:
    def __init__(self, model, product_ids, max_rank=100,  clf=None):
        self.model = model
        self.max_rank = max_rank
        self.product_ids = product_ids
        self.clf = clf
    
    def fit(self, documents):
        self.embeddings = self.model.encode(documents, 
                                            convert_to_tensor=True)
        
    def load_embeddings(self, embeddings):
        self.embeddings = embeddings
        
    def get_scores_ingredient(self, ingredient, max_rank=None):
        if not max_rank:
            max_rank=self.max_rank

        ingredient_embedding = self.model.encode(ingredient, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(ingredient_embedding, self.embeddings)[0]
        product_scores = dict(zip(self.product_ids, scores.numpy()))
        product_scores = sorted(product_scores.items(), 
                                key = lambda x: x[1], 
                                reverse=True)[0:100]
        if self.clf:
            tcin_list =  [product_score[0] for product_score in product_scores]
            tcin_list = self.clf.filter_by_class(ingredient, tcin_list)
            product_scores = [product_score for product_score in product_scores if product_score[0] in tcin_list]
        return product_scores[0:max_rank]
        
    def get_scores_recipe(self, ingredient_list, max_rank=None):
        recipe_scores = []
        for ingredient in ingredient_list:
            ingredient_scores = self.get_scores_ingredient(ingredient, max_rank)
            recipe_scores.append(ingredient_scores)
        return recipe_scores

    def rank_products_ingredient(self, ingredient, max_rank=None):
        product_scores = self.get_scores_ingredient(ingredient, max_rank)
        return [product_score[0] for product_score in product_scores]
    
    def rank_products_recipe(self, ingredient_list, max_rank=None):
        recipe_scores = self.get_scores_recipe(ingredient_list, max_rank)
        return [[product_score[0] for product_score in product_scores] 
                for product_scores in recipe_scores]

    # Following code is not required
    def get_scores_ingredient_custom(self, ingredient, model, tokenizer):
        import torch
        #Mean Pooling - Take attention mask into account for correct averaging
        def mean_pooling(model_output, attention_mask):
            token_embeddings = model_output[0] #First element of model_output contains all token embeddings
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            return sum_embeddings / sum_mask
        #Tokenize sentences
        encoded_input = tokenizer(ingredient, padding=True, truncation=True, max_length=128, return_tensors='pt')
        #Compute token embeddings
        with torch.no_grad():
                model_output = model.encoder(
                    input_ids=encoded_input["input_ids"], 
                    attention_mask=encoded_input["attention_mask"], 
                    return_dict=True
                )
        #Perform pooling. In this case, mean pooling
        ingredient_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
        scores = util.pytorch_cos_sim(ingredient_embedding, self.embeddings)[0]
        product_score = dict(zip(self.product_ids, scores.numpy()))
        product_score = sorted(product_score.items(), 
                                key = lambda x: x[1], 
                                reverse=True)
        return product_score[0:self.max_rank]


class CrossEncoderRanker(TransformerRanker):
    def __init__(self, bi_model, cross_model, tcin_sentence_map, cross_rank=10, 
                 bi_rank=50):
        self.bi_model = bi_model
        self.cross_model = cross_model
        self.cross_rank = cross_rank
        self.bi_rank = bi_rank
        self.mapper = tcin_sentence_map

    def get_scores_ingredient(self, ingredient, max_rank=None):
        if not max_rank:
            max_rank = self.cross_rank
        if isinstance(ingredient, list):
            ingredient = ingredient[0]
        tcins = self.bi_model.rank_products_ingredient(ingredient, max_rank=self.bi_rank)
        sentences = []
        for tcin in tcins:
            print(tcin)
            print(self.mapper[self.mapper['tcin'] == tcin]['sentence'].values)
            sentences.append(self.mapper[self.mapper['tcin'] == tcin]['sentence'].values[0])
        pairs = [(ingredient, sentence.lower()) for sentence in sentences]
        scores = self.cross_model.predict(pairs)
        product_score = dict(zip(tcins, scores))
        product_score = sorted(product_score.items(), 
                                key = lambda x: x[1], 
                                reverse=True)
        return product_score[0:max_rank]



In [41]:
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', max_length=512)
l2_ranker = CrossEncoderRanker(bi_model=l1_ranker, 
                                cross_model=cross_encoder_model, 
                                tcin_sentence_map=get_tcin_sentence_map('/Users/chahaksethi/Desktop/Target/target/target_recipe_project/data/embeddings/hier_embeddings3.pkl'),
                                cross_rank=k,
                                bi_rank=30)

# pm = Mapper(group10)
# dp = DisplayProducts(ranker=l2_ranker, mapper=pm)

cooking__tools = preprocess(cooking_tools)

# Ranked list of product tcin matches for each ingredient - Returns a list of lists 
ranked_match = l2_ranker.rank_products_recipe(cooking__tools, 3)

[81994423, 84146438, 83344262, 84292397, 79310833, 84074794, 84650014, 78298231, 76826798, 79318304, 53623409, 76147615, 84184367, 83586751, 76167584, 83290191, 54314565, 75874459, 76150596, 83609142, 76148848, 51317539, 82293186, 81994389, 78833702, 76071368, 84292380, 84172714, 83046727, 80184757]
81994423
[]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [101]:
text = recipe_load(100)
instruction_words = word_tokenize(text)
nlp = spacy.load('en_core_web_sm')

In [115]:
ins_lower = [word.lower().strip() for word in instruction_words]
ins_lower = [word for word in ins_lower if len(word) > 2]
doc = nlp(' '.join(ins_lower))
verbs = [token.text for token in doc if token.pos_ == "VERB"]
count = Counter(verbs)
count_sort = dict(sorted(count.items(), key=lambda item: item[1], reverse=True))

In [147]:
count_sort

{'add': 135,
 'stir': 41,
 'combine': 38,
 'set': 31,
 'bake': 30,
 'remaining': 28,
 'cut': 28,
 'mix': 27,
 'serve': 27,
 'let': 26,
 'beat': 26,
 'cook': 22,
 'preheat': 20,
 'put': 20,
 'sprinkle': 19,
 'serving': 18,
 'remove': 18,
 'make': 17,
 'using': 17,
 'baking': 16,
 'pour': 16,
 'bring': 14,
 'stirring': 13,
 'browned': 12,
 'toss': 11,
 'cover': 11,
 'place': 11,
 'spread': 11,
 'done': 11,
 'turn': 11,
 'use': 10,
 'chopped': 10,
 'are': 10,
 'filling': 10,
 'uncovered': 9,
 'continue': 9,
 'stand': 9,
 'allow': 9,
 'roll': 9,
 'keep': 9,
 'melt': 8,
 'refrigerate': 8,
 'inserted': 8,
 'boil': 8,
 'squash': 8,
 'saute': 8,
 'greased': 7,
 'cooled': 7,
 'prepared': 7,
 'combined': 7,
 'comes': 7,
 'take': 7,
 'spoon': 7,
 'cooked': 7,
 'melted': 6,
 'dressing': 6,
 'lined': 6,
 'fold': 6,
 'turning': 6,
 'reduce': 6,
 'beating': 6,
 'return': 6,
 'enjoy': 5,
 'peel': 5,
 'rack': 5,
 'transfer': 5,
 'desired': 5,
 'cooking': 5,
 'fill': 5,
 'granulated': 5,
 'have': 5,
 'w

In [None]:
# top_100_ingredients.to_csv('data/top_100_ingredients.csv', index_label = 'ingredients', header=['recipe_counts'])
# pd.read_csv('data/top_100_ingredients.csv')

In [None]:
nlp = spacy.load('en')

def merge_phrases(matcher, doc, i, matches):
    '''
    Merge a phrase. We have to be careful here because we'll change the token indices.
    To avoid problems, merge all the phrases once we're called on the last match.
    '''
    if i != len(matches)-1:
        return None
    spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
    for ent_id, label, span in spans:
        span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])


matcher = spacy.matcher.Matcher(nlp.vocab)
matcher.add(entity_key='1', label='ARTIST', attrs={}, specs=[[{spacy.attrs.ORTH: 'Rolling'}, {spacy.attrs.ORTH: 'Stones'}]], on_match=merge_phrases)
matcher.add(entity_key='2', label='ARTIST', attrs={}, specs=[[{spacy.attrs.ORTH: 'Muse'}]], on_match=merge_phrases)
matcher.add(entity_key='3', label='ARTIST', attrs={}, specs=[[{spacy.attrs.ORTH: 'Arctic'}, {spacy.attrs.ORTH: 'Monkeys'}]], on_match=merge_phrases)
doc = nlp(u'The Rolling Stones are an English rock band formed in London in 1962. The first settled line-up consisted of Brian Jones, Ian Stewart, Mick Jagger, Keith Richards, Bill Wyman and Charlie Watts')
matcher(doc)
for ent in doc.ents:
  print(ent)

In [None]:
ner_model = spacy.blank('en')  # create blank Language class
print("Created blank 'en' model")

In [None]:
TRAIN_DATA = [
    (x, {
        'entities': [(8, 28, 'TOOL'), (74, 89, 'METHOD')]
    })