In [1]:
import requests
import json
import re
import os
import pandas as pd 
import numpy as np
import random
import pickle
from recipe_loading import *
from qty_mapping import *
from sentence_transformers import SentenceTransformer, CrossEncoder
from ranker import *
from preprocessor import *
from mapper import Mapper
from display_products import DisplayProducts
import joblib

The steps followed here are:

1) Load the 1M recipe dataset (`layer1.json`)
2) Standardise the units
3) Load density info
4) Calculate the required amount of each ingredient

In [2]:
from pathlib import Path

# `PATH` is kept only so that any cell still referring to it keeps working.
# `home()` is a classmethod, so it is called on `Path` itself rather than
# through an empty-path instance.
PATH = Path("")
data_path = Path.home()/'data/Target/data/'
model_path = Path.home()/'data/Target/models/group10'
recipe_path = data_path/"layer1.json"
recipe_image_path = data_path/"layer2.json"
op_file_path = data_path/"ing_density.csv"

# Load recipes.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale (assumes the file is UTF-8, the JSON default).
with open(recipe_path, encoding="utf-8") as json_data:
    recipe = json.load(json_data)

In [3]:
# Load recipe images.
# Explicit encoding: without it `open` falls back to the locale's default,
# which is not UTF-8 on every platform (assumes the file is UTF-8).
with open(recipe_image_path, encoding="utf-8") as json_data:
    recipe_images = json.load(json_data)

In [4]:
# Scraped per-product information, joined onto the hierarchy table below.
products = pd.read_csv(data_path/'products.csv')

# Build the product table in one pipeline:
#   hierarchy export (tab-separated) -> left join of the scraped info on tcin
#   -> project preprocessing -> NA filling.
group10 = (
    pd.read_csv(data_path/'group10_header.csv', sep='\t', low_memory=False)
    .merge(products, how='left', on='tcin')
    .pipe(preprocess_df)
    .pipe(fillNa)
)

In [5]:
def get_embeddings(path):
    """Unpickle the file at ``path`` and return the stored object unchanged.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only use it
    on files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
    
def get_tcin_sentence_map(path):
    """Return a two-column frame (``tcin``, ``sentence``) from a pickled dict.

    The pickle at ``path`` must hold a mapping with the keys ``'ids'`` and
    ``'sentences'``; they become the ``tcin`` and ``sentence`` columns, in
    that order.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    # Columns are attached one after the other to an empty frame.
    return pd.DataFrame().assign(tcin=payload['ids'],
                                 sentence=payload['sentences'])


In [6]:
# Number of products kept per ingredient: passed below as max_rank / cross_rank
# of the rankers, as the last cut-off of the ranker pipeline, and to
# match_ranked_ingredients.
k=3

# Bi encoder: sentence-embedding model shared by both TransformerRanker instances
multi_lm_embedder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross encoder: shared by both CrossEncoderRanker instances
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-4-v2', max_length=512)

In [7]:
# Classification models
# One fine-tuned CrossEncoder per hierarchy level (class, item type, subclass),
# each wrapped in a Classifier together with the product mapper, the name of
# the column it predicts and a threshold.
# NOTE(review): `Classifier` is not imported by name in this notebook; it
# presumably comes from one of the star imports (ranker?) - confirm.
pm = Mapper(group10)
# Model directories under model_path
clf_name_class_name = model_path/'nli-classification-class_name'
clf_name_item_type_name = model_path/'nli-classification-item_type_name'
clf_name_subclass_name = model_path/'nli-classification-subclass_name'
# NOTE(review): these are pathlib.Path objects, not strings - confirm that the
# installed CrossEncoder accepts them.
base_clf_class_name = CrossEncoder(clf_name_class_name)
base_clf_item_type_name = CrossEncoder(clf_name_item_type_name)
base_clf_subclass_name = CrossEncoder(clf_name_subclass_name)
# Thresholds differ per level (0.98 for class, 0.99 for the other two).
clf_class_name = Classifier(base_clf_class_name, pm, 'class_name', threshold=0.98)
clf_item_type_name = Classifier(base_clf_item_type_name, pm, 'item_type_name', threshold=0.99)
clf_subclass_name = Classifier(base_clf_subclass_name, pm, 'subclass_name', threshold=0.99)

In [8]:
# Precomputed product embeddings, unpickled once and handed to both
# TransformerRanker instances via load_embeddings().
multi_lm_embeddings = get_embeddings(data_path/'hier_embeddings.pkl')

In [9]:
# tcins of all products whose 'image' field is not null; every ranker below is
# given this list as `filtered_products`.
filtered_products = list(group10[~pd.isnull(group10['image'])]['tcin'].values)

# Bi encoder without classifier
multi_lm = TransformerRanker(model=multi_lm_embedder, max_rank=k, 
                            filtered_products=filtered_products)
multi_lm.load_embeddings(multi_lm_embeddings)

# Bi encoder with classifier (the three hierarchy classifiers are passed as `clf`)
multi_lm_clf = TransformerRanker(model=multi_lm_embedder,
                                max_rank=k, clf=[clf_class_name, 
                                                clf_item_type_name,
                                                clf_subclass_name],
                                filtered_products=filtered_products)
multi_lm_clf.load_embeddings(multi_lm_embeddings)

# Cross encoder with classifier
# NOTE(review): get_tcin_sentence_map() unpickles hier_embeddings.pkl again,
# here and once more for cr_multi_lm below - the same file is also read into
# multi_lm_embeddings. Sharing one frame would avoid the repeated loads if
# CrossEncoderRanker does not mutate it - confirm.
cr_multi_lm_clf = CrossEncoderRanker(bi_model=multi_lm_clf, 
                        cross_model=cross_encoder_model, 
                        tcin_sentence_map=get_tcin_sentence_map(data_path/'hier_embeddings.pkl'),
                        cross_rank=k,
                        bi_rank=30,
                        mapper=pm,
                        weights=True)  

# Cross encoder without classifier (same settings, plain bi encoder underneath)
cr_multi_lm = CrossEncoderRanker(bi_model=multi_lm, 
                        cross_model=cross_encoder_model, 
                        tcin_sentence_map=get_tcin_sentence_map(data_path/'hier_embeddings.pkl'),
                        cross_rank=k,
                        bi_rank=30,
                        mapper=pm,
                        weights=True)  

In [10]:
# Load BM25 (For ingredients)
# Stored joblib artefact; used as the first member of the RankerCombination.
# NOTE(review): the file name suggests it was fitted without product
# descriptions - confirm.
bm25 = joblib.load(model_path/'bm25_no_description')

In [11]:
# Stage 1: combine BM25 with the classifier-filtered bi encoder.
# NOTE(review): [0.4, 0.6] are presumably the per-ranker weights, in the same
# order as the ranker list - confirm against RankerCombination.
rc = RankerCombination([bm25, multi_lm_clf], [0.4, 0.6], max_rank=10)
# Stage 2: feed the combination into the cross-encoder ranker; the second list
# ([10, k]) presumably holds the per-stage cut-offs - confirm.
l2_ranker = RankerPipeline([rc, cr_multi_lm], [10, k])

In [12]:
# Get list of preprocessed product titles
# NOTE(review): product_titles is not used by any other cell shown in this notebook.
product_titles = group10['title'].str.lower().values
# Rebinds `pm` to a new Mapper over the same table; the classifiers and
# cross-encoder rankers created earlier keep the instance they were given.
pm = Mapper(group10)
dp = DisplayProducts(ranker=l2_ranker, mapper=pm)

In [13]:
def recipe_load(n, recipes=None):
    """Parse the ingredient lines of the first ``n`` recipes into columns.

    Parameters
    ----------
    n : int
        Number of recipes to read from the start of the list. Values larger
        than the list are clamped to its length.
    recipes : list of dict, optional
        Recipes to parse; each item needs an ``'ingredients'`` list of dicts
        whose values are the raw ingredient strings. Defaults to the
        notebook-level ``recipe`` list.

    Returns
    -------
    dict
        Four equally long lists under the keys ``'id_recipe'`` (position of
        the recipe in the list, not its ``'id'`` field), ``'ingredient'``,
        ``'unit'`` and ``'quantity'``. Quantities are left as text (for
        example ``'6'``, ``'1/2'``, ``'1-1/2'``) or ``np.nan`` when none was
        found; units are the raw regex match (including surrounding spaces),
        ``'count'`` or ``''``.
    """
    if recipes is None:
        # Default to the notebook-level list loaded from layer1.json.
        recipes = recipe

    # Loop-invariant, so it is defined once instead of once per recipe.
    unit_regex = r" ounces | ounce | oz | teaspoon | grams | teaspoons | cups | cup | tablespoon | tablespoons | tbsp | tsp | can | lb | pound | count | ml | pinch | pounds "

    dict_ingredients = {'id_recipe': [],
                        'ingredient': [],
                        'unit': [],
                        'quantity': []
                        }

    def add_row(recipe_idx, ingredient_text, unit, quantity):
        # One parsed ingredient = exactly one entry in each of the four lists.
        # Only the text before the first comma is kept as the ingredient name.
        dict_ingredients['id_recipe'].append(recipe_idx)
        dict_ingredients['ingredient'].append(ingredient_text.split(',')[0].strip())
        dict_ingredients['unit'].append(unit)
        dict_ingredients['quantity'].append(quantity)

    for idx in range(min(n, len(recipes))):
        for entry in recipes[idx]['ingredients']:
            for val in entry.values():
                # Drop bracketed remarks, expand the "c." abbreviation, strip
                # remaining dots and normalise two plurals.
                rem = re.sub(r"[\(\[].*?[\)\]]", "", val)
                # The dot is escaped: an unescaped one also rewrote " cm ".
                rem = re.sub(r' c\. ', ' cup ', rem)
                rem = re.sub(r"[.]", "", rem)
                rem = re.sub("packages", "count", rem)
                rem = re.sub("cloves", "clove", rem)

                # Alternatives separated by ' or ': keep the first one that
                # carries a unit (the whole line is kept if none does).
                if ' or ' in rem:
                    for alternative in rem.split(' or '):
                        if re.search(unit_regex, alternative, flags=re.I):
                            rem = alternative
                            break

                if rem == '':
                    continue

                units = re.findall(unit_regex, rem, flags=re.I)

                # --- No unit in the line --------------------------------
                if not units:
                    tokens = rem.split(' ')
                    numbers = re.findall('[0-9/]+', tokens[0])
                    if numbers:
                        # Leading number without unit is treated as a count.
                        add_row(idx, ' '.join(tokens[1:]), 'count', numbers[-1])
                    else:
                        add_row(idx, ' '.join(tokens), '', np.nan)
                    continue

                # --- Unit found -----------------------------------------
                before_unit, after_unit = re.split(unit_regex, rem, flags=re.I)[:2]
                unit = units[0]
                ingredient_after_unit = after_unit.split(' or ')[0]

                qty = before_unit.strip()
                # "x to y" keeps y, "about x" keeps x.
                if 'to' in qty:
                    qty = re.split('to', qty, flags=re.I)[1].strip()
                elif 'about' in qty:
                    qty = re.split('about', qty, flags=re.I)[1].strip()
                numbers = re.findall('[0-9/]+', qty)

                if len(numbers) == 1:
                    add_row(idx, ingredient_after_unit, unit, numbers[0])

                elif len(numbers) > 1:
                    if '/' in qty:
                        # Mixed number such as "1 1/2" -> "1-1/2".
                        add_row(idx, ingredient_after_unit, unit,
                                numbers[0] + '-' + numbers[1])
                    elif len(numbers[1]) > 1:
                        if unit.strip().lower() in ['grams', 'gram']:
                            # Numeric maximum; a plain max() on the strings
                            # would compare them lexicographically.
                            quantity = max(numbers, key=int)
                        else:
                            # Two digits with a missing slash: "1 12" -> "1-1/2".
                            quantity = numbers[0] + '-' + numbers[1][0] + '/' + numbers[1][1]
                        add_row(idx, ingredient_after_unit, unit, quantity)
                    elif int(numbers[1]) > 1:
                        add_row(idx, ingredient_after_unit, unit,
                                max(numbers, key=int))
                    else:
                        add_row(idx, ' '.join(qty.split(' ')[1:]), 'count', numbers[0])

                else:
                    # Unit but no quantity (for example "generous pinch of
                    # salt"): the name is the text after the unit. Joining the
                    # characters of `qty` produced "g e n e r o u s" here.
                    add_row(idx, ingredient_after_unit, '', np.nan)

    return dict_ingredients
def convert_fraction(utf):
    """Turn a textual quantity into a number where a fraction is recognised.

    Handled shapes (after collapsing ``//`` to ``/``):

    * ``'a/b'``     -> ``a / b`` (``'/b'`` -> ``b``)
    * ``'w-a/b'``   -> ``w + a / b``; the leading part may itself be ``'x/y'``
    * ``'w-/ab'``   -> ``w + a / b`` (slash in the wrong place)
    * ``'x/y-n'``   -> ``n`` when ``n`` is larger than ``x / y``
    * ``'a-b'``     -> ``a / b``

    Parameters
    ----------
    utf : str or any
        Quantity text. Non-string input (``np.nan`` or an already converted
        number) is returned unchanged, so the function can be re-applied.

    Returns
    -------
    int, float or str
        The parsed number; ``np.nan`` for anything malformed (empty or zero
        denominator, missing numerator, ambiguous pair); the input string
        itself when it contains neither ``/`` nor ``-``.
    """
    if not isinstance(utf, str):
        return utf

    utf = re.sub("//", "/", utf)
    has_slash = '/' in utf
    has_dash = '-' in utf

    try:
        if has_slash and has_dash:
            parts = utf.split('-')
            leading = parts[0].strip()
            if '/' in leading:
                g = leading.split('/')
                if g[1] == '':
                    return np.nan
                g_2 = int(g[0]) / int(g[1])
            else:
                g_2 = int(leading)

            d = parts[1].strip().split('/')
            if len(d) == 1:
                # Second part is a whole number: use it when it is the larger
                # value. Otherwise (for example '21/2-3') the pair is
                # ambiguous; it used to raise IndexError and is now NaN.
                if g_2 < int(d[0]):
                    return int(d[0])
                return np.nan
            if len(d[1]) > 2 or d[1] == '':
                return np.nan
            if d[0] == '':
                # '/12' -> numerator '1', denominator '2'
                d[0] = d[1][0:1]
                d[1] = d[1][-1]
            if int(d[1]) == 0:
                return np.nan
            return g_2 + int(d[0]) / int(d[1])

        if has_slash:
            d = utf.split('/')
            if d[1] == '' or int(d[1]) == 0:
                return np.nan
            if d[0] == '':
                return int(d[1])
            return int(d[0]) / int(d[1])

        if has_dash:
            # NOTE(review): 'a-b' is divided like a fraction, not read as a
            # range - kept as is, confirm this is intended.
            d = utf.split('-')
            if d[1] == '' or int(d[1]) == 0:
                return np.nan
            return int(d[0]) / int(d[1])

    except (ValueError, IndexError, ZeroDivisionError):
        # Non-numeric or missing pieces: report as unparseable rather than
        # aborting a whole DataFrame.apply().
        return np.nan

    return utf

In [28]:
# recipe[38]

In [52]:
# Parse the first 100,000 recipes into column lists, then tabulate them
# (one row per parsed ingredient line).
dict_ingredients = recipe_load(n=100000)
df_combined_ing = pd.DataFrame(dict_ingredients)

In [53]:
df_combined_ing.head(2)

Unnamed: 0,id_recipe,ingredient,unit,quantity
0,0,penne,ounces,6
1,0,Beechers Flagship Cheese Sauce,cups,2


In [54]:
# The malformed quantity string '21/2-3' is excluded before conversion.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write to a slice of the unfiltered table
# (pandas chained-assignment hazard / SettingWithCopyWarning).
df_combined_ing = df_combined_ing[df_combined_ing['quantity'] != '21/2-3'].copy()
df_combined_ing['quantity'] = df_combined_ing['quantity'].apply(convert_fraction)

In [55]:
# Ingredient under study in the rest of the notebook.
ing = 'butter'

# Normalise the names (lower-case, trimmed) on a new frame instead of writing
# into the existing one column by column.
df_combined_ing = df_combined_ing.assign(
    ingredient=df_combined_ing['ingredient'].str.lower().str.strip()
)

# Keep only the rows for `ing`. The explicit copy matters: later steps add
# columns to this frame in place, which on a plain boolean-mask slice is a
# chained assignment.
df_combined_ing = df_combined_ing[df_combined_ing['ingredient'] == ing].copy()

In [56]:
df_combined_ing.shape

(18958, 4)

In [57]:
# df_combined_ing[df_combined_ing['unit']=='']

In [58]:
# Quantity mapper built from the ingredient density file and the product table.
qty = Qty_normal_map(op_file_path= op_file_path, data=group10)

# Normalise units.
# NOTE(review): a later cell reads 'normalized_unit' from df_combined_ing, so
# this call appears to add its columns to the frame it is given in place as
# well as returning it - confirm against Qty_normal_map.normalize_units.
combined_ingredient_df=qty.normalize_units(df_combined_ing)

# Using the density info, calculate the required ounces for each ingredient
# (the result carries a 'req_oz' column).
final_df = qty.req_oz_recipe(combined_ingredient_df)

  combined_ingredient_df['standard_vol'],combined_ingredient_df['standard_weight_gm'],\


In [59]:
df_combined_ing[(df_combined_ing['quantity']>10) & ((df_combined_ing['normalized_unit']=='lb') | (df_combined_ing['normalized_unit']=='cup'))]

Unnamed: 0,id_recipe,ingredient,unit,quantity,normalized_unit,Volume_in_ml,tuple,standard_vol,standard_weight_gm,standard_unit
33,3,butter,cup,14.0,cup,3150.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
62,7,butter,cup,12.0,cup,2700.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
111,13,butter,cup,14.0,cup,3150.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
362,38,butter,lb,14.5,lb,6960.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
458,49,butter,cup,12.0,cup,2700.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
...,...,...,...,...,...,...,...,...,...,...
928927,99911,butter,cup,13.0,cup,2925.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
929117,99933,butter,cup,12.0,cup,2700.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
929126,99934,butter,cup,12.0,cup,2700.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon
929537,99979,butter,cup,12.0,cup,2700.0,"(1.0, 16.0, tablespoon)",1.0,16.0,tablespoon


In [60]:
final_df.head(3)

Unnamed: 0,id_recipe,ingredient,unit,quantity,normalized_unit,Volume_in_ml,standard_vol,standard_weight_gm,standard_unit,req_oz
33,3,butter,cup,14.0,cup,3150.0,1.0,16.0,tablespoon,118.519
62,7,butter,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587
111,13,butter,cup,14.0,cup,3150.0,1.0,16.0,tablespoon,118.519


In [61]:
final_df.shape

(18958, 10)

In [62]:
# Preprocess recipe ingredients
final_df['ingredient'] = preprocess(final_df['ingredient'])
recipe_ingredients=[]
recipe_ingredients.append(list(final_df['ingredient'].values)[0])

# Ranked list of product tcin matches for each ingredient - Returns a list of lists 
ranked_match = l2_ranker.rank_products_recipe(recipe_ingredients)

  scores = torch.nn.functional.softmax(torch.tensor(scores)).numpy()


In [63]:

    def match_ranked_ingredients(k, ranked_match, final_df, recipe_ingredients, products=None):
        """Attach product details to the top-``k`` ranked tcins of each ingredient.

        Parameters
        ----------
        k : int
            Number of ranked products to keep per ingredient.
        ranked_match : list of list
            ``ranked_match[i]`` holds the tcins for ``recipe_ingredients[i]``,
            best first. Lists shorter than ``k`` are accepted.
        final_df : pandas.DataFrame
            Ingredient table with an ``'ingredient'`` column; it is left-joined
            onto the product rows.
        recipe_ingredients : list of str
            Ingredient names, aligned with ``ranked_match``.
        products : pandas.DataFrame, optional
            Product table to look the tcins up in. Defaults to the
            notebook-level ``group10``.

        Returns
        -------
        pandas.DataFrame
            One row per (ingredient, ranked product found in ``products``),
            ordered by ingredient and rank. It holds the product columns,
            ``'rank'`` (float, 1 = best), ``'ingredient'`` and the columns of
            ``final_df``. tcins missing from ``products`` are skipped; a tcin
            listed twice keeps its best rank.
        """
        if products is None:
            # Default to the notebook-level product table.
            products = group10

        product_cols = ['title', 'tcin', 'short_desc', 'price',
                        'net_content_quantity_unit_of_measure', 'net_content_quantity_value',
                        'package_weight_unit_of_measure', 'package_weight']
        catalogue = products[product_cols]

        per_ingredient = []
        for i, tcins in enumerate(ranked_match):
            ing = recipe_ingredients[i]

            # tcin -> rank for the first k entries (slicing tolerates short lists).
            rank_of = {}
            for position, tcin in enumerate(list(tcins)[:k], start=1):
                rank_of.setdefault(tcin, position)

            # One row per ranked tcin; a single vectorised lookup replaces the
            # nested iterrows() scan.
            matched = (catalogue[catalogue['tcin'].isin(list(rank_of))]
                       .drop_duplicates(subset='tcin')
                       .copy())
            if matched.empty:
                continue
            matched['rank'] = matched['tcin'].map(rank_of).astype(float)
            matched['ingredient'] = ing
            per_ingredient.append(matched.sort_values('rank'))

        # Concatenate once instead of growing a frame inside the loop.
        if per_ingredient:
            final_rslt_df = pd.concat(per_ingredient, ignore_index=True)
        else:
            # Nothing matched: keep the expected columns so the merge still works.
            final_rslt_df = pd.DataFrame(columns=product_cols + ['rank', 'ingredient'])

        join_df = pd.merge(final_rslt_df, final_df, how = 'left', on = 'ingredient')
        return join_df

In [64]:
# #matching target database with recipe 
join_df = match_ranked_ingredients(k, ranked_match , final_df, recipe_ingredients)
join_df = join_df[join_df['rank']==1]
# #calculating recommended quantity
rec_df = qty.recommended_quantity(join_df)
# df = rec_df.sort_values(by=['id_recipe', 'recommended_qty', 'price']).copy()

In [65]:
rec_df[(rec_df['recommended_qty']> 1) & (rec_df['quantity']>10)]

Unnamed: 0,title,tcin,short_desc,price,net_content_quantity_unit_of_measure,net_content_quantity_value,package_weight_unit_of_measure,package_weight,rank,ingredient,...,unit,quantity,normalized_unit,Volume_in_ml,standard_vol,standard_weight_gm,standard_unit,req_oz,product_qty_oz_ct,recommended_qty
0,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,14.0,cup,3150.0,1.0,16.0,tablespoon,118.519,16.0,8.0
1,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587,16.0,7.0
2,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,14.0,cup,3150.0,1.0,16.0,tablespoon,118.519,16.0,8.0
7,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,lb,14.5,lb,6960.0,1.0,16.0,tablespoon,232.000,16.0,15.0
10,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587,16.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18937,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,13.0,cup,2925.0,1.0,16.0,tablespoon,110.053,16.0,7.0
18942,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587,16.0,7.0
18943,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587,16.0,7.0
18955,Salted Butter - 1lb - Good & Gather™,80796339,GG BUTTER SALTED BUTTER 1LB,3.89,OUNCE,16.0,POUND,1.0,1.0,butter,...,cup,12.0,cup,2700.0,1.0,16.0,tablespoon,101.587,16.0,7.0


In [66]:
# salt - Out of 32506 recipes - 9338 have incorrect qts due to recipe text fault - 63 recommend wrong
# - 4116 mentions only salt so 1 packet is recommended 
# df_combined_ing[(df_combined_ing['quantity']>10) & (df_combined_ing['normalized_unit']=='cup')]

# sugar - Out of 19983 recipes - 3510 have incorrect qts due to recipe text fault 
# 209 mentions only sugar so 1 packet is recommended
# df_combined_ing[(df_combined_ing['quantity']>10) & (df_combined_ing['normalized_unit']=='cup')]

# butter - Out of 18958 recipes - 4083 have incorrect qts due to recipe text fault - 367 mentions only butter so 1 packet is recommended
# df_combined_ing[(df_combined_ing['quantity']>10) & ((df_combined_ing['normalized_unit']=='lb') | (df_combined_ing['normalized_unit']=='cup'))]

In [67]:
# Summary per ingredient: recipes that mention it and how many of those ended
# up with a correct quantity (counts taken from the manual checks noted above).
dd = pd.DataFrame({
    'ingredient': ['salt', 'sugar', 'butter'],
    'total_recipes': [32506, 19983, 18958],
    'correct_qty': [32443, 16473, 14875],
})
# Share of recipes with a correct quantity, in percent.
dd['perc_correct'] = dd['correct_qty'].div(dd['total_recipes']).mul(100)

In [68]:
dd

Unnamed: 0,ingredient,total_recipes,correct_qty,perc_correct
0,salt,32506,32443,99.80619
1,sugar,19983,16473,82.43507
2,butter,18958,14875,78.462918


In [51]:
# 32506-9338
19983-3510

16473

In [23]:
#fine

#4567
#7609
#32659
#4623
#17864
#7124
#87364

# Both
# 82314
# 24513

In [None]:
# problems

# 83468 - only first matches correct
# 83642 - no dairy free
# 7362 - sliced pepperoni
# 21312 - 