In [1]:
import requests
import json
import re
import pandas as pd 
import numpy as np

The steps followed here are:

1) Load 1m recipe
2) Standardise the units
3) Load density info
4) Calculate the required amount of each ingredient

In [2]:
from zipfile import ZipFile
with ZipFile('../../data/layer1.zip') as myzip:
    with myzip.open(myzip.namelist()[0]) as myfile:
        recipe_str = myfile.read()
recipe = json.loads(recipe_str)

In [3]:
# filepath = "/Users/chahaksethi/Desktop/Target/data/1m_recipe/recipe1M_layers/layer1.json"
# op_file_path = "./Data/ing_desnsity.csv"
# with open(filepath) as json_data:
#     recipe = json.load(json_data)

In [4]:
def recipe_load(n):
    dict_ingredients = { 'ingredient' :[],
                    'unit': [],
                    'quantity':[]   
    }
    ingredients_list=[]
    for i in range(0,n):
        title = recipe[i]['title']
        id = recipe[i]['id']
        
        for lis in recipe[i]['ingredients']:
            for key, val in lis.items():   
                ingredients_list.append(val)
                rem = re.sub("[\(\[].*?[\)\]]", "", val)
                rem = re.sub(' c. ', ' cup ', rem)
                rem = re.sub("[.]", "", rem)
                if rem !='':
                    u =re.findall(r" ounces | ounce | teaspoon | cups | cup | tablespoon | tbsp | tsp | can ", rem)
                    if len(u) == 0:
                        qty = re.split(' ', rem)
                        qty_list = re.findall('[0-9/]+', qty[0])
                        if len(qty_list) !=0:
                            
                            dict_ingredients['quantity'].append(qty_list[-1])
                            dict_ingredients['unit'].append('count')
                            j = ' '.join(i for i in qty[1:])
                            dict_ingredients['ingredient'].append(j.split(',')[0].strip())
                        else:  
                            dict_ingredients['quantity'].append(np.nan)
                            dict_ingredients['unit'].append('')
                            j= ' '.join(i for i in qty)
                            dict_ingredients['ingredient'].append(j.split(',')[0].strip())
                        
                    else:
                        qty = re.split(r" ounces | ounce | teaspoon | cups | cup | tablespoon | tbsp | tsp | can ", rem)[0].strip()  
                        qty_list = re.findall('[0-9/]+', qty)
                        
                        if len(qty_list) !=0:
                            dict_ingredients['unit'].append(u[0])
                            dict_ingredients['quantity'].append(qty_list[-1])
                            j = re.split(r"ounces | ounce | teaspoon | cups | cup | tablespoon | tbsp | tsp | can ", rem)[1].strip()
                            dict_ingredients['ingredient'].append(j.split(',')[0].strip())
                        else:
                            dict_ingredients['quantity'].append(np.nan)
                            dict_ingredients['unit'].append('')
                            j=' '.join(i for i in qty)
                            dict_ingredients['ingredient'].append(j.split(',')[0].strip())
    
    return dict_ingredients

def convert_fraction(utf):
    if utf is np.nan:
        return utf
    pattern = r'/'
    if '/' in re.findall(pattern, utf) :
        d = re.split(pattern, utf)
        number = int(d[0])/ int(d[1]) 
        return number

    return utf


In [5]:

unit_abbreviation = { 'tbsp' : ["tablespoon"],
                      'tsp' : ['teaspoon'],
                     'ml' : ['milliliter'],
                     'cup' : ['cups'],
                     'oz' : ['ounces']
    
}

In [6]:
#Normalizes quantity required
def normalize_units(combined_ingredient_df):
    normalized_units = list()
    m_list=[]
    for ingredient in combined_ingredient_df.iterrows():
        unit = ingredient[1][1].strip()
        normalized_unit = ''
        for key, val in unit_abbreviation.items():
            if unit in val:
                normalized_unit = key
                if normalized_unit == 'cup':
                     m = 225
                elif normalized_unit == 'tsp':
                     m = 5
                elif normalized_unit == 'tbsp':
                     m = 15
                elif normalized_unit == 'ml':
                     m = 1
                elif normalized_unit == 'oz':
                     m = 30
        if normalized_unit == '':
            normalized_unit = unit
            m = 0
        normalized_units.append(normalized_unit)
        m_list.append(m)
    combined_ingredient_df['quantity']= combined_ingredient_df['quantity'].astype(float)
    combined_ingredient_df['normalized_unit'] = normalized_units
    combined_ingredient_df['Volume_in_ml'] = combined_ingredient_df['quantity']*m_list
    return combined_ingredient_df
def search_density(ingredient):
    df = pd.read_csv(op_file_path)
    ing = (df['ingredient']).tolist()
    for n,i in enumerate(ing):
        if ingredient.lower() in i.lower():
            return (df.iloc[n,1], df.iloc[n,2], df.iloc[n,3])    
#     matching = [s for s in ing if lower(ingredient) in lower(s)]
#         if product[1]['ingredient']t.contains(ingredient):
#             return (row.standard_vol, row.standard_weight_gm, row.standard_unit)
    return (None, None, '')
def req_oz_recipe(combined_ingredient_df):
    req_oz=[]

    for index, row in combined_ingredient_df.iterrows():
        if row.normalized_unit=='ounce':
            req_oz.append(row.quantity)
        elif row.normalized_unit=='pound':
            req_oz.append(row.quantity*16)
        elif 'cup' in row.standard_unit.strip():
            req_gm = (row.standard_weight_gm/225)*row.Volume_in_ml
            req_oz.append(req_gm/28.35)
        elif 'tbsp' in row.standard_unit.strip() or 'tablespoon' in row.standard_unit.strip():
            req_gm = (row.standard_weight_gm/15)*row.Volume_in_ml
            req_oz.append(req_gm/28.35)
        elif 'tsp' in row.standard_unit.strip() or 'teaspoon' in row.standard_unit.strip():
            req_gm = (row.standard_weight_gm/5)*row.Volume_in_ml
            req_oz.append(req_gm/28.35)
        else: req_oz.append(0)

    combined_ingredient_df['req_oz']=req_oz
    combined_ingredient_df['req_oz']=np.round(combined_ingredient_df['req_oz'], 3)
    return combined_ingredient_df

def recommended_qty(join_df):
    recommended_qty=[]
    for index, row in join_df.iterrows():
        if row.req_oz > 0 :
            if row.package_weight_unit_of_measure.strip().lower() =='pound':
                pack_oz = row.package_weight * 16
                rec = row.req_oz/pack_oz
                recommended_qty.append(np.ceil(rec))
            elif row.package_weight_unit_of_measure.strip().lower() =='ounce':
                rec = row.req_oz/row.package_weight
                recommended_qty.append(np.ceil(rec))
        else :
            if row.normalized_unit == '':
                if row.net_content_quantity_unit_of_measure.strip().lower() =='dozen':
                    rec = row.quantity/12
                    recommended_qty.append(np.ceil(rec))
                if row.net_content_quantity_unit_of_measure.strip().lower() =='count':
                    rec = row.quantity
                    recommended_qty.append(np.ceil(rec))

            else:    

                recommended_qty.append(0)

    join_df['recommended_qty'] = recommended_qty
    return join_df

In [7]:
dict_ingredients= recipe_load(n=1)
df_combined_ing = pd.DataFrame.from_dict(dict_ingredients)
df_combined_ing

Unnamed: 0,ingredient,unit,quantity
0,penne,ounces,6
1,Beechers Flagship Cheese Sauce,cups,2
2,Cheddar,ounce,1
3,Gruyere cheese,ounce,1
4,chipotle chili powder,teaspoon,1/2
5,unsalted butter,cup,1/4
6,all-purpose flour,cup,1/3
7,milk,cups,3
8,semihard cheese,ounces,14
9,semisoft cheese,ounces,2


In [8]:
df_combined_ing['quantity'] = df_combined_ing['quantity'].apply(convert_fraction)
df_combined_ing['quantity'] = df_combined_ing['quantity'].apply(float)
combined_ingredient_df=normalize_units(df_combined_ing)

In [9]:
# required for multiple recipes - combining qty
combined_ingredient_df = df_combined_ing.groupby(by=['ingredient', 'normalized_unit'], 
                                               as_index = False)\
                                      .agg({'quantity': 'sum', 'Volume_in_ml': 'sum'})

In [10]:
combined_ingredient_df

Unnamed: 0,ingredient,normalized_unit,quantity,Volume_in_ml
0,Beechers Flagship Cheese Sauce,cup,2.0,450.0
1,Cheddar,ounce,1.0,0.0
2,Gruyere cheese,ounce,1.0,0.0
3,all-purpose flour,cup,0.333333,0.0
4,chipotle chili powder,tsp,1.0,5.0
5,garlic powder,tsp,0.125,0.625
6,kosher salt,tsp,0.5,2.5
7,milk,cup,3.0,675.0
8,penne,oz,6.0,180.0
9,semihard cheese,oz,14.0,420.0


In [11]:
from sentence_transformers import SentenceTransformer
from ranker import *
from preprocessor import *
import joblib

In [12]:
# Set input file directory
ip_file_dir = "../../data/"

# Get grocery product hierarchy information
group10 = pd.read_csv(os.path.join(ip_file_dir, 
                                   'group10_header.csv'),
                      sep='\t', 
                      low_memory=False)

# Get scraped information for the above products
products = pd.read_csv(os.path.join(ip_file_dir,
                                    'scraped/products.csv'))

# Merge scraped information into the hierarchy table
group10 = pd.merge(group10, products, 
                   how = 'left', on = 'tcin')

# Preprocess the table
group10 = preprocess_df(group10)

In [13]:
# Get list of preprocessed product titles
product_titles = group10['title_modified'].values
# Preprocess recipe ingredients
recipe_ingredients = preprocess(combined_ingredient_df['ingredient'].values)

In [14]:
lm = SentenceTransformer('all-MiniLM-L6-v2')

In [15]:
tr = TransformerRanker(model=lm, product_ids=group10['tcin'])
product_title_embeddings = joblib.load('../../data/lm_embeddings')
tr.load_embeddings(product_title_embeddings)

In [16]:
# Ranked list of product tcin matches for 1 ingredient - Returns a list
tr.rank_products_ingredient(recipe_ingredients[0])

[53626563,
 14765838,
 12935636,
 13083696,
 80189989,
 14847431,
 14909504,
 80189990,
 53626660,
 12936285,
 52258679,
 47104968,
 50627699,
 13437822,
 14913706,
 52258524,
 12935436,
 14859463,
 80189988,
 53626690,
 14923533,
 13305242,
 82281163,
 50874998,
 12936308,
 13127562,
 14393689,
 12954182,
 50286792,
 15421807,
 81504650,
 47087844,
 12935458,
 52391164,
 52804829,
 79345887,
 13349592,
 82281162,
 81500996,
 79373210,
 39593065,
 81624930,
 14909477,
 13130096,
 14930798,
 46775554,
 14859460,
 53397622,
 13035282,
 16608664,
 47101158,
 81547949,
 50803593,
 12954218,
 15479455,
 15013915,
 16482550,
 79507429,
 52390809,
 12955137,
 77456900,
 78716526,
 13305239,
 82399697,
 14770384,
 47094508,
 14760222,
 82281161,
 13386273,
 82382726,
 14909474,
 78053393,
 53397663,
 12954023,
 82864006,
 15025346,
 52150998,
 78653377,
 54340523,
 14909475,
 53278354,
 49117982,
 13035361,
 12946175,
 52150976,
 13071711,
 53137064,
 12935429,
 47104420,
 13207339,
 54584167,

In [17]:
# Ranked list of product tcin matches for each ingredient - Returns a list of lists 
tr.rank_products_recipe(recipe_ingredients)

[[53626563,
  14765838,
  12935636,
  13083696,
  80189989,
  14847431,
  14909504,
  80189990,
  53626660,
  12936285,
  52258679,
  47104968,
  50627699,
  13437822,
  14913706,
  52258524,
  12935436,
  14859463,
  80189988,
  53626690,
  14923533,
  13305242,
  82281163,
  50874998,
  12936308,
  13127562,
  14393689,
  12954182,
  50286792,
  15421807,
  81504650,
  47087844,
  12935458,
  52391164,
  52804829,
  79345887,
  13349592,
  82281162,
  81500996,
  79373210,
  39593065,
  81624930,
  14909477,
  13130096,
  14930798,
  46775554,
  14859460,
  53397622,
  13035282,
  16608664,
  47101158,
  81547949,
  50803593,
  12954218,
  15479455,
  15013915,
  16482550,
  79507429,
  52390809,
  12955137,
  77456900,
  78716526,
  13305239,
  82399697,
  14770384,
  47094508,
  14760222,
  82281161,
  13386273,
  82382726,
  14909474,
  78053393,
  53397663,
  12954023,
  82864006,
  15025346,
  52150998,
  78653377,
  54340523,
  14909475,
  53278354,
  49117982,
  13035361,
  12

In [14]:
combined_ingredient_df['tuple'] = combined_ingredient_df['ingredient'].apply(search_density)
combined_ingredient_df['standard_vol'],combined_ingredient_df['standard_weight_gm'],\
combined_ingredient_df['standard_unit'] = combined_ingredient_df.tuple.str
combined_ingredient_df

FileNotFoundError: [Errno 2] No such file or directory: './Data/ing_desnsity.csv'

In [None]:
req_oz_recipe(combined_ingredient_df)