In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json

# Final Ingredients Corrections

Go through our final recipes dataset with tagged ingredients and make any last corrections.

In [85]:
# Load the tagged recipes. The input is JSON Lines: one JSON object (recipe) per line.
# The encoding is passed explicitly so parsing does not depend on the platform's
# default locale encoding, and blank lines (such as a trailing empty line) are
# skipped instead of making json.loads raise.
recipes = []
with open('../../VectorDB/recipes-vdb/app/data/final_combined_standardized_ingredients_tag.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            recipes.append(json.loads(line))
df = pd.DataFrame(recipes)
df.head()

Unnamed: 0,id,title,cuisines,course,methods,diet_tags_llm,diet_flags,diet_violations,flavors,summary,...,source_id,atwater_error_pct,data_source,merge_ingredients,original_macros_per_serving,normalized_atwater_error_pct,error_improvement,neighbors_count,avg_neighbor_similarity,confidence
0,99d4a0f689,Summertime Stuffed and Grilled Chicken Breasts,[global],main,[grill],[high-protein],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['chicken, broiler or fryers, b...","[smoky, herby, savory, cheesy]",Grilled chicken breasts stuffed with sun-dried...,...,99d4a0f689,0.509102,golden_set,"[{'o': 'feta cheese', 'n': 'feta cheese', 'q':...",,,,,,
1,99df626ce3,Baby Vegetable Soup,[global],soup,"[saute, stir-fry]","[low-fat, gluten-free]","{'vegetarian': True, 'vegan': True, 'pescatari...","{'vegetarian': [], 'vegan': [], 'pescatarian':...","[mild, vegetal]","Simple, comforting soup made with a variety of...",...,99df626ce3,2.845544,golden_set,[{'o': 'composite household vegetable shorteni...,,,,,,
2,99f60eccec,Mushroom Salad II,[global],salad,[],[gluten-free],"{'vegetarian': True, 'vegan': False, 'pescatar...","{'vegetarian': [], 'vegan': ['cheese, parmesan...","[tangy, herby, savory, umami]","Raw mushroom salad with celery, parsley, parme...",...,99f60eccec,0.807808,golden_set,"[{'o': 'raw white mushrooms', 'n': 'mushroom',...",,,,,,
3,9a1f55a2f6,Caramelized Bacon,[global],side,[bake],[],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['bacon, meatless'], 'vegan': [...","[sweet, caramel, smoky]","Crispy, sweet, and smoky meatless bacon made b...",...,9a1f55a2f6,2.81198,golden_set,"[{'o': 'brown sugars', 'n': 'brown sugar', 'q'...",,,,,,
4,9a459c2e73,Honey Cranberry Butter,[global],breakfast,[],"[dairy-free, low-fat]","{'vegetarian': True, 'vegan': False, 'pescatar...","{'vegetarian': [], 'vegan': ['butter, without ...","[sweet, honey, fruity, nutty]","Whipped butter with dried cranberries, honey, ...",...,9a459c2e73,1.600447,golden_set,"[{'o': 'without salt butter', 'n': 'butter', '...",,,,,,


In [86]:
# Report the number of rows that were loaded; nothing has been de-duplicated yet.
print(f"Dataset contains {len(df)} individual recipes (without de-duping)")

Dataset contains 13437 individual recipes (without de-duping)


In [87]:
# Spot-check a single recipe: display the row(s) of df whose "id" equals recipe_id.
recipe_id = "epi_1451"
df.loc[df["id"].eq(recipe_id)]

Unnamed: 0,id,title,cuisines,course,methods,diet_tags_llm,diet_flags,diet_violations,flavors,summary,...,source_id,atwater_error_pct,data_source,merge_ingredients,original_macros_per_serving,normalized_atwater_error_pct,error_improvement,neighbors_count,avg_neighbor_similarity,confidence
5348,epi_1451,Manhattan Clam Chowder,[american],soup,[saute],[gluten-free],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['2 bacon slices, cut into 1/2-...","[briny, savory, oniony, tomatoey]","Hearty, comforting New England-style clam chow...",...,epi_1451,0.0,epicurious,"[{'o': 'bacon slices, cut into 1/2-inch square...",,,,,,


In [88]:
# Look for recipe ids that occur more than once (detection only; no rows are dropped here).
# keep=False marks every occurrence of a repeated id, not just the second and later ones.
duplicate_ids = df.loc[df["id"].duplicated(keep=False), "id"].unique()
print(f"Found {len(duplicate_ids)} duplicate recipe ids")

Found 0 duplicate recipe ids


In [89]:
# Collect every distinct unit that appears in the dataset (with one example
# ingredient string per unit) and drop any recipe that contains an ingredient
# whose quantity or unit is missing.
#
# A recipe row is read in one of two ways:
#   * non-empty "units": the parallel lists "units" / "quantities" /
#     "ingredients_raw" are used (unit and quantity entries expose a "text" key);
#   * empty "units": the "merge_ingredients" dicts are used instead, via the
#     keys "u" (unit), "q" (quantity) and "o" (ingredient text).

# Unit strings that stand for "no unit given". Units are stripped and lower-cased
# before being compared with these, so "N/A", "n/a" and " None " are all rejected.
# (Previously the first branch lower-cased the unit and then compared it with
# "N/A" / "None", which could never match.)
MISSING_UNIT_MARKERS = {"", "n/a", "none"}


def _has_usable_amount(quantity, unit):
    """Return True when neither the quantity nor the unit is a missing-value placeholder."""
    if quantity is None or quantity == 0.0:
        return False
    if unit is None:
        return False
    return str(unit).strip().lower() not in MISSING_UNIT_MARKERS


def _iter_amounts(row):
    """Yield (unit, quantity, ingredient_text) for each ingredient of one recipe row."""
    if len(row["units"]):
        for unit, quantity, ingredient in zip(row["units"], row["quantities"], row["ingredients_raw"]):
            yield (
                unit["text"].lower().strip(),
                quantity["text"],
                # Reverse the comma-separated parts of the raw ingredient text.
                " ".join(ingredient.lower().strip().split(",")[::-1]),
            )
    else:
        for mi in row["merge_ingredients"]:
            yield mi.get("u", ""), mi.get("q", ""), mi.get("o", "").lower().strip()


units = set()            # every distinct unit seen on a usable ingredient
ingredients = set()      # one "<quantity> <unit> of <ingredient>" example per unit
kept_index = []          # index labels of the recipes that survive cleaning
removed_id_list = []     # ids of the recipes that are dropped

for index, row in df.iterrows():
    keep_recipe = True
    for unit, quantity, ingredient in _iter_amounts(row):
        if _has_usable_amount(quantity, unit):
            # Only the first ingredient seen for a unit is recorded as its example.
            if unit not in units:
                units.add(unit)
                ingredients.add(f"{quantity} {unit} of {ingredient}")
        else:
            # A single unusable ingredient disqualifies the whole recipe.
            keep_recipe = False
    if keep_recipe:
        kept_index.append(index)
    else:
        removed_id_list.append(row["id"])

# Select the surviving rows from df in one step; this keeps the original index
# labels and column dtypes. The copy is independent at the frame level, but the
# list/dict objects stored inside the cells are still shared with df.
recipes_cleaned = df.loc[kept_index].copy()

print("After cleaning, dataset contains {} individual recipes".format(len(recipes_cleaned)))


After cleaning, dataset contains 11922 individual recipes


In [90]:
# Check whether the spot-check recipe is still present after cleaning.
recipe_id = "epi_1451"
recipes_cleaned.loc[recipes_cleaned["id"].eq(recipe_id)]

Unnamed: 0,id,title,cuisines,course,methods,diet_tags_llm,diet_flags,diet_violations,flavors,summary,...,source_id,atwater_error_pct,data_source,merge_ingredients,original_macros_per_serving,normalized_atwater_error_pct,error_improvement,neighbors_count,avg_neighbor_similarity,confidence
5348,epi_1451,Manhattan Clam Chowder,[american],soup,[saute],[gluten-free],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['2 bacon slices, cut into 1/2-...","[briny, savory, oniony, tomatoey]","Hearty, comforting New England-style clam chow...",...,epi_1451,0.0,epicurious,"[{'o': 'bacon slices, cut into 1/2-inch square...",,,,,,


In [91]:
# Preview the first five cleaned recipes.
recipes_cleaned.head(n=5)

Unnamed: 0,id,title,cuisines,course,methods,diet_tags_llm,diet_flags,diet_violations,flavors,summary,...,source_id,atwater_error_pct,data_source,merge_ingredients,original_macros_per_serving,normalized_atwater_error_pct,error_improvement,neighbors_count,avg_neighbor_similarity,confidence
0,99d4a0f689,Summertime Stuffed and Grilled Chicken Breasts,[global],main,[grill],[high-protein],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['chicken, broiler or fryers, b...","[smoky, herby, savory, cheesy]",Grilled chicken breasts stuffed with sun-dried...,...,99d4a0f689,0.509102,golden_set,"[{'o': 'feta cheese', 'n': 'feta cheese', 'q':...",,,,,,
1,99df626ce3,Baby Vegetable Soup,[global],soup,"[saute, stir-fry]","[low-fat, gluten-free]","{'vegetarian': True, 'vegan': True, 'pescatari...","{'vegetarian': [], 'vegan': [], 'pescatarian':...","[mild, vegetal]","Simple, comforting soup made with a variety of...",...,99df626ce3,2.845544,golden_set,[{'o': 'composite household vegetable shorteni...,,,,,,
2,99f60eccec,Mushroom Salad II,[global],salad,[],[gluten-free],"{'vegetarian': True, 'vegan': False, 'pescatar...","{'vegetarian': [], 'vegan': ['cheese, parmesan...","[tangy, herby, savory, umami]","Raw mushroom salad with celery, parsley, parme...",...,99f60eccec,0.807808,golden_set,"[{'o': 'raw white mushrooms', 'n': 'mushroom',...",,,,,,
3,9a1f55a2f6,Caramelized Bacon,[global],side,[bake],[],"{'vegetarian': False, 'vegan': False, 'pescata...","{'vegetarian': ['bacon, meatless'], 'vegan': [...","[sweet, caramel, smoky]","Crispy, sweet, and smoky meatless bacon made b...",...,9a1f55a2f6,2.81198,golden_set,"[{'o': 'brown sugars', 'n': 'brown sugar', 'q'...",,,,,,
4,9a459c2e73,Honey Cranberry Butter,[global],breakfast,[],"[dairy-free, low-fat]","{'vegetarian': True, 'vegan': False, 'pescatar...","{'vegetarian': [], 'vegan': ['butter, without ...","[sweet, honey, fruity, nutty]","Whipped butter with dried cranberries, honey, ...",...,9a459c2e73,1.600447,golden_set,"[{'o': 'without salt butter', 'n': 'butter', '...",,,,,,


In [92]:
# Display the set of distinct units collected during the cleaning pass.
units

{'5-ounce 1/2-inch-thick',
 'Belgian endives',
 'Red Delicious',
 'Tbsp',
 'Tbsp.',
 'accompaniment',
 'bag',
 'bags',
 'baguette',
 'ball',
 'balls',
 'basket',
 'baskets',
 'batch',
 'bean',
 'bell pepper',
 'berries',
 'black pepper',
 'block',
 'bottle',
 'bottled',
 'bottles',
 'box',
 'boxes',
 'branch',
 'branches',
 'breast',
 'bulb',
 'bulbs',
 'bunch',
 'bunches',
 'bundle',
 'butter',
 'cabbage',
 'can',
 'canned',
 'canola oil',
 'cans',
 'carrot',
 'carrots',
 'chile',
 'chiles',
 'chili',
 'chops',
 'chunk',
 'clove',
 'cloves',
 'cluster',
 'coarse',
 'container',
 'containers',
 'count',
 'cracker',
 'crown roast',
 'crust',
 'cube',
 'cubes',
 'cucumber',
 'cup',
 'cups',
 'dash',
 'dashes',
 'diameter',
 'dish',
 'disk',
 'dozen',
 'dried',
 'dried chorizos',
 'drop',
 'drops',
 'each',
 'ear',
 'ears',
 'envelope',
 'envelopes',
 'equipment',
 'filet',
 'filets',
 'fillet',
 'fillets',
 'firm-ripe',
 'fl oz',
 'fl. oz',
 'flat',
 'flour',
 'fluid ounce',
 'g',
 'gall

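The unit "dozen" confused the LLM for some ingredients: in some cases the quantity had already been multiplied by 12 even though the unit was left as "dozen". Let's convert all "dozen" units to "count" and check the quantities. If the quantity is 12 or more, we assume it has already been converted and leave it as is; if it is less than 12, we multiply it by 12 to convert it to a count (e.g. 1.5 dozen becomes 18).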

In [93]:
# Convert every ingredient measured in "dozen" to a plain "count" and record
# each conversion in dozen_ingredients for review.
#
# The entries are edited in place. Rows yielded by DataFrame.iterrows() are
# copies, so assigning to row[...] never reaches recipes_cleaned; the dicts
# stored inside the cells, however, are shared objects and can be mutated
# directly. Editing in place also leaves "ingredients_raw" untouched and keeps
# the {"text": ...} shape of the units / quantities entries.
#
# No variable named `units` is assigned here, so the set of distinct units
# built during the cleaning pass is not overwritten.


def _dozen_to_count(quantity):
    """Return the quantity expressed as an item count.

    Quantities below 12 are treated as a number of dozens and multiplied by 12;
    quantities of 12 or more are assumed to be converted already and returned
    unchanged (e.g. 1.5 -> 18.0, 18 -> 18, 36 -> 36).
    """
    if quantity < 12 and quantity % 12:
        return quantity * 12
    return quantity


# One tuple per converted ingredient:
# (recipe id, ingredient text, new unit, new quantity, merge_ingredients entry)
dozen_ingredients = []
for index, row in recipes_cleaned.iterrows():
    if len(row["units"]):
        # Recipe with parallel units / quantities / ingredients_raw lists.
        for unit_entry, quantity_entry, ingredient, mi in zip(
            row["units"], row["quantities"], row["ingredients_raw"], row["merge_ingredients"]
        ):
            if unit_entry["text"].lower().strip() != "dozen":
                continue
            quantity = _dozen_to_count(quantity_entry["text"])
            unit_entry["text"] = "count"
            quantity_entry["text"] = quantity
            # Keep the matching merge_ingredients entry consistent with the lists.
            mi["u"] = "count"
            mi["q"] = quantity
            # Reverse the comma-separated parts of the ingredient text for the report only.
            ingredient = " ".join(ingredient.lower().strip().split(",")[::-1])
            dozen_ingredients.append((row["id"], ingredient, "count", quantity, mi))
    else:
        # Recipe whose amounts live only in merge_ingredients ("u" = unit, "q" = quantity).
        for mi, ingredient_raw in zip(row["merge_ingredients"], row["ingredients_raw"]):
            # A missing or None unit is treated as the empty string.
            unit = (mi.get("u") or "").lower().strip()
            if unit != "dozen":
                continue
            quantity = _dozen_to_count(mi.get("q", ""))
            mi["u"] = "count"
            mi["q"] = quantity
            dozen_ingredients.append((row["id"], ingredient_raw, "count", quantity, mi))

In [94]:
# Number of ingredient entries whose unit was converted from "dozen" to "count".
len(dozen_ingredients)

20

In [95]:
# Re-check id uniqueness, this time on recipes_cleaned: that is the frame that was
# filtered and edited above and is written to disk below. The raw df was already
# checked right after loading and has had no rows added or removed since.
# Detection only (nothing is dropped); keep=False flags every occurrence of a repeated id.
duplicate_ids = recipes_cleaned[recipes_cleaned.duplicated(subset=['id'], keep=False)]['id'].unique()
print("Found {} duplicate recipe ids".format(len(duplicate_ids)))

Found 0 duplicate recipe ids


In [96]:
# Show the recorded conversions: (recipe id, ingredient text, new unit, new quantity, merge_ingredients entry).
dozen_ingredients

[('epi_1451',
  '1 1/2 dozen small hard-shelled clams (1 1/2 to 2 inches in diameter; 2 pounds total), scrubbed well',
  'count',
  18.0,
  {'o': 'small hard-shelled clams',
   'n': 'clam',
   'q': 18.0,
   'u': 'count',
   'f': 'count',
   'c': 'Meat and Fish',
   'm': 'clam'}),
 ('epi_3393',
  '3 dozen small hard-shelled clams (2 to 2 1/2 inches in diameter; 3 1/2 lb total), scrubbed well',
  'count',
  36,
  {'o': '3 dozen small hard-shelled clams (2 to 2 1/2 inches in diameter; 3 1/2 lb total), scrubbed well',
   'n': 'clam',
   'q': 36,
   'u': 'count',
   'f': 'count',
   'c': 'Meat and Fish',
   'm': 'clam'}),
 ('epi_4794',
  '1 1/2 dozen small hard-shelled clams, shucked, reserving 1/2 cup liquor, and chopped',
  'count',
  18,
  {'o': '1 1/2 dozen small hard-shelled clams, shucked, reserving 1/2 cup liquor, and chopped',
   'n': 'clam',
   'q': 18,
   'u': 'count',
   'f': 'count',
   'c': 'Meat and Fish',
   'm': 'clam'}),
 ('epi_4804',
  '3 dozen mussels, scrubbed, debearded

In [101]:
# For every converted ingredient, print the conversion record followed by the
# complete ingredient data of its recipe as currently stored in recipes_cleaned,
# so each change can be checked by eye.
REPORT_SECTIONS = (
    ("Ingredients Raw:", "ingredients_raw"),
    ("Units:", "units"),
    ("Quantities:", "quantities"),
    ("Merge Ingredients:", "merge_ingredients"),
)

for rec_id, raw_text, new_unit, new_quantity, merge_entry in dozen_ingredients:
    print(f"Recipe ID: {rec_id}")
    print(f"Ingredient Raw: {raw_text}")
    print(f"New Unit: {new_unit}")
    print(f"New Quantity: {new_quantity}")
    print(f"Merge Ingredient Entry: {merge_entry}")
    print("-----")
    matching = recipes_cleaned.loc[recipes_cleaned["id"] == rec_id]
    for heading, column in REPORT_SECTIONS:
        print(heading)
        # .values[0] takes the cell of the first matching row.
        for item in matching[column].values[0]:
            print(item)
    print("==============\n")

Recipe ID: epi_1451
Ingredient Raw: 1 1/2 dozen small hard-shelled clams (1 1/2 to 2 inches in diameter; 2 pounds total), scrubbed well
New Unit: count
New Quantity: 18.0
Merge Ingredient Entry: {'o': 'small hard-shelled clams', 'n': 'clam', 'q': 18.0, 'u': 'count', 'f': 'count', 'c': 'Meat and Fish', 'm': 'clam'}
-----
Ingredients Raw:
2 bacon slices, cut into 1/2-inch squares
1/3 cup chopped onion
3 tablespoons diced (1/3 inch) green bell pepper
3 tablespoons diced (1/3 inch) celery
2/3 cup diced (1/3 inch) peeled boiling potato (1 small)
1 (8-oz) bottle clam juice
1 cup canned diced tomatoes (8 oz), including juice
1 1/2 dozen small hard-shelled clams (1 1/2 to 2 inches in diameter; 2 pounds total), scrubbed well
2 tablespoons chopped fresh flat-leaf parsley
Units:
Quantities:
Merge Ingredients:
{'o': 'bacon slices, cut into 1/2-inch squares', 'n': 'bacon', 'q': 2, 'u': 'slices', 'f': 'count', 'c': 'Meat and Fish', 'm': 'bacon'}
{'o': 'chopped onion', 'n': 'onion', 'q': 0.33, 'u': '

In [102]:
# Save the cleaned recipes as JSON Lines (one recipe object per line).
#
# Cells with no value are float NaN in the DataFrame (pandas fills keys absent
# from an input record with NaN). json.dumps would write those as the bare token
# NaN, which is not valid JSON and is rejected by strict parsers, so top-level
# NaN values are written as null instead. Values nested inside lists or dicts
# are written unchanged.
def _nan_to_none(value):
    """Return None for a float NaN and any other value unchanged."""
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return None
    return value


with open('../../VectorDB/recipes-vdb/app/data/final_combined_standardized_ingredients_tag_cleaned.jsonl', 'w', encoding='utf-8') as f:
    for index, row in recipes_cleaned.iterrows():
        record = {key: _nan_to_none(value) for key, value in row.to_dict().items()}
        f.write(json.dumps(record) + '\n')

In [103]:
# Load the cleaned file back in to verify the round trip.
# The encoding is explicit so the read does not depend on the platform default,
# blank lines are skipped, and the row count is asserted against recipes_cleaned
# so a truncated or partial write fails loudly instead of only being printed.
loaded_records = []
with open('../../VectorDB/recipes-vdb/app/data/final_combined_standardized_ingredients_tag_cleaned.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            loaded_records.append(json.loads(line))
recipes_loaded = pd.DataFrame(loaded_records)
assert len(recipes_loaded) == len(recipes_cleaned), "reloaded row count differs from recipes_cleaned"
print("Loaded cleaned dataset contains {} individual recipes".format(len(recipes_loaded)))

Loaded cleaned dataset contains 11922 individual recipes
