In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
raw_recipe_data = pd.read_csv("data/Food Ingredients and Recipe Dataset with Image Name Mapping.csv")
raw_recipe_data = raw_recipe_data.rename(columns={'Unnamed: 0': 'recipe_id'})

In [3]:
def convert_to_float(quantity):
    try:
        # Handle fractions in the format 'a/b' and convert them to float
        if '/' in quantity:
            return float(sum(Fraction(s) for s in quantity.split()))
        # Handle unicode fractions like '½', '¼', etc.
        quantity = quantity.replace('½', '0.5').replace('¼', '0.25').replace('¾', '0.75')
        # Try converting to float (for whole numbers or decimals)
        return float(quantity)
    except (ValueError, TypeError):
        return None  # For quantities like 'Pinch' or None, return None
    
def parse_ingredient(ingredient):
    # Updated regex to ensure we capture units correctly (e.g., avoid splitting 'large' into 'l' and 'arge')
    pattern = r'(?P<quantity>[\d½¼¾\.\-–\s]+|\d+/\d+)?\s*(?P<unit>(?:\b(?:lb\.?|ounce\.?|oz\.?|cup\.?|cups\.?|tsp\.?|tbsp\.?|teaspoons\.?|teaspoon\.?|tablespoons\.?|tablespoon\.?|kg\.?|g\.?|ml\.?|cl\.?|l\.?|quarts\.?|pound\.?)\b)?)\s*(?P<ingredient>.+)?(?:\((?P<range>[^\)]*)\))?'
    # Try to match the full pattern including quantity, unit, ingredient, and ranges in parentheses
    match = re.match(pattern, ingredient.strip(), re.IGNORECASE)
    
    # Extract parsed data
    if match:
        quantity = match.group('quantity').strip() if match.group('quantity') else None
        unit = match.group('unit').strip() if match.group('unit') else None
        ingredient_name = match.group('ingredient').strip() if match and match.group('ingredient') else None
        # Remove any parenthesis and the text inside them from the ingredient name
        ingredient_name = re.sub(r'\(.*?\)', '', ingredient_name).strip() if ingredient_name else None
        
        quantity_range = match.group('range').strip() if match.group('range') else None

        # Post-process quantity to handle dashes
        if quantity and '-' in quantity:
            quantity = quantity.split('-')[0].strip()  # Keep only the part before the dash

        # Combine the range with the quantity if necessary (for cases like '10 to 12 small beets')
        if quantity_range:
            quantity = quantity_range if not quantity else f"{quantity} ({quantity_range})"
        if not ingredient_name:
            # Split the original string and take the last two words
            words = ingredient.strip().split()
            ingredient_name = ' '.join(words[-2:]) if len(words) >= 2 else words[0] if words else None

        quantity = convert_to_float(quantity) if quantity else 1.0
        if unit and unit.endswith('s'):
            unit = unit.rstrip('s').lower()
        return {
            'ingredient': ingredient_name if ingredient_name else '',
            'quantity': quantity if quantity else 1.0,
            'unit': unit if unit else ''
        }

    # Fallback: Just return the ingredient name if parsing fails
    return {
        'ingredient': ingredient.strip() if ingredient.strip() else '',
        'quantity': 1.0,
        'unit': ''
    }
# Function to split the ingredients list and parse each item
def process_ingredients(ingredient_str):
    ingredient_list = ingredient_str[4:].split("\', \'")
    # Split the ingredients by commas    
    # Parse each individual ingredient using the parse_ingredient function
    parsed_ingredients = [parse_ingredient(ing.strip()) for ing in ingredient_list]
    return parsed_ingredients
    
# Apply the function to split and parse each row of ingredients
def set_columns_from_ingredients(df, column_name):
    # Assume process_ingredients returns a list of dictionaries (JSON-like structure) for each row
    df['parsed_ingredients'] = df[column_name].apply(process_ingredients)
    
    exploded_df = df[['recipe_id', 'parsed_ingredients']].explode('parsed_ingredients')
    
    # Normalize the exploded ingredients (JSON-like structure) into a DataFrame
    normalized_data = pd.json_normalize(exploded_df['parsed_ingredients'])
    
    # Add the recipe_id back to the normalized data
    normalized_data['recipe_id'] = exploded_df['recipe_id'].values
    
    return normalized_data

# Parse dataframe into a table with PK recipe_ingredient_id with columns ingredient, unit, recipe_id
recipe_ingredients_df = set_columns_from_ingredients(raw_recipe_data.copy(), 'Cleaned_Ingredients')
recipe_ingredients_df['recipe_ingredient_id'] = recipe_ingredients_df.index
recipe_ingredients_df.to_csv('data/recipe_ingredients.csv', index=False)

# Select only useful information from raw recipe data
raw_recipe_data.drop(columns=['Ingredients'])
recipe_df = raw_recipe_data.rename(columns={'Cleaned_Ingredients': 'IngredientsText'})
recipe_df.to_csv('data/recipe.csv', index=False)


In [4]:
from sentence_transformers import SentenceTransformer, util

nutrition_data = pd.read_csv("data/food.csv")
# Load the pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Small and fast transformer

# Your categories (from Table 1)
categories = [v.replace(",", " ").lower() for v in nutrition_data['Description'].values.tolist()]

# Your ingredients (from CSV)
ingredients = recipe_ingredients_df['ingredient'].values.tolist()


  from tqdm.autonotebook import tqdm, trange


In [5]:
# Takes 20 mins
from tqdm import tqdm
# Encode both the ingredients and categories as embeddings
category_embeddings = model.encode(categories, convert_to_tensor=True)

# Enable tqdm progress bar for pandas apply
tqdm.pandas()

# Example dataframe with an 'ingredient' column
df = pd.DataFrame({
    'ingredient': ingredients
})

# Function to classify a single ingredient
def classify_row(ingredient, model, categories, category_embeddings):
    ingredient_embedding = model.encode(ingredient, convert_to_tensor=True)
    cos_sim = util.pytorch_cos_sim(ingredient_embedding, category_embeddings)
    best_match_idx = cos_sim.argmax().item()
    best_match_score = cos_sim.max().item()
    return categories[best_match_idx] if best_match_score > 0.5 else "No Match"

# Add a new column to the dataframe with the classified ingredient
recipe_ingredients_df['formatted_ingredient'] = recipe_ingredients_df['ingredient'].progress_apply(
    lambda x: classify_row(x, model, categories, category_embeddings)
)

100%|██████████| 148327/148327 [23:16<00:00, 106.21it/s]


In [6]:
recipe_ingredients_df.to_csv("data/recipe_ingredients_with_new_names.csv", index=False)

In [15]:
price_data = nutrition_data[['Category','Description']]

In [16]:
import random
import pandas as pd
import json
# Sample common prices for 100 items
common_prices = None
# Open the JSON file and load it into a dictionary
with open('data/prices.json', 'r') as json_file:
    common_prices = json.load(json_file)

# Define price ranges for different categories
price_ranges = {
    "Spices and herbs": (2.00, 10.00),
    "Dairy products": (0.50, 5.00),
    "Meats and fish": (2.00, 8.00),
    "Fruits and vegetables": (0.20, 2.50),
    "Processed and packaged foods": (1.00, 5.00),
    "Grains, seeds, and nuts": (0.30, 4.00),
    "Miscellaneous": (0.50, 5.00)  # For items that don't fit neatly into a category
}

def assign_price(category):
    if category in common_prices:
        return common_prices[category]
    if "SPICE" in category or "HERB" in category:
        return round(random.uniform(*price_ranges["Spices and herbs"]), 2)
    elif "DAIRY" in category or any(dairy in category for dairy in ["BUTTER", "CHEESE", "MILK", "CREAM", "YOGURT"]):
        return round(random.uniform(*price_ranges["Dairy products"]), 2)
    elif "MEAT" in category or "FISH" in category or category in ["CHICKEN", "PORK", "BEEF", "TURKEY"]:
        return round(random.uniform(*price_ranges["Meats and fish"]), 2)
    elif any(veg in category for veg in ["APPLE", "CARROT", "POTATO", "TOMATO", "ONION"]):
        return round(random.uniform(*price_ranges["Fruits and vegetables"]), 2)
    elif any(processed in category for processed in ["BREAD", "PASTA"]):
        return round(random.uniform(*price_ranges["Processed and packaged foods"]), 2)
    elif any(grain in category for grain in ["WHEAT", "RICE", "NUT", "SEED"]):
        return round(random.uniform(*price_ranges["Grains, seeds, and nuts"]), 2)
    else:
        return round(random.uniform(*price_ranges["Miscellaneous"]), 2)

# Apply the price assignment function to each category
price_data['price_per_100g'] = price_data['Description'].apply(assign_price)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_data['price_per_100g'] = price_data['Description'].apply(assign_price)


In [17]:
price_data.to_csv('data/ingredient_price.csv', index=False)