In [137]:
# Import and Install Necessary Packages
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas

import pandas as pd
from eep153_tools.sheets import read_sheets
import re
from scipy.optimize import linprog as lp
import numpy as np

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Helper Function for Formatting IDs

This code defines a helper function `format_id` which takes an ID and an optional zero-padding parameter. It returns a formatted string version of the ID if possible. The function handles cases where the ID might be null, empty, or in a non-standard format. The code also sets a data URL for reference.

In [138]:
# Helper function to format an ID with optional zero-padding
def format_id(id, zeropadding=0):
    # Check if the id is null or an empty string or a dot, return None in such cases
    if pd.isnull(id) or id in ['', '.']:
        return None

    try:
        # Try converting the id to an integer and format it with zero-padding
        return ('%d' % id).zfill(zeropadding)
    except TypeError:
        # If a TypeError occurs, split the id by a period and take the first part, then remove any extra spaces and pad it
        return id.split('.')[0].strip().zfill(zeropadding)
    except ValueError:
        # If a ValueError occurs during conversion, return None
        return None

# URL to the data source (Google Spreadsheet)
data_url = "https://docs.google.com/spreadsheets/d/1z7hB1hWocUePYeoBpvR0_UW3LdX9MV82IwzpClkmsr4/edit?gid=1410082681#gid=1410082681"


### Load and Clean Data

This code loads the original recipes data from a Google Sheet, applies formatting to specific columns using the `format_id` helper function, and renames one of the columns for clarity.


In [139]:
# Load the original recipes data from the specified Google Sheet (sheet named "recipes")
og_recipes = read_sheets(data_url, sheet="recipes")

# Clean and transform the data:
# - Apply the format_id function to 'parent_foodcode' and 'ingred_code' columns to standardize their format
# - Rename the 'parent_desc' column to 'recipe' for better clarity
og_recipes = (og_recipes
              .assign(
                  parent_foodcode=lambda df: df["parent_foodcode"].apply(format_id),
                  ingred_code=lambda df: df["ingred_code"].apply(format_id)
              )
              .rename(columns={"parent_desc": "recipe"}))

In [151]:
# Define a list of key food items to INCLUDE in ingredient descriptions
key_foods = [
    "Yogurt, Greek", "Cheese, Cottage", "Milk", "Cheese, Parmesan", "Banana", "Apple", "Orange",
    "Avocado", "figs", "dates", "raisins", "apricots, dried", "Grapefruit", "Grapes",
    "pear", "Peach", "watermelon", "Oats", "Bread, rye", "Brown Rice", "Pasta",
    "Quinoa", "Rolled Oats", "Rice Cakes", "Whole Grain Cereal", "Special K", "Bread, whole-wheat", 
    "Bread, whole-mutligrain", "Popcorn, Air-popped", "Millet",
    "whole grain pasta", "Almonds", "Peanut Butter", "Chicken", "Egg", "Tofu",
    "Lentils", "Beans, Black", "Tuna", "Salmon", "Soup, Bean ",
    "Steak", "Tilapia", "Pork", "Venison", "Cod", "Ground turkey", "turkey, Ground",
    "beef, ground", "Ground beef", "Tempeh", "chickpea", "beans, kidney", "Sweet Potato", "Potato",
    "Spinach", "Broccoli", "Bell Pepper", "Carrot", "Beets", "peas", "Tomato",
    "Creatine", "Omega-3", "BCAAs", "Blueberries", "Strawberries", "juice, raw",
    "Garlic", "Lemon", "Onion", "Asparagus", "kale", "collards", "chard, swiss", "protein powder", "brussel sprouts", 
]

# Define a list of foods or terms to EXCLUDE
key_excludable = [
    "sugar", "syrup", "soda", "candy", "artificial", "processed", "preservative",
    "yolk", "Fruit juice", "juice drink", "Sunny D", "sweetened", "added sugar", "liver"
]

# Escape the items so that parentheses and other special characters are treated literally
escaped_key_foods = [re.escape(food) for food in key_foods]
escaped_excludable = [re.escape(term) for term in key_excludable]

# Wrap each escaped term in a non-capturing group '(?: ... )' before joining with '|'
include_pattern = '|'.join(f"(?:{term})" for term in escaped_key_foods)
exclude_pattern = '|'.join(f"(?:{term})" for term in escaped_excludable)

# 1) Include mask: meals that have at least one ingredient containing a key food
meal_mask_include = og_recipes.groupby('parent_foodcode')['ingred_desc'] \
    .transform(lambda x: x.str.contains(include_pattern, case=False, na=False).any())

# 2a) Exclude mask for INGREDIENTS: meals that have any ingredient containing an excludable term
meal_mask_exclude_ingredients = og_recipes.groupby('parent_foodcode')['ingred_desc'] \
    .transform(lambda x: x.str.contains(exclude_pattern, case=False, na=False).any())

# 2b) Exclude mask for RECIPE NAMES: meals whose recipe name contains an excludable term
meal_mask_exclude_names = og_recipes.groupby('parent_foodcode')['recipe'] \
    .transform(lambda x: x.str.contains(exclude_pattern, case=False, na=False).any())

# Combine both ingredient and recipe-name exclusions
meal_mask_exclude_total = meal_mask_exclude_ingredients | meal_mask_exclude_names

# 3) Final mask: include meals that pass the "include" filter AND do not match the exclusion filter
final_mask = meal_mask_include & (~meal_mask_exclude_total)

# Filter the original recipes dataset
recipes = og_recipes[final_mask]

In [152]:
# Load nutrition data and merge
nutrition = read_sheets(data_url, sheet="nutrients") \
            .assign(ingred_code=lambda df: df["ingred_code"].apply(format_id))

### Process and Aggregate Nutrient Information

This section makes a copy of the filtered recipes, normalizes ingredient weights to percentages, and merges nutrient information. Then, it scales nutrient values by their ingredient's normalized weight and aggregates the nutrient profile by meal. Finally, the code extracts recipe names for further use.

In [153]:
# Make an explicit copy of recipes before modifying
recipes = recipes.copy()

# Normalize ingredient weights to percentages by dividing by the total weight per meal.
# Using .loc for assignment ensures we're modifying the DataFrame in place.
recipes.loc[:, 'ingred_wt'] = recipes['ingred_wt'] / recipes.groupby('parent_foodcode')['ingred_wt'].transform("sum")

# Merge nutrient information into recipes on the 'ingred_code' column.
# This performs a left join, ensuring all recipes are kept.
df = recipes.merge(nutrition, how="left", on="ingred_code")

# Identify numeric columns (e.g., nutrient values) in the merged DataFrame.
numeric_cols = list(df.select_dtypes(include=["number"]).columns)

# Remove 'ingred_wt' from the list as we don't want to scale it.
numeric_cols.remove("ingred_wt")

# Multiply each nutrient value by the normalized ingredient weight to get weighted nutrient values.
df[numeric_cols] = df[numeric_cols].mul(df["ingred_wt"], axis=0)

# Aggregate nutrient profiles by meal (identified by 'parent_foodcode').
# For nutrient columns, sum their weighted values; for the recipe name, take the first occurrence.
df = df.groupby('parent_foodcode').agg({
    **{col: "sum" for col in numeric_cols},
    "recipe": "first"
})

# Rename the index to 'recipe_id' for clarity.
df.index.name = "recipe_id"

# Extract recipe names for further use.
food_names = df["recipe"]

### Load Prices and Map to Food Names

This code loads pricing data from a Google Sheet, applies ID formatting, and filters prices for a specific year. It then matches the price data with the corresponding recipes based on common food codes, maps the prices to food names, and prepares a transposed version of the nutrient data for further analysis.


In [154]:
# Load prices data from the "prices" sheet, selecting only the necessary columns.
prices = read_sheets(data_url, sheet="prices")[["food_code", "year", "price"]]

# Format the 'food_code' column using the helper function 'format_id'
prices["food_code"] = prices["food_code"].apply(format_id)

# Set a multi-index using 'year' and 'food_code' for easier slicing and alignment.
prices = prices.set_index(["year", "food_code"])

# Filter the prices data to include only records for the year "2017/2018".
prices = prices.xs("2017/2018", level="year")

# Remove rows where the price is missing.
prices = prices.dropna(subset="price")

# Find the intersection of food codes that are common between our aggregated recipes (df) and the prices data.
common_recipes = df.index.intersection(prices.index)

# Subset both the recipes and prices data to only include common recipes.
df = df.loc[common_recipes]
prices = prices.loc[common_recipes]

# Map the index (food codes) in the prices data to food names using the previously extracted 'food_names' series.
prices.index = prices.index.map(food_names)

# Transpose the nutrient data for further analysis or processing.
A_all = df.T

In [155]:
# Load RDA data (nutrient constraints)
rda = read_sheets(data_url, sheet="rda")
rda = rda.set_index("Nutrient")

# Define the Diet Minimizer Function

This function, `diet_minimizer`, uses linear programming to optimize a daily diet based on nutrient constraints for a given sex and athlete type. It constructs nutrient constraints from recommended dietary allowances (RDA) and upper limits (UL), then minimizes cost while meeting these constraints.

In [156]:
def diet_minimizer(sex, athlete_type):
    import numpy as np  
    group = f"{sex}_{athlete_type}"
    
    # Create nutrient constraints based on the chosen demographic
    bmin = pd.to_numeric(rda.loc[rda['Constraint Type'].isin(['RDA', 'AI']), group], errors='coerce')
    bmax = pd.to_numeric(rda.loc[rda['Constraint Type'].isin(['UL']), group], errors='coerce')

    # Remove non-finite values
    bmin = bmin[np.isfinite(bmin)]
    bmax = bmax[np.isfinite(bmax)]

    # Filter constraints to only include nutrients available in A_all.
    bmin = bmin[bmin.index.isin(A_all.index)]
    bmax = bmax[bmax.index.isin(A_all.index)]

    # Remove excluded foods from A_all and prices before optimization
    filtered_A_all = A_all.loc[~A_all.index.isin(key_excludable)]
    filtered_prices = prices.loc[~prices.index.isin(key_excludable)]

    # Ensure reindexing aligns with filtered food data
    Amin = filtered_A_all.reindex(bmin.index).dropna(how='all')
    Amax = filtered_A_all.reindex(bmax.index).dropna(how='all')

    # Combine constraints
    b = pd.concat([bmin, -bmax]).dropna()
    A = pd.concat([Amin, -Amax])

    # Convert to NumPy arrays
    b = b.to_numpy().flatten()  
    A = A.to_numpy()
    
    # Prepare cost vector (filtered)
    p = filtered_prices["price"].to_numpy()

    # Tolerance for negligible quantities
    tol = 1e-6

    # Import linear programming solver
    from scipy.optimize import linprog as lp

    # Check that b contains only finite values
    if not np.all(np.isfinite(b)):
        raise ValueError("The constraint vector b contains non-finite values!")

    # Solve the linear programming problem
    result = lp(p, -A, -b, method='highs')

    # Extract optimized diet quantities
    diet_quantities = pd.Series(result.x, index=filtered_prices.index)
    total_cost = result.fun  

    # Select foods with quantities above tolerance threshold
    selected_foods = diet_quantities[diet_quantities >= tol]

    # Create DataFrame listing foods and their cost per 100g
    df_foods = pd.DataFrame({
        "Food": selected_foods.index,
        "Cost per 100g": [float(filtered_prices.loc[food, 'price']) for food in selected_foods.index]
    })

    print(f"Your daily diet is ${total_cost:.2f}")
    return df_foods

In [157]:
diet_minimizer("Male", "Endurance")

Your daily diet is $3.80


Unnamed: 0,Food,Cost per 100g
0,"Milk, whole",0.09828
1,"Carp, baked or broiled, fat added",1.241814
2,"Egg, whole, fried with oil",0.398344
3,"Split peas, from dried, fat added",0.140336
4,"Peanut butter, lower sodium",0.51391
5,"Pasta, gluten free",0.114248
6,"Millet, no added fat",0.061534
7,Cereal (Kellogg's Special K),0.800496
8,"Orange juice, 100%, freshly squeezed",0.180944
9,"Potato, boiled, from fresh, peel eaten, made w...",0.236255


In [88]:
diet_minimizer("Male", "Strength")

Your daily diet is $4.88


Unnamed: 0,Food,Cost per 100g
0,"Almond milk, unsweetened, chocolate",0.15624
1,"Carp, baked or broiled, fat added",1.241814
2,"Egg, yolk only, raw",0.397392
3,"Pasta, gluten free",0.114248
4,"Millet, no added fat",0.061534
5,Kidney beans and white rice,0.133333
6,"Potato, boiled, from fresh, peel eaten, made w...",0.236255
7,"Potato chips, reduced fat, unsalted",1.053719
8,"Greens, NS as to form, cooked",0.368179


In [159]:
female_strength =diet_minimizer('Female', 'Strength')

Your daily diet is $5.26


In [158]:
def add_custom_ingredients(diet_df, new_foods):
    """
    Adds the cheapest meal(s) matching each item in `new_foods` to `diet_df`,
    based on the globally available 'recipes' and 'prices' DataFrames.

    Parameters
    ----------
    diet_df : pd.DataFrame
        The DataFrame representing the current daily diet (output of diet_minimizer).
        Must have columns ["Food", "Cost per 100g"] at minimum.
    new_foods : list of str
        A list of ingredient/food names to look for in the meal/recipe names.

    Returns
    -------
    pd.DataFrame
        An updated version of `diet_df` with new rows added for each matched meal.
    """
    # Make a copy so we don't modify the original in place
    updated_diet = diet_df.copy()

    for food in new_foods:
        # 1) Find all rows in the global 'recipes' DataFrame whose 'recipe' name
        #    contains the desired food (case-insensitive).
        mask = recipes['recipe'].str.contains(food, case=False, na=False)
        matching_meals = recipes[mask]['parent_foodcode'].unique()

        if len(matching_meals) == 0:
            print(f"No match found for '{food}' in recipe names.")
            continue

        # 2) Among all matching meals, find the cheapest by looking up prices
        cheapest_price = float('inf')
        cheapest_meal_name = None

        for meal_code in matching_meals:
            # 'df' has aggregated nutrient info with index = meal_code
            # 'prices' has index = meal_name (string)
            # We can get the meal's name from df.loc[meal_code, 'recipe'] if it exists
            if meal_code not in df.index:
                # This meal might not have made it into the aggregated df
                # if it was filtered out or lacked prices.
                continue

            meal_name = df.loc[meal_code, 'recipe']

            # Check if this meal_name exists in the prices index
            if meal_name not in prices.index:
                continue

            meal_cost = prices.loc[meal_name, 'price']
            if meal_cost < cheapest_price:
                cheapest_price = meal_cost
                cheapest_meal_name = meal_name

        if cheapest_meal_name is None:
            print(f"No priced meal found for '{food}' among matches.")
            continue

        # 3) Append this cheapest meal to our diet DataFrame
        new_row = pd.DataFrame({
            "Food": [cheapest_meal_name],
            "Cost per 100g": [cheapest_price]
        })
        updated_diet = pd.concat([updated_diet, new_row], ignore_index=True)

    return updated_diet


In [161]:
add_custom_ingredients(female_strength, ["rice cake"])

TypeError: '<' not supported between instances of 'str' and 'float'