In [1]:
# Import and Install Necessary Packages
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas

import pandas as pd
from eep153_tools.sheets import read_sheets
import re
from scipy.optimize import linprog as lp
import numpy as np

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Helper Function for Formatting IDs

This code defines a helper function `format_id` which takes an ID and an optional zero-padding parameter. It returns a formatted string version of the ID if possible. The function handles cases where the ID might be null, empty, or in a non-standard format. The code also sets a data URL for reference.

In [2]:
# Helper function to format an ID with optional zero-padding
def format_id(id, zeropadding=0):
    """Normalize a food/ingredient ID to a (optionally zero-padded) string.

    Parameters
    ----------
    id : number or str
        Raw ID value. The name shadows the built-in ``id`` but is kept so that
        existing keyword callers keep working.
    zeropadding : int, default 0
        Minimum width of the result; shorter IDs are left-padded with zeros.

    Returns
    -------
    str or None
        * Numbers are formatted with ``%d`` (any fractional part is truncated).
        * Strings keep only the text before the first '.', stripped of
          surrounding whitespace.
        * ``None`` is returned for null values, '', '.', strings whose part
          before the first '.' is blank, and numbers that cannot be converted
          to an integer (e.g. infinity).
    """
    # Null, empty and lone-dot values carry no ID at all.
    if pd.isnull(id) or id in ['', '.']:
        return None

    try:
        # Numeric input: integer formatting, then pad.
        return ('%d' % id).zfill(zeropadding)
    except TypeError:
        # '%d' rejects strings: keep what precedes the first '.', trimmed.
        text = id.split('.')[0].strip()
        # A blank remainder (e.g. '   ' or '.5') is not an ID; without this
        # check it would come back as '' or as a string made only of zeros.
        return text.zfill(zeropadding) if text else None
    except (ValueError, OverflowError):
        # The number has no integer form (NaN-like or infinite values).
        return None

# URL of the Google Sheets workbook passed to every read_sheets(...) call in this
# notebook; the worksheets read from it include "recipes", "nutrients", "prices" and "rda".
data_url = "https://docs.google.com/spreadsheets/d/1z7hB1hWocUePYeoBpvR0_UW3LdX9MV82IwzpClkmsr4/edit?gid=1410082681#gid=1410082681"


### Load and Clean Data

This code loads the original recipes data from a Google Sheet, applies formatting to specific columns using the `format_id` helper function, and renames one of the columns for clarity.


In [4]:
# Pull the "recipes" worksheet from the shared Google Sheet.
og_recipes = read_sheets(data_url, sheet="recipes")

# Tidy the raw sheet:
# - run both code columns ('parent_foodcode', 'ingred_code') through format_id
# - expose 'parent_desc' under the shorter name 'recipe'
og_recipes = og_recipes.assign(
    **{
        code_col: og_recipes[code_col].apply(format_id)
        for code_col in ("parent_foodcode", "ingred_code")
    }
).rename(columns={"parent_desc": "recipe"})

In [5]:
# Search terms looked for (as case-insensitive substrings) in ingredient descriptions.
# NOTE(review): "Bread, whole-mutligrain" looks like a typo for "multigrain" — as
# written it probably never matches; confirm before changing, since it alters the filter.
# NOTE(review): very short terms such as "raw", "Lean", "Egg" or "pear" match inside
# many unrelated descriptions — confirm that this breadth is intended.
key_foods = [
    # dairy
    "Yogurt, Greek", "Cheese, Cottage", "Milk", "Cheese, Parmesan",
    # fruit
    "Banana", "Apple", "Orange", "Avocado", "figs", "dates", "raisins",
    "apricots, dried", "Grapefruit", "Grapes", "pear", "Peach", "watermelon",
    # grains
    "Oats", "Bread, rye", "Brown Rice", "Pasta", "Quinoa", "Rolled Oats",
    "Rice Cakes", "Whole Grain Cereal", "Special K", "Bread, whole-wheat",
    "Bread, whole-mutligrain", "Popcorn, Air-popped", "Millet", "whole grain pasta",
    # protein sources
    "Almonds", "Peanut Butter", "Chicken", "Egg", "Tofu", "Lentils", "Beans, Black",
    "Tuna", "Salmon", "Lean", "raw", "Soup, Bean ", "Steak", "Tilapia", "Pork",
    "Venison", "Cod", "Ground turkey", "turkey, Ground", "beef, ground",
    "Ground beef", "Tempeh", "chickpea", "beans, kidney",
    # vegetables
    "Sweet Potato", "Potato", "Spinach", "Broccoli", "Bell Pepper", "Carrot",
    "Beets", "peas", "Tomato",
    # supplements
    "Creatine", "Omega-3", "BCAAs",
    # berries, aromatics and greens
    "Blueberries", "Strawberries", "Garlic", "Lemon", "Onion", "Asparagus", "kale",
    "collards", "chard, swiss", "protein powder", "brussel sprouts",
]

# Regex-escape every term so punctuation in it is matched literally.
escaped_key_foods = list(map(re.escape, key_foods))

# One alternation that matches any of the terms.
pattern = "|".join(escaped_key_foods)

# For every row, flag whether its meal ('parent_foodcode') has at least one
# ingredient description containing a key food (case-insensitive; missing
# descriptions count as no match).
meal_mask = (
    og_recipes
    .groupby('parent_foodcode')['ingred_desc']
    .transform(lambda x: x.str.contains(pattern, case=False, na=False).any())
)

# Keep all rows of the flagged meals.
recipes = og_recipes[meal_mask]

In [6]:
# Read the "nutrients" worksheet, then standardize its 'ingred_code' column with
# format_id so it can be matched against the recipe ingredient codes.
nutrition = read_sheets(data_url, sheet="nutrients")
nutrition = nutrition.assign(ingred_code=nutrition["ingred_code"].apply(format_id))

### Process and Aggregate Nutrient Information

This section makes a copy of the filtered recipes, normalizes each ingredient weight to its fraction of the meal's total weight (so the weights within a meal sum to 1), and merges in the nutrient information. It then scales each nutrient value by the ingredient's normalized weight and sums the results per meal to obtain one nutrient profile per recipe. Finally, the code extracts the recipe names for later use.

In [8]:
# Detach from og_recipes: the edits below must land on an independent frame.
recipes = recipes.copy()

# Turn each ingredient weight into its share of the meal's total weight
# (the shares within one 'parent_foodcode' add up to 1).
recipes.loc[:, 'ingred_wt'] = recipes['ingred_wt'].div(
    recipes.groupby('parent_foodcode')['ingred_wt'].transform("sum")
)

# Left-join the nutrient table on 'ingred_code' so every recipe row is kept,
# even when no nutrient record matches.
df = pd.merge(recipes, nutrition, how="left", on="ingred_code")

# Every numeric column except the weight share itself is treated as a value to scale.
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
numeric_cols.remove("ingred_wt")

# Weight each row's values by its ingredient's share of the meal.
df[numeric_cols] = df[numeric_cols].multiply(df["ingred_wt"], axis="index")

# Collapse to one row per meal: weighted values are summed, the recipe name is
# taken from the meal's first row, and the index is labelled 'recipe_id'.
df = (
    df.groupby('parent_foodcode')
    .agg({**dict.fromkeys(numeric_cols, "sum"), "recipe": "first"})
    .rename_axis("recipe_id")
)

# Recipe names keyed by recipe_id, used later to label prices.
food_names = df["recipe"]

### Load Prices and Map to Food Names

This code loads pricing data from a Google Sheet, applies ID formatting, and filters prices for a specific year. It then matches the price data with the corresponding recipes based on common food codes, maps the prices to food names, and prepares a transposed version of the nutrient data for further analysis.


In [9]:
# Read the "prices" worksheet and reduce it to one price per food code:
# - keep only the food_code / year / price columns
# - standardize 'food_code' with format_id
# - index by (year, food_code) and take the "2017/2018" cross-section
# - discard rows without a price
prices = (
    read_sheets(data_url, sheet="prices")[["food_code", "year", "price"]]
    .assign(food_code=lambda sheet: sheet["food_code"].apply(format_id))
    .set_index(["year", "food_code"])
    .xs("2017/2018", level="year")
    .dropna(subset="price")
)

# Food codes present both in the aggregated recipes (df) and in the price table.
common_recipes = df.index.intersection(prices.index)

# Restrict both tables to those codes, in the same order.
df, prices = df.loc[common_recipes], prices.loc[common_recipes]

# Relabel the price rows with recipe names instead of food codes.
prices.index = prices.index.map(food_names)

# Nutrients as rows, recipes as columns, for the optimization step.
A_all = df.transpose()

In [10]:
# Read the "rda" worksheet (nutrient constraints) and key it by the 'Nutrient' column.
rda = read_sheets(data_url, sheet="rda").set_index("Nutrient")

### Define the Diet Minimizer Function

This function, `diet_minimizer`, uses linear programming to find the cheapest daily diet that satisfies the nutrient constraints for a given sex and athlete type. It builds lower bounds from the rows of the RDA table marked as recommended dietary allowances (RDA) or adequate intakes (AI) and upper bounds from the rows marked as upper limits (UL), then minimizes total cost subject to these constraints.


In [12]:
# Define the diet_minimizer function to optimize daily diet based on nutrient constraints.
def diet_minimizer(sex, athlete_type):
    """Find the cheapest combination of foods that meets a group's nutrient constraints.

    Uses the notebook-level tables ``rda`` (indexed by nutrient, with a
    'Constraint Type' column and one column per "<sex>_<athlete_type>" group),
    ``A_all`` (nutrients as rows, foods as columns) and ``prices`` (a 'price'
    column; its rows are assumed to be in the same order as the columns of
    ``A_all``). Relies on the notebook-level imports of ``np``, ``pd`` and ``lp``.

    Parameters
    ----------
    sex, athlete_type : str
        Joined as f"{sex}_{athlete_type}" to select the constraint column of ``rda``.

    Returns
    -------
    pandas.DataFrame
        One row per food whose optimal quantity is at least 1e-6, with the
        columns "Food" and "Cost per 100g". The total cost is printed.

    Raises
    ------
    KeyError
        If ``rda`` has no column for the requested group.
    ValueError
        If the constraint vector contains non-finite values, or if the solver
        does not reach an optimal solution (e.g. the constraints are infeasible).
    """
    # Construct the demographic group string based on input parameters.
    group = f"{sex}_{athlete_type}"
    if group not in rda.columns:
        raise KeyError(f"No constraint column named {group!r} in the RDA table")

    # bmin: lower bounds (rows typed RDA or AI); bmax: upper bounds (rows typed UL).
    # Non-numeric cells become NaN and are removed together with other non-finite values.
    constraint_type = rda['Constraint Type']
    bmin = pd.to_numeric(rda.loc[constraint_type.isin(['RDA', 'AI']), group], errors='coerce')
    bmax = pd.to_numeric(rda.loc[constraint_type.isin(['UL']), group], errors='coerce')
    bmin = bmin[np.isfinite(bmin)]
    bmax = bmax[np.isfinite(bmax)]

    # Nutrient rows of A_all for each bound; nutrients with no food data at all are dropped.
    # A_all may be object-typed (it can carry a non-numeric row), hence the float cast.
    Amin = A_all.reindex(bmin.index).dropna(how='all').astype(float)
    Amax = A_all.reindex(bmax.index).dropna(how='all').astype(float)

    # Keep the bounds row-for-row aligned with the matrices: a nutrient that was
    # dropped from A must also be dropped from b, otherwise the row counts differ.
    bmin = bmin.loc[Amin.index]
    bmax = bmax.loc[Amax.index]

    # Stack everything as ">=" constraints: A x >= b, with upper limits negated.
    b = pd.concat([bmin, -bmax])
    A = pd.concat([Amin, -Amax])

    # Check that the constraint vector b contains only finite values.
    if not np.all(np.isfinite(b)):
        raise ValueError("The constraint vector b contains non-finite values!")

    # Cost vector as a plain 1-D array (prices represent cost per 100g).
    cost = prices['price'].to_numpy(dtype=float)

    # Define tolerance to filter out negligible quantities.
    tol = 1e-6

    # linprog solves "A_ub x <= b_ub", so the ">=" system is negated once more.
    result = lp(cost, A_ub=-A.to_numpy(), b_ub=-b.to_numpy(dtype=float), method='highs')

    # Without a solution, result.x and result.fun are unusable: fail with the solver's reason.
    if not result.success:
        raise ValueError(f"No optimal diet found for {group}: {result.message}")

    total_cost = result.fun  # Total cost in dollars

    # Foods whose optimal quantity is above the tolerance, selected by position so
    # that repeated food names in the price index cannot break the lookup.
    chosen = np.asarray(result.x) >= tol
    df_foods = pd.DataFrame({
        "Food": prices.index[chosen],
        "Cost per 100g": cost[chosen],
    })

    # Output the total daily cost.
    print(f"Your daily diet is ${total_cost:.2f}")

    # Return the DataFrame with selected foods and their costs.
    return df_foods

In [13]:
# Minimum-cost diet for the "Male_Endurance" constraint column of the RDA table.
diet_minimizer("Male", "Endurance")

Your daily diet is $3.40


Unnamed: 0,Food,Cost per 100g
0,"Chicken liver, braised",0.469032
1,"Carp, steamed or poached",0.782609
2,"Egg, yolk only, cooked, NS as to fat",0.403279
3,Chinese pancake,0.121931
4,"Pasta, gluten free",0.114248
5,"Millet, no added fat",0.061534
6,"Potato, boiled, from fresh, peel eaten, made w...",0.236255
7,"Mustard greens, fresh, cooked, fat added",0.258616


In [14]:
# Minimum-cost diet for the "Female_Strength" constraint column of the RDA table.
diet_minimizer("Female", "Strength")

Your daily diet is $3.26


Unnamed: 0,Food,Cost per 100g
0,"Milk, whole",0.09828
1,"Carp, steamed or poached",0.782609
2,"Egg, yolk only, raw",0.397392
3,"Peanuts, unroasted",0.466486
4,Chinese pancake,0.121931
5,"Potato, boiled, from fresh, peel eaten, made w...",0.236255
6,"Ripe plantain, raw",0.212355
