## Recipe Data With Ingredients

### Format Google Sheet Data

In [1]:
%pip install eep153_tools
%pip install python_gnupg
%pip install -U gspread_pandas

def format_id(id,zeropadding=0):
    """Return a clean string version of an id, whether it is a string or a number.

    Numeric ids are formatted as integers (``1001.0`` -> ``'1001'``).
    String ids keep only the text before the first ``'.'``, with surrounding
    whitespace removed (``'1001.0'`` -> ``'1001'``).

    Parameters
    ----------
    id : str or number
        The raw identifier.
    zeropadding : int, optional
        Minimum width of the result; shorter ids are left-padded with zeros
        (``format_id(7, 3)`` -> ``'007'``).  The default 0 adds no padding.

    Returns
    -------
    str or None
        The formatted id, or None when the id is missing (null, ``''``,
        ``'.'``, blank after cleaning) or cannot be turned into an integer
        (e.g. an infinite float).
    """
    if pd.isnull(id) or id in ['','.']: return None

    try:  # If numeric, return as string int
        return ('%d' % id).zfill(zeropadding)
    except TypeError:  # Not numeric
        cleaned = id.split('.')[0].strip()
        # A blank result (e.g. '   ' or '.5') is a missing id, just like ''.
        if not cleaned: return None
        return cleaned.zfill(zeropadding)
    except (ValueError, OverflowError):
        # '%d' raises OverflowError for +/-inf and ValueError for NaN-like values.
        return None

# Google Sheets workbook that the read_sheets(...) calls in the cells below load
# from (sheets used: "recipes", "nutrients", "prices", "rda").
data_url = "https://docs.google.com/spreadsheets/d/1z7hB1hWocUePYeoBpvR0_UW3LdX9MV82IwzpClkmsr4/edit?gid=1410082681#gid=1410082681"

Collecting eep153_tools
  Using cached eep153_tools-0.12.4-py2.py3-none-any.whl.metadata (363 bytes)
Using cached eep153_tools-0.12.4-py2.py3-none-any.whl (4.9 kB)
Installing collected packages: eep153_tools
Successfully installed eep153_tools-0.12.4
Note: you may need to restart the kernel to use updated packages.
Collecting python_gnupg
  Using cached python_gnupg-0.5.4-py2.py3-none-any.whl.metadata (2.0 kB)
Using cached python_gnupg-0.5.4-py2.py3-none-any.whl (21 kB)
Installing collected packages: python_gnupg
Successfully installed python_gnupg-0.5.4
Note: you may need to restart the kernel to use updated packages.
Collecting gspread_pandas
  Using cached gspread_pandas-3.3.0-py2.py3-none-any.whl.metadata (10 kB)
Using cached gspread_pandas-3.3.0-py2.py3-none-any.whl (27 kB)
Installing collected packages: gspread_pandas
  Attempting uninstall: gspread_pandas
    Found existing installation: gspread-pandas 2.2.3
    Uninstalling gspread-pandas-2.2.3:
      Successfully uninstalled g

### Filter Existing Recipe Data (using ingred_desc) By Key Foods

In [2]:
import re

import pandas as pd
from eep153_tools.sheets import read_sheets

# Read the recipes sheet, clean both id columns with format_id, and call the
# parent description column "recipe".
recipes = read_sheets(data_url, sheet="recipes")
recipes = (recipes
           .assign(parent_foodcode=lambda df: df["parent_foodcode"].apply(format_id),
                   ingred_code=lambda df: df["ingred_code"].apply(format_id))
           .rename(columns={"parent_desc": "recipe"}))

# Keep a recipe when at least one of its ingredient descriptions mentions a key food.
# Matching is a case-insensitive substring search, so e.g. "Apple" also matches
# "Pineapple" and "Potato" already covers "Sweet Potato".
key_foods = [
    "Greek Yogurt", "Cottage Cheese", "Low-Fat Milk", "Parmesan Cheese", "Olive Oil",
    "Banana", "Apple", "Orange", "Avocado", "figs", "dates", "raisins", "Dried apricots",
    "Grapefruit", "Grapes", "pear", "Peach", "watermelon", "Oats", "Whole Wheat Bread",
    "Brown Rice", "Pasta", "Quinoa", "Rolled Oats", "Rice Cakes", "Whole Grain Cereal",
    "Special K cereal", "Dark Rye Bread", "Whole wheat Bread", "Sourdough Bread", "Air-popped Popcorn",
    "Millet", "whole grain pasta", "Almonds", "Peanut Butter", "Chicken Breast", "Eggs", "Tofu",
    "Lentils", "Black Beans", "Tuna", "Salmon", "Lean Beef", "Bean Soup", "Hummus", "Sirloin Steak",
    "Tilapia", "Ground Beef", "Pork Tenderloin", "Venison", "Cod", "Ground turkey", "Tempeh",
    "chickpea", "kidney beans", "Sweet Potato", "Potato", "Spinach", "Broccoli", "Bell Pepper",
    "Carrot", "Beets", "peas", "Tomato", "Creatine", "Omega-3", "Whey Protein", "BCAAs",
    "Blueberries", "Strawberries", "Garlic", "Lemon", "Onion", "Asparagus"
]

# Escape each name so that it is matched literally: a future entry containing
# regex metacharacters ("+", "(", ".", ...) must not change the pattern's meaning.
pattern = '|'.join(re.escape(food) for food in key_foods)

# transform() broadcasts the per-recipe True/False back onto every ingredient row,
# so the mask lines up with `recipes` row by row.
meal_mask = recipes.groupby('parent_foodcode')['ingred_desc'] \
                   .transform(lambda x: x.str.contains(pattern, case=False, na=False).any())

# .copy() makes the filtered frame independent of the unfiltered one, so later
# cells can add or overwrite columns on it safely.
recipes = recipes[meal_mask].copy()

# Continue with merging with nutrition data, normalization, etc.

### Retrieve Nutrition Data From Google Sheet

In [3]:
# Load the "nutrients" sheet, then pass its ingredient codes through format_id
# (the recipes' ingred_code column is cleaned the same way).
nutrition = read_sheets(data_url, sheet="nutrients")
nutrition = nutrition.assign(ingred_code=nutrition["ingred_code"].apply(format_id))

# Preview the first rows, then list every available column.
display(nutrition.head())
nutrition.columns

Unnamed: 0,ingred_code,Ingredient description,Capric acid,Lauric acid,Myristic acid,Palmitic acid,Palmitoleic acid,Stearic acid,Oleic acid,Linoleic Acid,...,Vitamin B12,"Vitamin B-12, added",Vitamin B6,Vitamin C,Vitamin D,Vitamin E,"Vitamin E, added",Vitamin K,Water,Zinc
0,1001,"Butter, salted",2.529,2.587,7.436,21.697,0.961,9.999,19.961,2.728,...,0.17,0.0,0.003,0.0,0.0,2.32,0.0,7.0,15.87,0.09
1,1002,"Butter, whipped, with salt",2.039,2.354,7.515,20.531,1.417,7.649,17.37,2.713,...,0.07,0.0,0.008,0.0,0.0,1.37,0.0,4.6,16.72,0.05
2,1003,"Butter oil, anhydrous",2.495,2.793,10.005,26.166,2.228,12.056,25.026,2.247,...,0.01,0.0,0.001,0.0,0.0,2.8,0.0,8.6,0.24,0.01
3,1004,"Cheese, blue",0.601,0.491,3.301,9.153,0.816,3.235,6.622,0.536,...,1.22,0.0,0.166,0.0,0.5,0.25,0.0,2.4,42.41,2.66
4,1005,"Cheese, brick",0.585,0.482,3.227,8.655,0.817,3.455,7.401,0.491,...,1.26,0.0,0.065,0.0,0.5,0.26,0.0,2.5,41.11,2.6


Index(['ingred_code', 'Ingredient description', 'Capric acid', 'Lauric acid',
       'Myristic acid', 'Palmitic acid', 'Palmitoleic acid', 'Stearic acid',
       'Oleic acid', 'Linoleic Acid', 'Linolenic Acid', 'Stearidonic acid',
       'Eicosenoic acid', 'Arachidonic acid', 'Eicosapentaenoic acid',
       'Erucic acid', 'Docosapentaenoic acid', 'Docosahexaenoic acid',
       'Butyric acid', 'Caproic acid', 'Caprylic acid', 'Alcohol', 'Caffeine',
       'Calcium', 'Carbohydrate', 'Carotene, alpha', 'Carotene, beta',
       'Cholesterol', 'Choline', 'Copper', 'Cryptoxanthin, beta', 'Energy',
       'Fatty acids, total monounsaturated',
       'Fatty acids, total polyunsaturated', 'Fatty acids, total saturated',
       'Dietary Fiber', 'Folate, DFE', 'Folate, food', 'Folate', 'Folic acid',
       'Iron', 'Lutein + zeaxanthin', 'Lycopene', 'Magnesium', 'Niacin',
       'Phosphorus', 'Potassium', 'Protein', 'Retinol', 'Riboflavin',
       'Selenium', 'Sodium', 'Sugars, total', 'Theobromin

In [4]:
# Normalize ingredient weights to shares of their recipe's total weight.
# assign() builds a new frame instead of writing a column into the filtered
# `recipes` frame in place.
recipes = recipes.assign(
    ingred_wt=lambda r: r['ingred_wt'] / r.groupby(['parent_foodcode'])['ingred_wt'].transform("sum")
)

# Extend the recipes data with the nutrient profile of each ingredient (in 100g).
# NOTE(review): with how="left", an ingredient that has no row in `nutrition` gets
# NaN nutrients, and the "sum" below skips NaN - confirm that is intended.
recipe_nutrients = recipes.merge(nutrition, how="left", on="ingred_code")

# Multiply all nutrients per 100g of an ingredient by the weight share of that
# ingredient in its recipe (every numeric column except the weight itself).
numeric_cols = [col for col in recipe_nutrients.select_dtypes(include=["number"]).columns
                if col != "ingred_wt"]
recipe_nutrients[numeric_cols] = recipe_nutrients[numeric_cols].mul(recipe_nutrients["ingred_wt"], axis=0)

# Sum nutrients over the ingredients of each food code, keeping the recipe name.
# python tip: {**dict1, **dict2} merges two dictionaries; ** "unpacks" the
# key/value pairs of each one.
df = recipe_nutrients.groupby('parent_foodcode').agg({**{col: "sum" for col in numeric_cols},
                                                      "recipe": "first"})

df.index.name = "recipe_id"

# Lookup from recipe id to recipe name; shown once as the cell's output.
food_names = df["recipe"]
food_names.head()

recipe_id
11435100     Yogurt, Greek, with oats
11440020       Dill dip, yogurt based
11440030      Onion dip, yogurt based
11440050    Spinach dip, yogurt based
11440060                 Tzatziki dip
Name: recipe, dtype: object


recipe_id
11435100     Yogurt, Greek, with oats
11440020       Dill dip, yogurt based
11440030      Onion dip, yogurt based
11440050    Spinach dip, yogurt based
11440060                 Tzatziki dip
Name: recipe, dtype: object

In [5]:
# Load the price sheet, keeping only the food code, survey year and price.
prices = read_sheets(data_url, sheet="prices").loc[:, ["food_code", "year", "price"]]

# Clean the food codes with format_id, then index by (year, food_code).
prices = prices.assign(food_code=prices["food_code"].apply(format_id))
prices = prices.set_index(["year", "food_code"])

# Show which survey years are available.
print(prices.index.levels[0])

# Keep the 2017/2018 prices only and discard rows with a missing price.
prices = prices.xs("2017/2018", level="year").dropna(subset="price")

print(f"We have prices for {prices.shape[0]} unique recipes (FNDDS food codes)")

Index(['2011/2012', '2013/2014', '2015/2016', '2017/2018'], dtype='object', name='year')
We have prices for 4435 unique recipes (FNDDS food codes)


### Dietary Requirements

In [6]:
# Load the dietary requirements sheet, indexed by nutrient name.
rda = read_sheets(data_url, sheet="rda").set_index("Nutrient")

# Show the available columns together with a preview of the first rows.
(rda.columns, rda.head())

(Index(['Nutrient Type', 'Unit', 'Constraint Type', 'Child_1_3', 'Female_4_8',
        'Male_4_8', 'Female_9_13', 'Male_9_13', 'Female_14_18', 'Male_14_18',
        'Female_19_30', 'Male_19_30', 'Female_31_50', 'Male_31_50',
        'Female_51U', 'Male_51U', 'Male_Endurance', 'Female_Endurance',
        'Male_Strength'],
       dtype='object'),
               Nutrient Type  Unit Constraint Type  Child_1_3  Female_4_8  \
 Nutrient                                                                   
 Energy                Macro  kcal             RDA     1000.0      1200.0   
 Protein               Macro     g             RDA       13.0        19.0   
 Carbohydrate          Macro     g             RDA      130.0       130.0   
 Dietary Fiber         Macro     g             RDA       14.0        16.8   
 Linoleic Acid         Macro     g              AI        7.0        10.0   
 
                Male_4_8  Female_9_13  Male_9_13  Female_14_18  Male_14_18  \
 Nutrient                         

### Putting It All Together

In [7]:
# Recipes for which we have both a nutrient profile and a price.
# NOTE: this cell rebinds `df` and `prices`. After it has run, `prices` is indexed
# by recipe name rather than id, so re-running it on its own finds no common ids;
# re-run the earlier cells first.
common_recipes = df.index.intersection(prices.index)

# python tip: given a list of indices, "loc" both subsets and sorts.
df = df.loc[common_recipes]
prices = prices.loc[common_recipes]

# Remap the price dataframe index to be the actual food names.
prices.index = prices.index.map(food_names)

# Nutrients as rows, recipes as columns. The text column "recipe" is dropped
# before transposing: keeping it would turn every nutrient value into an object
# (non-numeric dtype) entry of the transposed frame.
A_all = df.drop(columns="recipe").T

print(prices.head())
print(A_all.head())

                              price
Yogurt, Greek, with oats   0.719278
Dill dip, yogurt based     1.071189
Onion dip, yogurt based    1.117195
Spinach dip, yogurt based  0.940725
Tzatziki dip               1.217789
                 11435100  11440020  11440030  11440050  11440060  11440070  \
Capric acid        0.1769  0.026495  0.046649  0.046649  0.148681  0.026495   
Lauric acid        0.0958  0.024518  0.043945  0.043945  0.071358  0.024518   
Myristic acid     0.14835  0.084132  0.148629  0.148324  0.179395  0.084132   
Palmitic acid      0.6576  1.030168  1.619307  1.628675  0.963237  1.030168   
Palmitoleic acid   0.0184  0.020959  0.034082  0.034082  0.070907  0.020959   

                  11551050  11553100  11553110  11553120  ... 92550110  \
Capric acid       0.024369  0.020521  0.020532   0.02305  ...      0.0   
Lauric acid       0.025968  0.023292  0.023305   0.02546  ...      0.0   
Myristic acid      0.09064  0.076085  0.076124  0.086572  ...      0.0   
Palmitic acid

In [8]:
# Pick a demographic (a group column of the rda dataframe). Available groups:
#   'Child_1_3', 'Female_4_8', 'Male_4_8', 'Female_9_13', 'Male_9_13',
#   'Female_14_18', 'Male_14_18', 'Female_19_30', 'Male_19_30',
#   'Female_31_50', 'Male_31_50', 'Female_51U', 'Male_51U',
#   'Male_Endurance', 'Female_Endurance', 'Male_Strength'
group = "Child_1_3"

# Lower bounds (RDA / AI rows) and upper bounds (UL rows) for the chosen group.
# Nutrients with no value for this group are dropped: a NaN bound is not a
# usable constraint for the solver.
bmin = rda.loc[rda['Constraint Type'].isin(['RDA', 'AI']), group].dropna()
bmax = rda.loc[rda['Constraint Type'].isin(['UL']), group].dropna()

# reindex keeps only the nutrients in bmin/bmax; nutrients we have no recipe
# data for come back as all-NaN rows and are dropped.
Amin = A_all.reindex(bmin.index).dropna(how='all')
Amax = A_all.reindex(bmax.index).dropna(how='all')

# Drop the same nutrients from the bounds, so that row i of A always belongs to
# entry i of b.
bmin = bmin[bmin.index.isin(Amin.index)]
bmax = bmax[bmax.index.isin(Amax.index)]

# Stack everything as "A x >= b": upper bounds are negated (x <= u  <=>  -x >= -u).
b = pd.concat([bmin, -bmax])
A = pd.concat([Amin, -Amax])

assert A.shape[0] == b.shape[0], "A and b must have one row per constraint"

#python tip: by typing "=" after the variable name inside the curly braces, it formats the output so we don't have to write f"variable = {variable}"
print(f"{bmin.shape=}")
print(f"{Amin.shape=}")
print(f"{bmax.shape=}")
print(f"{Amax.shape=}")
print(f"{b.shape=}")
print(f"{A.shape=}")
print(f"{prices.shape=}")

bmin.shape=(26,)
Amin.shape=(26, 1601)
bmax.shape=(1,)
Amax.shape=(1, 1601)
b.shape=(27,)
A.shape=(27, 1601)
prices.shape=(1601, 1)


### Minimize Cost Data

In [10]:
from scipy.optimize import linprog as lp
import numpy as np

p = prices
tol = 1e-6 # Numbers in solution smaller than this (in absolute value) treated as zeros

# linprog minimizes p @ x subject to A_ub @ x <= b_ub (with x >= 0 by default).
# Our constraints are written as A @ x >= b, so both sides are negated.
result = lp(p, -A, -b, method='highs')

# Stop here when no optimal diet was found (e.g. the requirements are infeasible):
# result.x is not a usable solution in that case, and the report built from it
# would silently come out empty.
if not result.success:
    raise RuntimeError(f"Diet problem was not solved: {result.message}")

result

        message: Optimization terminated successfully. (HiGHS Status 7: Optimal)
        success: True
         status: 0
            fun: 2.2689787760288436
              x: [ 0.000e+00  0.000e+00 ...  0.000e+00  0.000e+00]
            nit: 14
          lower:  residual: [ 0.000e+00  0.000e+00 ...  0.000e+00
                              0.000e+00]
                 marginals: [ 5.566e-01  9.951e-01 ...  6.533e-01
                              8.615e-02]
          upper:  residual: [       inf        inf ...        inf
                                    inf]
                 marginals: [ 0.000e+00  0.000e+00 ...  0.000e+00
                              0.000e+00]
          eqlin:  residual: []
                 marginals: []
        ineqlin:  residual: [ 1.849e+02  3.885e+01 ...  1.345e+02
                              0.000e+00]
                 marginals: [-0.000e+00 -0.000e+00 ... -0.000e+00
                             -5.999e-05]
 mip_node_count: 0
 mip_dual_bound: 0.0
        mip

In [11]:
# Put the solver's quantities into a series labelled by the index of `prices`.
diet = pd.Series(data=result.x, index=prices.index)

# Report only the foods whose quantity reaches the tolerance, rounded to 2 decimals.
print("\nYou'll be eating (in 100s of grams or milliliters):")
print(diet.loc[diet.ge(tol)].round(2))


You'll be eating (in 100s of grams or milliliters):
Carp, baked or broiled, fat added                                0.45
Split peas, from dried, fat added                                2.68
Oatmeal, regular or quick, made with milk, no added fat          3.74
Orange juice, 100%, with calcium added, frozen, reconstituted    1.21
Potato, boiled, from fresh, peel eaten, made with margarine      0.28
Potato, roasted, from fresh, peel eaten, no added fat            1.89
dtype: float64
