# imports

In [1]:
import pandas as pd
import numpy as np
import re
import tqdm
import itertools
import json
import inflect

In [2]:
def print_bold(str):
    print('\033[1m' + str + '\033[0m')

# files paths

In [3]:
#where to find useful data
food_des_path = "./../data/usda/FOOD_DES.txt"
food_groups_path = "./../data/usda/FD_GROUP.txt"
nut_data_path = "./../data/usda/NUT_DATA.txt"
nut_def_path = "./../data/usda/NUTR_DEF.txt"

all_paths = [food_des_path, food_groups_path, nut_data_path, nut_def_path]

# tilde removal

#text columns are indicated by '~', remove them
for p in all_paths :
    string = open(p).read()
    new_str = re.sub('[~]', '', string)
    open(p, 'w').write(new_str)

# Load and filter food group description

### info about the different food groups contained in the USDA database

In [4]:
columns = ["food_group_id", "food_group_name"]

food_groups = pd.read_csv(food_groups_path, sep="^", encoding="ISO-8859-1", names=columns, header=None)

food_groups.set_index("food_group_id", inplace=True)

food_groups

Unnamed: 0_level_0,food_group_name
food_group_id,Unnamed: 1_level_1
100,Dairy and Egg Products
200,Spices and Herbs
300,Baby Foods
400,Fats and Oils
500,Poultry Products
600,"Soups, Sauces, and Gravies"
700,Sausages and Luncheon Meats
800,Breakfast Cereals
900,Fruits and Fruit Juices
1000,Pork Products


### we decide to drop :

>* Baby food (300)
>* dressing in 400
>* soup in 600
>* Breakfast Cereals (800)
>* Beverages (1400) except water
>* Baked products (1800) 
>* Sweets (1900) except Baking products (chocolate)
>* Fast Foods (2100)
>* Meals, Entrees, and Side Dishes (2200)
>* Snacks (2500)
>* Restaurant food (3600)


# Load and clean food description table

#### Info about each individual product

In [5]:
#take a description-like syntax and return the list of words, filtering  no negation
def split_des_in_list(des) :
    
    cats = re.sub("[()]", "", des).strip().lower().split(',')    
    final_list = []
    
    for c in cats :        
        words_list = c.strip().split(" ")
        
        if (("no" not in words_list) and ("without" not in words_list)) :
            final_list.extend([c.strip().lower() for c in words_list if c != "with"])
    
    return final_list
    
    
#return [w.strip() for c in des.split(",") for w in c.strip().lower().split(' ')]

#singularize a word if plural
def singularize_word(x) :
    if engine.singular_noun(x) :
        return engine.singular_noun(x)
    else :
        return x
        
#clean the description(lowercases, strips, singularization)
def format_long_des(x) :
        
    if str(x) == "nan" :
        return ""    
    
    split = split_des_in_list(x)
    
    #decompose description
    words = [c for c in split]
    
    #singularize words
    sing_words = [singularize_word(x) for x in words]
    
    #rebuild description
    return str(" ".join(sing_words))

def concat_common_and_des(common, des) :
    if common != "" :
        return common.split(" ") + des.split(" ")
    else :
        return des.split(" ")

In [6]:
#generate singularization engine
engine = inflect.engine()

#columns we want to retrieve from the database files
columns = ["food_id", "food_group_id", "long_description", "common_names"]
use_cols = [0, 1, 2, 4]

#get the info from file
food_des = pd.read_csv(food_des_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

#display original table format
print_bold("original table format : ")
print(food_des.head())


# generate search_words
food_des['search_words'] = food_des.apply(lambda row : concat_common_and_des(format_long_des(row['common_names']),
                                                                             format_long_des(row['long_description'])),
                                                                             axis=1)

#drop common_names column
food_des = food_des.drop("common_names", axis=1)
            
print_bold("\n\nnew table format")
food_des.head()

[1moriginal table format : [0m
   food_id  food_group_id            long_description common_names
0     1001            100              Butter, salted          NaN
1     1002            100  Butter, whipped, with salt          NaN
2     1003            100       Butter oil, anhydrous          NaN
3     1004            100                Cheese, blue          NaN
4     1005            100               Cheese, brick          NaN
[1m

new table format[0m


Unnamed: 0,food_id,food_group_id,long_description,search_words
0,1001,100,"Butter, salted","[butter, salted]"
1,1002,100,"Butter, whipped, with salt","[butter, whipped, salt]"
2,1003,100,"Butter oil, anhydrous","[butter, oil, anhydrou]"
3,1004,100,"Cheese, blue","[cheese, blue]"
4,1005,100,"Cheese, brick","[cheese, brick]"


#### drop the unwanted food groups

In [7]:
print("number of entries before full food groups dropping : ", len(food_des), "\n")


#drop whole categories
food_des = food_des[~food_des["food_group_id"].isin([300, 800, 1800, 2100, 2200, 2500, 3600])]

print("number of entries before after food groups dropping : ", len(food_des), "\n")

#drop parts of categories

food_des = food_des[~((food_des["food_group_id"]==1400) & (~food_des['search_words'].apply(lambda x : "water" in x)))]
food_des = food_des[~((food_des["food_group_id"]==400) & (food_des['search_words'].apply(lambda x : "dressing" in x)))]
food_des = food_des[~((food_des["food_group_id"]==600) & (food_des['search_words'].apply(lambda x : "soup" in x)))]
food_des = food_des[~((food_des["food_group_id"]==1900) & (food_des['search_words'].apply(lambda x : "baking" in x)))]

print("number of entries after partial food groups dropping : ", len(food_des), "\n")


number of entries before full food groups dropping :  7793 

number of entries before after food groups dropping :  6058 

number of entries after partial food groups dropping :  5495 



#### Drop lines with unwanted categories

In [8]:
#define categories
type_1_categories = set(["cooked", "roasted", "boiled", "grilled", "braised", 
                     "ready-to-serve", "fried", "baked", "pan-fried", "oven-roasted"])

print("items count before type 1 deletion : ", len(food_des))

#delete items that have type 1 categories
food_des = food_des[food_des['search_words'].apply(lambda x : len(set(x).intersection(type_1_categories)) == 0)]

print("items count after type 1 deletion : ", len(food_des))

items count before type 1 deletion :  5495
items count after type 1 deletion :  3545


#### define a search function for mapping recipes ingredients

In [9]:
food_des[food_des['long_description'].str.contains(' table')]

Unnamed: 0,food_id,food_group_id,long_description,search_words
49,1050,100,"Cream, fluid, light (coffee cream or table cream)","[cream, fluid, light, coffee, cream, or, table..."
308,2047,200,"Salt, table","[salt, table]"
5602,19113,1900,"Syrups, table blends, pancake, with butter","[syrup, table, blend, pancake, butter]"
5617,19128,1900,"Syrups, table blends, pancake, reduced-calorie","[syrup, table, blend, pancake, reduced-calorie]"
5618,19129,1900,"Syrups, table blends, pancake","[syrup, table, blend, pancake]"
5698,19225,1900,"Desserts, rennin, tablets, unsweetened","[dessert, rennin, tablet, unsweetened]"
5777,19337,1900,"Sweeteners, tabletop, aspartame, EQUAL, packets","[sweetener, tabletop, aspartame, equal, packet]"
5788,19360,1900,"Syrups, table blends, pancake, with 2% maple","[syrup, table, blend, pancake, 2%, maple]"
5789,19362,1900,"Syrups, table blends, corn, refiner, and sugar","[syrup, table, blend, corn, refiner, and, sugar]"
5852,19720,1900,"Syrups, table blends, pancake, with 2% maple, ...","[syrup, table, blend, pancake, 2%, maple, adde..."


In [10]:
def search_ingredient(ingredient, print_search_candidates=False) :
    
    #do not penalize the presence of those words, 'table' is for the salt
    non_complexificators = set(["fresh", "raw", "skin", "peel", "whole"])
    
    def search_score(categories, ing_words) :
        
        # singularize search words
        ing_words = set([singularize_word(x) for x in ing_words])
        
        #prioritize matching query terms
        nb_matching = len(ing_words.intersection(set(categories)))
        
        #non_complexificators should not be penalized,ignore them AFTER computing number of matching words
        categories = [c for c in categories if (c not in non_complexificators)]
        
        
        #matching keywords one by one 
        matching = [len(set([x]).intersection(ing_words)) != 0 for x in categories]
            
        
        #first keywords are more important
        weights = np.linspace(2, 1, num=len(matching))
        weights = weights / sum(weights)
        
        #the query should have as many ingredients words as possible
        score = (10 * nb_matching) + sum([c[0] * c[1] for c in zip(matching, weights)])
        
        return score
    
    
    ing_words = set(ingredient.split(" "))       
    
    #compute search score for each entry and sort them by score (descending order)
    food_des["search_score"] = food_des["search_words"].apply(lambda x : search_score(x, ing_words))  
    food_des_sorted = food_des.sort_values(by=['search_score'], ascending=False)

    #print best candidates
    if print_search_candidates :
        print(food_des_sorted.head())

    #best score
    result = food_des_sorted[["food_id", "search_words", "search_score"]].head(1)        

    #check if we found a positive score
    if result["search_score"].values[0] != 0 :
        return result, result["search_score"].values[0]
    else :
        return None, 0
    
    
" ".join(search_ingredient("asparagus", print_search_candidates=True)[0]['search_words'].values[0])

      food_id  food_group_id  \
2439    11011           1100   
2443    11018           1100   
2442    11015           1100   
2937    11707           1100   
2441    11013           1100   

                                       long_description  \
2439                                     Asparagus, raw   
2443                      Asparagus, frozen, unprepared   
2442                  Asparagus, canned, drained solids   
2937  Asparagus, canned, no salt added, solids and l...   
2441  Asparagus, canned, regular pack, solids and li...   

                                           search_words  search_score  
2439                                    [asparagu, raw]     11.000000  
2443                     [asparagu, frozen, unprepared]     10.444444  
2442                 [asparagu, canned, drained, solid]     10.333333  
2937             [asparagu, canned, solid, and, liquid]     10.266667  
2441  [asparagu, canned, regular, pack, solid, and, ...     10.190476  


'asparagu raw'

In [11]:
ingredients_ids = {}
total_ing_count = 0
mapped_ing_count = 0
ingredients = json.load(open("./../generated/ingredients_count.json"))['count']

for k, v in tqdm.tqdm(ingredients.items()) :
    #print(k, v)
    cats, score = search_ingredient(k, print_search_candidates=False)
    total_ing_count += v
    
    #we found at list one matching word
    if score >= 10 :
        mapped_ing_count += v
        ingredients_ids[k] = int(cats['food_id'].values[0])
        
#save mapping
json.dump(ingredients_ids, open("./../generated/ing_usda_mapping.json", 'w'))

100%|██████████| 5163/5163 [33:56<00:00,  2.55it/s]


> Good, the mapping is done ! How many ingredients were we able to map effectively ?

In [12]:
print_bold("{0:.2f}".format(100 * len(ingredients_ids) / len(ingredients)) + "% of ingredients were mapped succesfully")

[1m86.85% of ingredients were mapped succesfully[0m


> It occurs that some ingredients are much more common that others, thus, it is more important to map those ingredients than to map the less frequent ones. What if we weight the success rate given the number of times the ingredients appear in recipes ?

In [13]:
print_bold("you have {0:.2f}".format(100 * mapped_ing_count / total_ing_count) + "% of chance to find a matching database entry for any recipe ingredients entry")

[1myou have 96.52% of chance to find a matching database entry for any recipe ingredients entry[0m


> Pretty good, but it is not worth a lot if we have a low-quality mapping. What are the mappings for the most frequent ingredients ?

In [14]:
mapping = json.load(open("./../generated/ing_usda_mapping.json"))
[(c, food_des[food_des['food_id'] == mapping[c]]['long_description'].values[0]) for c in mapping][:20]

[('salt', 'Salt, table'),
 ('onion', 'Onions, raw'),
 ('butter', 'Butter, without salt'),
 ('water', 'Water, bottled, generic'),
 ('egg', 'Egg, whole, raw, fresh'),
 ('sugar', 'Sugars, brown'),
 ('olive oil', 'Oil, olive, salad or cooking'),
 ('black pepper', 'Spices, pepper, black'),
 ('pepper', 'Pepper, banana, raw'),
 ('garlic', 'Garlic, raw'),
 ('all-purpose flour', 'Wheat flour, white, all-purpose, unenriched'),
 ('tomato', 'Tomato powder'),
 ('milk', 'Milk, dry, whole, without added vitamin D'),
 ('garlic clove', 'Garlic, raw'),
 ('vegetable oil', 'Vegetable oil, palm kernel'),
 ('cilantro', 'Coriander (cilantro) leaves, raw'),
 ('parsley', 'Parsley, fresh'),
 ('vanilla', 'Vanilla extract'),
 ('ginger', 'Ginger root, raw'),
 ('lemon juice', 'Lemon juice, raw')]

> The results are good enough, let's just correct the tomato and milk mappings not to feel guilty

In [15]:
# investigate the tomato entries
food_des[food_des['search_words'].apply(lambda x : 'tomato' in x)]

Unnamed: 0,food_id,food_group_id,long_description,search_words,search_score
1398,6972,600,"Sauce, tomato chili sauce, bottled, with salt","[sauce, tomato, chili, sauce, bottled, salt]",0.0
2836,11527,1100,"Tomatoes, green, raw","[tomato, green, raw]",0.0
2837,11529,1100,"Tomatoes, red, ripe, raw, year round average","[tomato, red, ripe, raw, year, round, average]",0.0
2839,11531,1100,"Tomatoes, red, ripe, canned, packed in tomato ...","[tomato, red, ripe, canned, packed, in, tomato...",0.0
2840,11533,1100,"Tomatoes, red, ripe, canned, stewed","[tomato, red, ripe, canned, stewed]",0.0
2841,11537,1100,"Tomatoes, red, ripe, canned, with green chilies","[tomato, red, ripe, canned, green, chily]",0.0
2842,11540,1100,"Tomato juice, canned, with salt added","[tomato, juice, canned, salt, added]",0.0
2843,11546,1100,"Tomato products, canned, paste, without salt a...","[tomato, product, canned, paste]",0.0
2844,11547,1100,"Tomato products, canned, puree, without salt a...","[tomato, product, canned, puree]",0.0
2845,11548,1100,Tomato powder,"[tomato, powder]",0.0


In [16]:
# investigate the milk entries
food_des[food_des['search_words'].apply(lambda x : 'milk' in x)]

Unnamed: 0,food_id,food_group_id,long_description,search_words,search_score
25,1026,100,"Cheese, mozzarella, whole milk","[cheese, mozzarella, whole, milk]",0.000000
26,1027,100,"Cheese, mozzarella, whole milk, low moisture","[cheese, mozzarella, whole, milk, low, moisture]",0.000000
27,1028,100,"Cheese, mozzarella, part skim milk","[cheese, mozzarella, part, skim, milk]",0.000000
35,1036,100,"Cheese, ricotta, whole milk","[cheese, ricotta, whole, milk]",0.000000
36,1037,100,"Cheese, ricotta, part skim milk","[cheese, ricotta, part, skim, milk]",0.000000
57,1059,100,"Milk, filled, fluid, with blend of hydrogenate...","[milk, filled, fluid, blend, of, hydrogenated,...",0.000000
58,1060,100,"Milk, filled, fluid, with lauric acid oil","[milk, filled, fluid, lauric, acid, oil]",0.000000
66,1071,100,"Dessert topping, powdered, 1.5 ounce prepared ...","[dessert, topping, powdered, 1.5, ounce, prepa...",0.000000
70,1076,100,"Milk substitutes, fluid, with lauric acid oil","[milk, substitute, fluid, lauric, acid, oil]",0.000000
71,1077,100,"Milk, whole, 3.25% milkfat, with added vitamin D","[milk, whole, 3.25%, milkfat, added, vitamin, d]",0.000000


In [17]:
mapping['milk']   = 1078
mapping['tomato'] = 11695

#save corrected mapping
json.dump(mapping, open("./../generated/ing_usda_mapping.json", 'w'))

# nutrient and RDI linking

#### import RDI values

In [48]:
rdi = pd.read_excel("./../data/RDI.xlsx")
rdi_nutrients = rdi['nutrient'].values
rdi_nutrients

array(['alpha-linoleic acid', 'Biotin', 'Calcium', 'Carbohydrate',
       'Chloride', 'Choline', 'Chromium', 'Copper', 'Fat', 'Fiber',
       'Fluoride', 'Folate', 'Iodine', 'Iron', 'Linoleic acid',
       'Magnesium', 'Manganese', 'Molybdenum', 'Monousaturated fat',
       'Niacin', 'Pantothenic acid', 'Phosphorus', 'Polyunsaturated fat',
       'Potassium', 'Protein', 'Riboflavin', 'Saturated fat', 'Selenium',
       'Sodium', 'Sugar', 'Thiamin', 'Vitamin A', 'Vitamin B6',
       'Vitamin B12', 'Vitamin C', 'Vitamin D', 'Vitamin E', 'Vitamin K',
       'Water', 'Zinc'], dtype=object)

#### Load and filter Nutrient definition

In [49]:
use_cols = [0, 1, 2, 3]

columns = ["nutrient_id", "units", "tagname", "description"]

nut_def = pd.read_csv(nut_def_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

nut_def.head(5)

Unnamed: 0,nutrient_id,units,tagname,description
0,203,g,PROCNT,Protein
1,204,g,FAT,Total lipid (fat)
2,205,g,CHOCDF,"Carbohydrate, by difference"
3,207,g,ASH,Ash
4,208,kcal,ENERC_KCAL,Energy


#### map rdi values with corresponding USDA nutrients

In [50]:
#try to map rdi elements with database elements automatically
mapping = {}
still_unmapped = list(rdi_nutrients)

for r in rdi_nutrients :
    
    mapped_count = 0
    sev_des = []
    
    for des in nut_def['description'].values :
        if ((r in des) or (des in r)) :
            
            sev_des.append(des)
            mapped_count += 1
            
            
    if mapped_count == 1 :
        mapping[r] = sev_des[0]
        still_unmapped.remove(r)
    
    elif mapped_count > 1:
        print("\nconflict for ", r , " : ")
        [print("\t- ", c) for c in sev_des]
        print("\n")
            
    else :
        print("No mapping for ", r)

print_bold("mapping found for the following nutrients : ")
mapping

No mapping for  alpha-linoleic acid
No mapping for  Biotin
No mapping for  Chloride
No mapping for  Chromium

conflict for  Fat  : 
	-  Fatty acids, total trans
	-  Fatty acids, total saturated
	-  Fatty acids, total monounsaturated
	-  Fatty acids, total polyunsaturated
	-  Fatty acids, total trans-monoenoic
	-  Fatty acids, total trans-polyenoic



conflict for  Folate  : 
	-  Folate, total
	-  Folate, food
	-  Folate, DFE


No mapping for  Iodine
No mapping for  Linoleic acid
No mapping for  Molybdenum
No mapping for  Monousaturated fat
No mapping for  Polyunsaturated fat
No mapping for  Saturated fat

conflict for  Vitamin A  : 
	-  Vitamin A, IU
	-  Vitamin A, RAE


No mapping for  Vitamin B6
No mapping for  Vitamin B12

conflict for  Vitamin D  : 
	-  Vitamin D
	-  Vitamin D2 (ergocalciferol)
	-  Vitamin D3 (cholecalciferol)
	-  Vitamin D (D2 + D3)



conflict for  Vitamin E  : 
	-  Vitamin E (alpha-tocopherol)
	-  Vitamin E, added


[1mmapping found for the following nutrients 

{'Calcium': 'Calcium, Ca',
 'Carbohydrate': 'Carbohydrate, by difference',
 'Choline': 'Choline, total',
 'Copper': 'Copper, Cu',
 'Fiber': 'Fiber, total dietary',
 'Fluoride': 'Fluoride, F',
 'Iron': 'Iron, Fe',
 'Magnesium': 'Magnesium, Mg',
 'Manganese': 'Manganese, Mn',
 'Niacin': 'Niacin',
 'Pantothenic acid': 'Pantothenic acid',
 'Phosphorus': 'Phosphorus, P',
 'Potassium': 'Potassium, K',
 'Protein': 'Protein',
 'Riboflavin': 'Riboflavin',
 'Selenium': 'Selenium, Se',
 'Sodium': 'Sodium, Na',
 'Sugar': 'Sugars, total',
 'Thiamin': 'Thiamin',
 'Vitamin C': 'Vitamin C, total ascorbic acid',
 'Vitamin K': 'Vitamin K (phylloquinone)',
 'Water': 'Water',
 'Zinc': 'Zinc, Zn'}

In [51]:
# solve conficlts manually
mapping['Folate'] = "Folate, total"
mapping['Vitamin A'] = "Vitamin A, RAE"
mapping['Vitamin D'] = "Vitamin D (D2 + D3)"
mapping["Vitamin E"] = "Vitamin E (alpha-tocopherol)"
mapping["Monounsaturated fat"] = "Fatty acids, total monounsaturated"
mapping["Polyunsaturated fat"] = "Fatty acids, total polyunsaturated"
mapping["Saturated fat"] = "Fatty acids, total saturated"
mapping['alpha-linoleic acid'] = "Alanine"
mapping["Vitamin B6"] = "Vitamin B-6"
mapping["Vitamin B12"] = "Vitamin B-12"
mapping['Fat'] = "Total lipid (fat)"

#we do not want to keep the USDA name, change it in the database
mapping['Linoleic acid'] = "Linoleic acid"
nut_def["description"] = nut_def["description"].replace("18:2 undifferentiated", "Linoleic acid")



conflicts_solved = ["Folate", "Vitamin A", "Vitamin D", "Vitamin E", "Saturated fat",
                   "Monounsaturated fat", "Polyunsaturated fat", "alpha-linoleic acid",
                   "Vitamin B6", "Vitamin B12", "Fat", "Linoleic acid"]

still_unmapped = [su for su in still_unmapped if (su not in conflicts_solved)]


print_bold("no mapping found for the following nutrients : ")

still_unmapped

[1mno mapping found for the following nutrients : [0m


['Biotin',
 'Chloride',
 'Chromium',
 'Iodine',
 'Molybdenum',
 'Monousaturated fat']

#### change elements names in rdi data, add nutrient_id column

In [52]:
# change rdi elements names
rdi['nutrient'] = rdi['nutrient'].apply(lambda x : mapping[x] if x in mapping.keys() else x)
rdi = rdi[~(rdi['nutrient'].apply(lambda x : x in still_unmapped))]
rdi.set_index("nutrient", inplace=True)

#add nutrient_id column
#rdi["nutrient_id"] = rdi['element'].apply(lambda x : nut_def[nut_def['description'] == x]['nutrient_id'].values[0])

rdi.head()

#filter nut_def to keep only mapped elements
nut_def = nut_def[nut_def['description'].apply(lambda x : x in rdi.index.values)]

#### change RDI values to match USDA units

In [53]:
def change_value(unit, value) :
    if unit == 'µg' :
        return float(value) * 1000
    elif unit == 'g' :
        return float(value) / 1000
    else :
        return float(value)
    
#change values
rdi['Male_RDI(19-30)'] = pd.Series([change_value(c[0], c[1]) for c in zip(nut_def['units'].values, rdi['Male_RDI(19-30)'])]).values
rdi['Female_RDI(19-30)'] = pd.Series([change_value(c[0], c[1]) for c in zip(nut_def['units'].values, rdi['Female_RDI(19-30)'])]).values

#save new RDI file as csv
rdi.to_csv("./../generated/matching_rdi.csv")

rdi.head()

Unnamed: 0_level_0,Male_RDI(19-30),Female_RDI(19-30)
nutrient,Unnamed: 1_level_1,Unnamed: 2_level_1
Alanine,1.6,1.1
"Calcium, Ca",1.0,1.0
"Carbohydrate, by difference",130.0,130.0
"Choline, total",0.55,0.425
"Copper, Cu",0.0009,0.0009


# load and filter nutrient data

#### load nutrient data

In [54]:
use_cols = [0, 1, 2]

columns = ["food_id", "nutrient_id", "nutr_per_100g"]

nut_data = pd.read_csv(nut_data_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

nut_data.head()

Unnamed: 0,food_id,nutrient_id,nutr_per_100g
0,1001,208,717.0
1,1001,262,0.0
2,1001,263,0.0
3,1001,268,2999.0
4,1001,301,24.0


#### keep only nutrients that are included in RDI

In [55]:
print_bold("length before filtering : " + str(len(nut_data)))

#drop non-exploitable lines
nut_data = nut_data[nut_data['nutrient_id'].apply(lambda x : x in nut_def['nutrient_id'].values)]

print_bold("length after filtering : " + str(len(nut_data)))

#replace id by name to have more convenient reading
nut_data['nutrient'] = nut_data['nutrient_id'].apply(lambda x : nut_def[nut_def['nutrient_id'] == x]['description'].values[0])
nut_data = nut_data.drop("nutrient_id", axis=1)

[1mlength before filtering : 644125[0m
[1mlength after filtering : 229821[0m


In [56]:
#what average percentage of info do we have per mapped food ?
perc = 100 * nut_data.groupby('food_id').count().mean()[0] / len(rdi)
print_bold("percentage of available nutritional information : " + str(perc))

[1mpercentage of available nutritional information : 86.73734346811996[0m


> Pretty nice ! 

#### add rdi percentage columns

In [57]:
nut_data["percentage_male_rdi"] = nut_data.apply(lambda row : row['nutr_per_100g'] / rdi.loc[row['nutrient']]['Male_RDI(19-30)'], axis=1)
nut_data["percentage_female_rdi"] = nut_data.apply(lambda row : row['nutr_per_100g'] / rdi.loc[row['nutrient']]['Female_RDI(19-30)'], axis=1)
nut_data.head()

Unnamed: 0,food_id,nutr_per_100g,nutrient,percentage_male_rdi,percentage_female_rdi
4,1001,24.0,"Calcium, Ca",24.0,24.0
5,1001,2.0,"Magnesium, Mg",0.005,0.006452
6,1001,24.0,"Phosphorus, P",0.034286,0.034286
7,1001,24.0,"Potassium, K",5e-06,5e-06
8,1001,643.0,"Sodium, Na",0.428667,0.428667


#### pivot table to facilitate search by food id, save the resulting dataframe

In [58]:
#pivot table
nut_data = nut_data.pivot(index='food_id', columns='nutrient', values=['nutr_per_100g', 'percentage_male_rdi', 'percentage_female_rdi'])

#save table using h5 (easier for multi-index table storage)
nut_data.to_hdf('./../generated/nut_data.h5','table', append=True)

In [59]:
#check that the storage went fine
a = pd.read_hdf('./../generated/nut_data.h5', 'table', where=['index>2'])
a.loc[1001]['nutr_per_100g']

nutrient
Alanine                                 0.029
Calcium, Ca                            24.000
Carbohydrate, by difference             0.060
Choline, total                         18.800
Copper, Cu                              0.000
Fatty acids, total polyunsaturated      3.043
Fatty acids, total saturated           51.368
Fiber, total dietary                    0.000
Fluoride, F                             2.800
Folate, total                           3.000
Iron, Fe                                0.020
Linoleic acid                           2.728
Magnesium, Mg                           2.000
Manganese, Mn                           0.000
Niacin                                  0.042
Pantothenic acid                        0.110
Phosphorus, P                          24.000
Potassium, K                           24.000
Protein                                 0.850
Riboflavin                              0.034
Selenium, Se                            1.000
Sodium, Na               