# imports

In [1]:
import pandas as pd
import numpy as np
import re

# files paths

In [2]:
food_des_path = "data/FOOD_DES.txt"
food_groups_path = "data/FD_GROUP.txt"
nut_data_path = "data/NUT_DATA.txt"
nut_def_path = "data/NUTR_DEF.txt"

all_paths = [food_des_path, food_groups_path, nut_data_path, nut_def_path]

# tilde removal

In [3]:
#text columns are indicated by '~', remove them
for p in all_paths :
    string = open(p).read()
    new_str = re.sub('[~]', '', string)
    open(p, 'w').write(new_str)

# Food group description

In [4]:
columns = ["food_group_id", "food_group_name"]

food_groups = pd.read_csv(food_groups_path, sep="^", encoding="ISO-8859-1", names=columns, header=None)

food_groups.set_index("food_group_id", inplace=True)

food_groups

Unnamed: 0_level_0,food_group_name
food_group_id,Unnamed: 1_level_1
100,Dairy and Egg Products
200,Spices and Herbs
300,Baby Foods
400,Fats and Oils
500,Poultry Products
600,"Soups, Sauces, and Gravies"
700,Sausages and Luncheon Meats
800,Breakfast Cereals
900,Fruits and Fruit Juices
1000,Pork Products


# food description table

In [5]:
use_cols = [0, 1, 2, 3, 4]

columns = ["food_id", "food_group_id", "long_description", "short_description", "common_names"]

food_des = pd.read_csv(food_des_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

food_des.head(34)

Unnamed: 0,food_id,food_group_id,long_description,short_description,common_names
0,1001,100,"Butter, salted","BUTTER,WITH SALT",
1,1002,100,"Butter, whipped, with salt","BUTTER,WHIPPED,W/ SALT",
2,1003,100,"Butter oil, anhydrous","BUTTER OIL,ANHYDROUS",
3,1004,100,"Cheese, blue","CHEESE,BLUE",
4,1005,100,"Cheese, brick","CHEESE,BRICK",
5,1006,100,"Cheese, brie","CHEESE,BRIE",
6,1007,100,"Cheese, camembert","CHEESE,CAMEMBERT",
7,1008,100,"Cheese, caraway","CHEESE,CARAWAY",
8,1009,100,"Cheese, cheddar (Includes foods for USDA's Foo...","CHEESE,CHEDDAR",
9,1010,100,"Cheese, cheshire","CHEESE,CHESHIRE",


# we decide to drop :

* Baby food (300)
* dressing in 400
* soup in 600
* Breakfast Cereals (800)
* Beverages (1400)
* Baked products (1800) 
* Sweets (1900) except Baking products (chocolate)
* Fast Foods (2100)
* Meals, Entrees, and Side Dishes (2200)
* Snacks (2500)
* Restaurant food (3600)


In [6]:
print(food_des.count(), "\n")

#drop whole categories
food_des = food_des[~food_des["food_group_id"].isin([300, 800, 1400, 1800, 2100, 2200, 2500, 3600])]

print(food_des.count(), "\n")

#drop parts of categories
food_des = food_des[~ ((food_des["food_group_id"]==400) & (food_des["long_description"].str.contains("dressing")))]
food_des = food_des[~ ((food_des["food_group_id"]==600) & (food_des["long_description"].str.contains("soup")))]
food_des = food_des[~ ((food_des["food_group_id"]==1900) & (~food_des["long_description"].str.contains("Baking")))]

print(food_des.count())

food_id              7793
food_group_id        7793
long_description     7793
short_description    7793
common_names         1070
dtype: int64 

food_id              5692
food_group_id        5692
long_description     5692
short_description    5692
common_names          923
dtype: int64 

food_id              5277
food_group_id        5277
long_description     5277
short_description    5277
common_names          918
dtype: int64


# Studying food decription categories

In [7]:
#look at long descriptions, extract keywords and count them
categories = food_des["long_description"].apply(lambda x : [c.strip() for c in x.split(",")]).aggregate(sum)
pd.Series(categories).value_counts()[:60]

cooked                           1711
raw                              1329
Beef                              959
separable lean and fat            771
separable lean only               661
boneless                          487
trimmed to 0" fat                 467
canned                            430
trimmed to 1/8" fat               399
roasted                           359
boiled                            355
choice                            338
Pork                              324
fresh                             306
Lamb                              295
imported                          289
drained                           279
all grades                        255
braised                           246
select                            243
loin                              239
Fish                              230
frozen                            226
with salt                         226
without salt                      224
Chicken                           201
New Zealand 

## Because we are only considering ingredients, we drop the following categories :
* cooked
* roasted
* boiled
* grilled
* braised
* Babyfood
* Soup
* Candies
* Cereals ready-to-eat
* ready-to-serve
* fried
* baked
* pan-fried

In [9]:
to_remove = ["cooked", "roasted", "boiled", "grilled", "braised", "Babyfood", "Soup",
             "Candies", "Cereals ready-to-eat", "ready-to-serve", "fried", "baked", "pan-fried"]

for tr in to_remove :
    food_des = food_des[~food_des["long_description"].str.contains(tr)]
    
food_des.count()

food_id              3111
food_group_id        3111
long_description     3111
short_description    3111
common_names          419
dtype: int64

# Nutrient data

In [8]:
use_cols = [0, 1, 2]

columns = ["food_id", "nutrient_id", "nutr_per_100g"]

nut_data = pd.read_csv(nut_data_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

nut_data.head()

Unnamed: 0,food_id,nutrient_id,nutr_per_100g
0,1001,208,717.0
1,1001,262,0.0
2,1001,263,0.0
3,1001,268,2999.0
4,1001,301,24.0


In [10]:
#pivot table to have all nutrients per food_id
nut_data = nut_data.pivot(index='food_id', columns='nutrient_id', values='nutr_per_100g')
nut_data.head(10)

nutrient_id,203,204,205,207,208,209,210,211,212,213,...,696,697,851,852,853,855,856,857,858,859
food_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,0.85,81.11,0.06,2.11,717.0,,,,,,...,,,0.315,,,,,,,
1002,0.49,78.3,0.0,1.62,731.0,,,,,,...,,0.0,0.285,0.005,0.081,,0.008,,0.02,
1003,0.28,99.48,0.0,0.0,876.0,,,,,,...,,,,,,,,,,
1004,21.4,28.74,2.34,5.11,353.0,,,,,,...,,,,,,,,,,
1005,23.24,29.68,2.79,3.18,371.0,,,,,,...,,,,,,,,,,
1006,20.75,27.68,0.45,2.7,334.0,,,,,,...,,,,,,,,,,
1007,19.8,24.26,0.46,3.68,300.0,,,,,,...,,,,,,,,,,
1008,25.18,29.2,3.06,3.28,376.0,,,,,,...,,,,,,,,,,
1009,22.87,33.31,3.37,3.71,403.0,,0.0,0.26,0.0,0.12,...,,0.0,0.108,0.001,0.036,,0.002,,0.009,
1010,23.37,30.6,4.78,3.6,387.0,,,,,,...,,,,,,,,,,


# Nutrient definition

In [11]:
use_cols = [0, 1, 2, 3]

columns = ["nutrient_id", "units", "tagname", "description"]

nut_def = pd.read_csv(nut_def_path, sep="^", encoding="ISO-8859-1", names=columns, usecols=use_cols, header=None)

nut_def.head(5)

Unnamed: 0,nutrient_id,units,tagname,description
0,203,g,PROCNT,Protein
1,204,g,FAT,Total lipid (fat)
2,205,g,CHOCDF,"Carbohydrate, by difference"
3,207,g,ASH,Ash
4,208,kcal,ENERC_KCAL,Energy


In [12]:
# wtf
nut_def[["description", "tagname"]][80:]

Unnamed: 0,description,tagname
80,Glutamic acid,GLU_G
81,Glycine,GLY_G
82,Proline,PRO_G
83,Serine,SER_G
84,Hydroxyproline,HYP
85,"Vitamin E, added",
86,"Vitamin B-12, added",
87,Cholesterol,CHOLE
88,"Fatty acids, total trans",FATRN
89,"Fatty acids, total saturated",FASAT


# food profiles

In [13]:
#ignore this cell
def group_id(group_name) :
    return food_groups.index[food_groups["food_group_name"] == group_name].tolist()[0]


#retrieve nteresting ids
fruits_group_id = group_id("~Fruits and Fruit Juices~")
vegetables_goup_id = group_id("~Vegetables and Vegetable Products~")
spices_group_id = group_id("~Spices and Herbs~")
nut_group_id = group_id("~Nut and Seed Products~")

print("fruit id : ", fruits_group_id)

#retrieve food description
all_fruits_names = food_des[(food_des["food_group_id"] == fruits_group_id)]
all_fruits_names = all_fruits_names[all_fruits_names["short_description"].str.contains("RAW") 
                                   & ~all_fruits_names["short_description"].str.contains("SMOOTHIE")
                                   & ~all_fruits_names["short_description"].str.contains("CND")]

all_fruits_names



IndexError: list index out of range