# imports

In [None]:
import pandas as pd
import numpy as np
import re
import itertools
import json
import inflect

# file paths

In [None]:
food_des_path = "data/FOOD_DES.txt"
food_groups_path = "data/FD_GROUP.txt"
nut_data_path = "data/NUT_DATA.txt"
nut_def_path = "data/NUTR_DEF.txt"

all_paths = [food_des_path, food_groups_path, nut_data_path, nut_def_path]

# tilde removal

In [None]:
# Text fields in the raw data files are wrapped in '~'; strip every tilde in
# place so the files can be parsed as plain '^'-separated tables.
for p in all_paths :
    # Use the same encoding the tables are loaded with below (ISO-8859-1) so
    # the rewrite does not depend on the platform default encoding, and use
    # context managers so each handle is closed before the file is reopened.
    with open(p, encoding="ISO-8859-1") as source_file :
        string = source_file.read()
    new_str = string.replace('~', '')
    with open(p, 'w', encoding="ISO-8859-1") as target_file :
        target_file.write(new_str)

# Food group description

In [None]:
# Load the food-group lookup table (id -> name) and key it by group id.
columns = ["food_group_id", "food_group_name"]

food_groups = pd.read_csv(
    food_groups_path,
    sep="^",
    encoding="ISO-8859-1",
    header=None,
    names=columns,
).set_index("food_group_id")

# Notebook cell output: show the table.
food_groups

# food description table

In [None]:
def singularize_word(x) :
    """Return the singular form of ``x``, or ``x`` itself when there is none.

    Uses the module-level inflect ``engine``. Whenever
    ``engine.singular_noun`` gives back a falsy value the word is returned
    unchanged.
    """
    # Empty tokens (e.g. produced by consecutive spaces in a description)
    # have nothing to singularize.
    # NOTE(review): some inflect releases reportedly reject empty strings -
    # confirm against the installed version.
    if not x :
        return x

    # Query inflect once and reuse the answer instead of looking it up twice.
    singular = engine.singular_noun(x)
    return singular if singular else x
        
def format_long_des(x) :
    """Normalise a comma-separated description.

    Each comma-separated part is stripped, lower-cased and split on single
    spaces; every resulting word goes through ``singularize_word``. The words
    of a part are re-joined with one space and the parts with a bare comma.
    """
    rebuilt_parts = []
    for part in x.split(",") :
        tokens = part.strip().lower().split(' ')
        rebuilt_parts.append(" ".join(singularize_word(token) for token in tokens))
    return ",".join(rebuilt_parts)



# Shared inflect engine used by singularize_word.
engine = inflect.engine()

# Read columns 0, 1, 2 and 4 of the food description file.
use_cols = [0, 1, 2, 4]
columns = ["food_id", "food_group_id", "long_description", "common_names"]

food_des = pd.read_csv(
    food_des_path,
    sep="^",
    encoding="ISO-8859-1",
    header=None,
    names=columns,
    usecols=use_cols,
)

# Lower-case and singularize every word of the long description.
food_des['long_description'] = food_des['long_description'].apply(format_long_des)

# Summarise each food by its first two comma-separated description parts,
# stripped and joined with a single space.
food_des['categories'] = food_des['long_description'].apply(
    lambda description : " ".join(part.strip() for part in description.split(",")[:2])
)

# Keep one row per distinct summary, discard the full description, then turn
# the summary into a list of words.
food_des = food_des.drop_duplicates(subset=['categories']).drop("long_description", axis=1)
food_des['categories'] = food_des['categories'].apply(lambda summary : summary.split(" "))

# we decide to drop :

* Baby food (300)
* dressing in 400
* soup in 600
* Breakfast Cereals (800)
* Beverages (1400)
* Baked products (1800) 
* Sweets (1900) except Baking products (chocolate)
* Fast Foods (2100)
* Meals, Entrees, and Side Dishes (2200)
* Snacks (2500)
* Restaurant food (3600)


In [None]:
print(food_des.count(), "\n")

#drop whole categories
dropped_group_ids = [300, 800, 1400, 1800, 2100, 2200, 2500, 3600]
food_des = food_des[~food_des["food_group_id"].isin(dropped_group_ids)]

print(food_des.count(), "\n")

#drop parts of categories

# group 400: drop the rows whose category words include "dressing"
food_des = food_des[~((food_des["food_group_id"]==400) & (food_des['categories'].apply(lambda x : "dressing" in x)))]
# group 600: drop the rows whose category words include "soup"
food_des = food_des[~((food_des["food_group_id"]==600) & (food_des['categories'].apply(lambda x : "soup" in x)))]
# group 1900 (sweets): drop everything EXCEPT the baking products. The
# previous condition tested `"baking" in x`, which removed the baking items
# and kept the other sweets - the opposite of the rule listed above this cell.
food_des = food_des[~((food_des["food_group_id"]==1900) & (food_des['categories'].apply(lambda x : "baking" not in x)))]

print(food_des.count())

# recipe Ingredients loading

In [None]:
# Load the ingredient counts and keep only the ingredient names (the keys of
# the 'count' mapping). The context manager closes the file once parsed.
with open("generated/ingredients_count.json") as ingredients_file :
    ingredients = json.load(ingredients_file)
ingredients = list(ingredients['count'].keys())

#### Drop all rows that contain no ingredient words

In [None]:
# Overlap counts for the rows at positions 1..19. Both helpers used here are
# defined in a later cell, which therefore has to be run first.
category_values = food_des['categories'].values
[nb_in_ingredients(categories_words(category_values[position])) for position in range(1, 20)]

In [None]:
def categories_words(cat) :
    """Return the set of distinct words found in the strings of ``cat``.

    Every string is stripped and split on single spaces; each resulting piece
    is stripped again before being added to the set.
    """
    words = set()
    for entry in cat :
        for token in entry.strip().split(" ") :
            words.add(token.strip())
    return words

# Vocabulary of the recipe ingredients: all names joined with spaces, then
# split back into single words.
ingredients_words = set(" ".join(ingredients).split(" "))

# Vocabulary of the food table: every space-separated word of every entry of
# every 'categories' list.
category_words = {
    word
    for tokens in food_des["categories"].values
    for token in tokens
    for word in token.split(' ')
}

# Number of category words that never occur in the ingredients
print(len(category_words.difference(ingredients_words)))

# Number of ingredient words that never occur in the categories
print(len(ingredients_words.difference(category_words)))

def nb_in_ingredients(words_set) :
    """Count how many words of ``words_set`` also belong to ``ingredients_words``."""
    shared_words = words_set & ingredients_words
    return len(shared_words)


# For every food, count the category words that also occur in the ingredients.
food_des['common_db_words'] = food_des['categories'].map(
    lambda tokens : nb_in_ingredients(categories_words(tokens))
)

# Show the foods sharing no word with any ingredient (display only; nothing
# is removed from food_des here).
food_des[food_des['common_db_words'].eq(0)]

In [None]:
# Take the second ingredient and list every food whose category words share
# at least one word with it.
ing = ingredients[1]
print("search for ", ing)
food_des[food_des['categories'].apply(lambda tokens : not set(ing.split(" ")).isdisjoint(tokens))]

#### Type 1 categories

In [None]:
# Words that place a food in the "type 1" categories.
type_1_categories = {
    "cooked",
    "roasted",
    "boiled",
    "grilled",
    "braised",
    "ready-to-serve",
    "fried",
    "baked",
    "pan-fried",
}

print("items count before type 1 deletion : ", len(food_des))

# Keep only the rows whose category words share nothing with the type 1 set.
food_des = food_des[food_des['categories'].apply(lambda tokens : type_1_categories.isdisjoint(tokens))]

print("items count after type 1 deletion : ", len(food_des))

#### search

In [None]:
# Peek at the match score of the first row. The 'nb_w' column is written by
# search_ingredient, defined in a later cell, so that cell must have run.
food_des["nb_w"].head(1)

In [None]:
def search_ingredient(ingredient) :
    """Return the food_des row whose category words best match ``ingredient``.

    Each row is scored by the fraction of its distinct category words that
    also appear in the ingredient (split on single spaces). The scores are
    written to the ``nb_w`` column of the module-level ``food_des`` frame as
    a side effect.

    Returns a one-row DataFrame with the best-scoring row, or None (after
    printing "no result") when the table is empty or no row shares a word
    with the ingredient.
    """
    print("search for ", ingredient)
    ing_words = set(ingredient.split(" "))

    def match_ratio(words) :
        # Build the set of distinct words once per row.
        distinct_words = set(words)
        return len(distinct_words.intersection(ing_words)) / len(distinct_words)

    food_des["nb_w"] = food_des["categories"].apply(match_ratio)
    result = food_des.sort_values(by=['nb_w'], ascending=False).head(1)

    # An empty table has no best row; reading .values[0] would raise
    # IndexError, so report it like any other miss.
    if result.empty or result["nb_w"].values[0] == 0 :
        print("no result")
        return None

    return result

search_ingredient("honey")

# Nutrient data

In [None]:
# Load the first three columns of the nutrient data file: one row per
# (food, nutrient) pair with the value named nutr_per_100g.
columns = ["food_id", "nutrient_id", "nutr_per_100g"]
use_cols = [0, 1, 2]

nut_data = pd.read_csv(
    nut_data_path,
    sep="^",
    encoding="ISO-8859-1",
    header=None,
    names=columns,
    usecols=use_cols,
)

nut_data.head()

In [None]:
# Reshape to one row per food_id and one column per nutrient_id, holding the
# nutr_per_100g values.
nut_data = pd.pivot(nut_data, index='food_id', columns='nutrient_id', values='nutr_per_100g')
nut_data.head(10)

# Nutrient definition

In [None]:
# Load the first four columns of the nutrient definition file.
columns = ["nutrient_id", "units", "tagname", "description"]
use_cols = [0, 1, 2, 3]

nut_def = pd.read_csv(
    nut_def_path,
    sep="^",
    encoding="ISO-8859-1",
    header=None,
    names=columns,
    usecols=use_cols,
)

nut_def.head(5)

In [None]:
# Inspect the description and tag name of the nutrient definitions from row
# position 80 onwards.
nut_def.loc[:, ["description", "tagname"]].iloc[80:]

# food profiles

In [None]:
#ignore this cell
def group_id(group_name) :
    """Return the index label of the first food group named ``group_name``.

    Raises IndexError when no row of ``food_groups`` has that name.
    """
    name_matches = food_groups["food_group_name"] == group_name
    matching_ids = food_groups.index[name_matches].tolist()
    return matching_ids[0]


#retrieve interesting ids
# NOTE(review): the tilde-removal cell of this notebook strips every '~' from
# the data files, so group names loaded afterwards would not contain tildes
# and these lookups would find no match (group_id then fails on an empty
# list) - confirm before reusing this cell.
fruits_group_id = group_id("~Fruits and Fruit Juices~")
vegetables_goup_id = group_id("~Vegetables and Vegetable Products~")
spices_group_id = group_id("~Spices and Herbs~")
nut_group_id = group_id("~Nut and Seed Products~")

print("fruit id : ", fruits_group_id)

#retrieve food description
# Keep the foods of the fruit group, then only those whose short description
# contains "RAW" but neither "SMOOTHIE" nor "CND".
# NOTE(review): no 'short_description' column is loaded into food_des in this
# notebook (only food_id, food_group_id, long_description and common_names
# are read) - this filter presumably dates from an earlier version.
all_fruits_names = food_des[(food_des["food_group_id"] == fruits_group_id)]
all_fruits_names = all_fruits_names[all_fruits_names["short_description"].str.contains("RAW") 
                                   & ~all_fruits_names["short_description"].str.contains("SMOOTHIE")
                                   & ~all_fruits_names["short_description"].str.contains("CND")]

# Notebook cell output: show the selected rows.
all_fruits_names



In [None]:
#search carrot
word = "Carrot"
# The long_description column is dropped once 'categories' has been built,
# and descriptions are lower-cased before that, so look for the lower-cased
# word inside the category words instead of the removed column.
needle = word.lower()
food_des[food_des["categories"].apply(lambda tokens : any(needle in token for token in tokens))]