#### usual imports

In [17]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import tqdm

#### read kaggle datasets

In [2]:
kaggle_all_ingr = pd.concat((pd.read_json('../data/kaggle/train.json')['ingredients'],
                             pd.read_json('../data/kaggle/test.json')['ingredients']))

#### read cleaned ingredients

In [3]:
ingredients = open("../data/recipeClean/ingredients.txt", mode='r', buffering=-1, encoding="ISO-8859-1", errors=None, newline=None, closefd=True, opener=None)
content = ingredients.readlines()

#### read ingredients mapping 

In [4]:
ing_mapping = json.load(open("./../generated/ing_usda_mapping.json"))

#### read nutritional informations

In [33]:
nut_data = pd.read_hdf('./../generated/nut_data.h5', 'table', where=['index>2'])

## Study Ingredients cleaning loss
Sometimes, as we dropped all the 'junk words', no words remain in the description. How frequent is it ?

In [5]:
ingr_list = list()

for recipe in kaggle_all_ingr:
    for ingr in recipe:
        ingr_list.append(ingr)
        
for i in range(len(content)):
    line = content[i].split('\t')
    if len(line) > 4:
        ingredients = line[4].split('|')
        for ingr in ingredients:
            ingr_list.append(ingr)

In [6]:
n_ingredients_raw = len(ingr_list)

In [7]:
data_path = '../generated/'
cookies_recipes = pd.read_json(data_path + 'clean_cookies_recipes.json')
kaggle_recipes = pd.read_json(data_path + 'clean_kaggle_recipes.json')

In [8]:
cookies_ingredients = cookies_recipes['recipe']
kaggle_ingredients = kaggle_recipes['recipe']

In [9]:
ingr_list = list()

for recipe in kaggle_ingredients:
    for ingr in recipe:
        ingr_list.append(ingr)
for recipe in cookies_ingredients:
    for ingr in recipe:
        ingr_list.append(ingr)

In [10]:
n_ingredients_clean = len(ingr_list)

In [11]:
print("We keep "+str(100*n_ingredients_clean/n_ingredients_raw)[:5]+"% of the ingredients")

We keep 97.99% of the ingredients


## Study USDA ingredients mapping
Are there ingredients that are mapped to the same entry ?

In [32]:
mapped_ids = [ing_mapping[k] for k in ing_mapping]
nb_collisions = len(mapped_ids) - len(set(mapped_ids))

#build dict to store collisions
collisions = {}
for m in tqdm.tqdm(ing_mapping) :
    if ing_mapping[m] not in collisions.keys() :
        collisions[ing_mapping[m]] = [m]
        
    else :
        collisions[ing_mapping[m]].append(m)
    

print("How many ingredients were mapped with at least one other ? ", nb_collisions, "( ", 100 * nb_collisions / len(ing_mapping), "%)")
print("\nHere is an example !\n")
[print(c, collisions[c]) for i, c in enumerate(collisions) if i == 0]
print("\nAfter mapping , we get " + str(len(collisions)) + " ingredients (reduction by factor " + str(len(ing_mapping) /len(collisions))+ ")")

100%|██████████| 4484/4484 [00:00<00:00, 592709.31it/s]

How many ingredients were mapped with at least one other ?  3326 (  74.17484388938448 %)

Here is an example !

2047 ['salt', 'kosher salt', 'sea salt', 'coarse salt', 'salt to taste', 'seasoning salt', 'teaspoon(s) salt', 'coarse kosher salt', 'dash salt', 'coarse sea salt', 'pickling salt', 'dash of salt', 'rock salt', 'sea salt to taste', 'canning salt', 'about salt', 'kosher salt to taste', 'gray salt', 'black salt', 'seasoning salt to taste', 'teaspoon(s) kosher salt', 'himalayan salt', 'maldon sea salt', 'celtic salt', 'hawaiian sea salt', 'morton salt', 'scant salt', 'about kosher salt', 'season salt', 'kosher salt more to taste', 'curing salt', 'margarita salt', "lawry's salt", 'salt ', 'salt substitute', 'teaspoon(s) coarse salt', 'celtic sea salt', 'salt cod']

After mapping , we get 1158 ingredients (reduction by factor 3.872193436960276)





> This is very nice, similar ingredients have been mapped together !