In [1]:
import collections
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

### read data

In [2]:
# Selects which ingredient -> USDA mapping this notebook works with:
# True uses the "high_score" input/output files, False the "low_score" ones.
high_score = True

In [3]:
def _load_json(path):
    """Parse the JSON file at ``path``, closing the file handle once it is read."""
    with open(path) as handle:
        return json.load(handle)


# read usda mapping (ingredient name -> usda id), high or low score variant
if high_score:
    ing_mapping = _load_json("./../generated/ing_usda_mapping_high_score.json")
else:
    ing_mapping = _load_json("./../generated/ing_usda_mapping_low_score.json")

# read usda id description (looked up later with the usda id converted to str)
item_describe = _load_json("./../generated/usda_id_describe.json")

# read cleaned recipes: the 'recipe' entry is a dict, only its values are kept,
# in the order they appear in the file
cleaned_kaggle_recipes = list(
    _load_json("./../generated/clean_kaggle_recipes.json")['recipe'].values()
)
cleaned_cookies_recipes = list(
    _load_json("./../generated/clean_cookies_recipes.json")['recipe'].values()
)

# read ingredients count
ingredients_count = _load_json("./../generated/ingredients_count.json")['count']


### study mapping

> Ingredients have been mapped to USDA food items. We first identify the ingredients that have been mapped to the same food item and find a representative name for them.

#### spot collisions

In [4]:
# every usda id that appears in the mapping, one entry per ingredient
# NOTE(review): not read by any visible cell; kept in case a later cell uses it
mapped_ids = list(ing_mapping.values())

# group the ingredient names by the usda id they were mapped to:
# an id whose list holds several names is a "collision"
collisions = {}

for ingredient, usda_id in tqdm.tqdm(ing_mapping.items()):
    collisions.setdefault(usda_id, []).append(ingredient)

100%|██████████| 1643/1643 [00:00<00:00, 290610.28it/s]


In [5]:
# inspect the groups: usda id -> list of ingredient names mapped to that id
collisions

{2047: ['salt'],
 11282: ['onion', 'raw onion'],
 1145: ['butter'],
 14555: ['water'],
 1123: ['egg', 'raw egg'],
 19908: ['sugar', 'turbinado', 'turbinado sugar'],
 2030: ['black pepper'],
 4053: ['olive oil', 'salad oil'],
 11981: ['pepper'],
 11215: ['garlic'],
 20481: ['all-purpose flour', 'all-purpose', 'all-purpose white flour'],
 11695: ['tomato'],
 1212: ['milk'],
 4669: ['vegetable oil'],
 11165: ['cilantro'],
 11297: ['parsley'],
 2050: ['vanilla'],
 11216: ['ginger', 'ginger root'],
 9152: ['lemon juice'],
 19334: ['brown sugar'],
 16124: ['soy sauce', 'tamari soy sauce', 'tamari'],
 11124: ['carrot', 'raw carrot'],
 2014: ['cumin', 'cumin seed'],
 28315: ['cinnamon'],
 11292: ['green onion', 'onion top', 'green onion top'],
 2044: ['basil'],
 1146: ['parmesan cheese'],
 2027: ['oregano'],
 18371: ['baking powder'],
 2049: ['thyme'],
 7935: ['chicken breast'],
 11362: ['potato', 'raw potato'],
 6194: ['chicken broth', 'soup'],
 18372: ['baking soda'],
 9160: ['lime juice'],


#### find representative name for each group

In [6]:
# NOTE(review): `item_number` is not read by any visible cell; kept in case it is used elsewhere
item_number = 43

# a word is "common" when it occurs more than `proportion` * (group size) times
# across the names of a group
proportion = 0.5

# usda id -> representative name of the ingredients mapped to that id
representative_keys = dict()

for usda_id, names in tqdm.tqdm(collisions.items()):

    tokenized = [name.split(" ") for name in names]
    counter = collections.Counter(word for words in tokenized for word in words)

    # find common names, most frequent first
    common_names = [word for word, count in counter.most_common()
                    if count > len(names) * proportion]

    if len(common_names) == 0:
        # no shared word: fall back to the database description
        representative_keys[usda_id] = item_describe[str(usda_id)]

    elif len(common_names) == 1:
        representative_keys[usda_id] = common_names[0]

    else:
        # several common words: their order has to be determined
        wanted = set(common_names)

        # case 1, there exists a name made of exactly the common words
        # (set equality, so names carrying extra words are not accepted)
        exact_match = [name for name, words in zip(names, tokenized)
                       if set(words) == wanted]

        if exact_match:
            representative_keys[usda_id] = exact_match[0]

        # case 2, no exact match: order the common words by the average
        # position they occupy in the names that contain them
        else:
            positions = {word: [] for word in common_names}
            for words in tokenized:
                for index, word in enumerate(words):
                    if word in positions:
                        positions[word].append(index)

            # every common word occurs at least once, so no list is empty
            mean_position = {word: sum(found) / len(found)
                             for word, found in positions.items()}

            # sorted() is stable: words with equal mean position keep their frequency order
            ordered = sorted(common_names, key=mean_position.__getitem__)
            representative_keys[usda_id] = " ".join(ordered)



1260it [00:00, 152926.18it/s]


In [7]:
# inspect the result: usda id -> representative name chosen for its group
representative_keys

{2047: 'salt',
 11282: 'onion',
 1145: 'butter',
 14555: 'water',
 1123: 'egg',
 19908: 'turbinado sugar',
 2030: 'black pepper',
 4053: 'oil',
 11981: 'pepper',
 11215: 'garlic',
 20481: 'all-purpose flour',
 11695: 'tomato',
 1212: 'milk',
 4669: 'vegetable oil',
 11165: 'cilantro',
 11297: 'parsley',
 2050: 'vanilla',
 11216: 'ginger',
 9152: 'lemon juice',
 19334: 'brown sugar',
 16124: 'tamari soy sauce',
 11124: 'carrot',
 2014: 'cumin',
 28315: 'cinnamon',
 11292: 'green onion top',
 2044: 'basil',
 1146: 'parmesan cheese',
 2027: 'oregano',
 18371: 'baking powder',
 2049: 'thyme',
 7935: 'chicken breast',
 11362: 'potato',
 6194: 'Soup, chicken broth, ready-to-serve',
 18372: 'baking soda',
 9160: 'lime juice',
 2020: 'garlic powder',
 2009: 'chili powder',
 1179: 'sour cream',
 2004: 'bay',
 2028: 'Spices, paprika',
 20063: 'flour',
 1270: 'cheddar',
 9156: 'lemon',
 2031: 'cayenne',
 4058: 'sesame oil',
 4572: 'nutmeg',
 25070: 'shrimp',
 19296: 'honey',
 9159: 'lime',
 4532:

> Several USDA food items may have received the same representative name, even though their ingredients were not mapped to the same USDA entry. Did it happen?

In [8]:
# count how many usda ids received each representative name,
# then report how many names are shared by two or more ids
con = collections.Counter(representative_keys.values())
sum(1 for times in con.values() if times >= 2)

24

In [9]:
# first ten (usda id, representative name) pairs
list(representative_keys.items())[:10]

[(2047, 'salt'),
 (11282, 'onion'),
 (1145, 'butter'),
 (14555, 'water'),
 (1123, 'egg'),
 (19908, 'turbinado sugar'),
 (2030, 'black pepper'),
 (4053, 'oil'),
 (11981, 'pepper'),
 (11215, 'garlic')]

> It did happen. Should we map them to the same USDA entry? Let's look at the ingredient sets behind each shared representative name.

In [10]:
# For each of the 10 most shared representative names, print every usda item
# that received this name together with the ingredient names grouped under it.
for term, times in con.most_common()[:10]:
    print("Representative term : ", term, " (", times, " times)")
    for key, representative in representative_keys.items():
        if representative == term:
            print("\tusda food item : ", item_describe[str(key)])

            for ing in collisions[key]:
                print("\t\tingredient : ", ing)
            print("\n")

    print("\n")
    

Representative term :  oil  ( 3  times)
	usda food item :  Oil, olive, salad or cooking
		ingredient :  olive oil
		ingredient :  salad oil


	usda food item :  Oil, hazelnut
		ingredient :  oil
		ingredient :  hazelnut oil


	usda food item :  Oil, corn and canola
		ingredient :  canola oil
		ingredient :  corn oil




Representative term :  flour  ( 3  times)
	usda food item :  Rye flour, dark
		ingredient :  flour


	usda food item :  Wheat flour, white, cake, enriched
		ingredient :  cake flour
		ingredient :  white wheat flour


	usda food item :  Pasta, gluten-free, corn flour and quinoa flour, cooked, ANCIENT HARVEST
		ingredient :  gluten-free flour
		ingredient :  quinoa flour




Representative term :  mushroom  ( 3  times)
	usda food item :  Mushrooms, maitake, raw
		ingredient :  mushroom


	usda food item :  Mushrooms, brown, italian, or crimini, raw
		ingredient :  crimini mushroom
		ingredient :  brown mushroom


	usda food item :  Mushrooms, portabella, raw
		ingredient

> We choose to keep the representative strings as they are, because we find them good enough.

In [11]:
# save the representative names next to the other generated files;
# the file name follows the mapping variant selected by `high_score`
if high_score:
    representative_path = "./../generated/high_score_key_representative.json"
else:
    representative_path = "./../generated/low_score_key_representative.json"

# the with-block closes the file, so the content is flushed to disk when the cell ends
with open(representative_path, 'w') as handle:
    json.dump(representative_keys, handle)

### rewrite recipes with usda_id, or original name if no id found

In [12]:
# Rewrite every recipe: ingredients absent from `ingredients_count` are dropped,
# mapped ingredients become "usda_id=<id>", the others keep their original name.
frequent_ingredients = set(ingredients_count.keys())

# kaggle recipes first, then cookies recipes (the source lists are left untouched)
all_recipes = cleaned_kaggle_recipes + cleaned_cookies_recipes

deleted = 0               # recipes left without any ingredient
representative_count = 0  # kept ingredients replaced by a usda id
unmapped_count = 0        # kept ingredients without usda id
total_count = 0           # kept ingredients overall
all_repr_recipes = []

for recipe in all_recipes:
    repr_ingredients = []

    for ing in recipe:
        if ing not in frequent_ingredients:
            continue

        total_count += 1

        if ing in ing_mapping:
            repr_ingredients.append("usda_id=" + str(ing_mapping[ing]))
            representative_count += 1
        else:
            repr_ingredients.append(ing)
            unmapped_count += 1

    if repr_ingredients:
        all_repr_recipes.append(repr_ingredients)
    else:
        deleted += 1

# guard against an empty input: no kept ingredient means nothing to divide by
if total_count:
    repr_perc = 100 * representative_count / total_count
    unmapped_perc = 100 * unmapped_count / total_count
else:
    repr_perc = unmapped_perc = 0.0

print("representative : ", repr_perc, "%")
print("unmapped : ", unmapped_perc, "%")
print(deleted, " recipes were deleted :(")

# save result; the with-block closes the file so the content is flushed to disk
if high_score:
    repr_recipes_path = "./../generated/high_score_repr_recipes.json"
else:
    repr_recipes_path = "./../generated/low_score_repr_recipes.json"

with open(repr_recipes_path, 'w') as handle:
    json.dump(all_repr_recipes, handle)

representative :  79.53772509292816 %
unmapped :  20.46227490707185 %
234  recipes were deleted :(


In [13]:
# distinct ingredient tokens after rewriting
final_ingredients = {ingr for rec in all_repr_recipes for ingr in rec}
print("Final nb of ingredients : ", len(final_ingredients))

# distinct ingredients per source dataset before rewriting
# (the two counts are summed, so an ingredient present in both datasets counts twice)
kaggle_nb = len({ingr for rec in cleaned_kaggle_recipes for ingr in rec})
cookies_nb = len({ingr for rec in cleaned_cookies_recipes for ingr in rec})
print("orignial nb : ", kaggle_nb + cookies_nb, " (=", kaggle_nb, "+", cookies_nb, ")" )

Final nb of ingredients :  4658
orignial nb :  32960  (= 5422 + 27538 )
