In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import tqdm

### read data

In [63]:
#read cleaned ingredients
ingredients = open("../data/recipeClean/ingredients.txt", mode='r', buffering=-1, encoding="ISO-8859-1", errors=None, newline=None, closefd=True, opener=None)
content = ingredients.readlines()

#read usda mapping
ing_mapping = json.load(open("./../generated/ing_usda_mapping.json"))

#read usda id description
item_describe = json.load(open("./../generated/usda_id_describe.json"))


### study mapping

> Ingredients have been mapped to USDA food items. We first identify the ingredients that have been mapped to the same food item and find a representative name for them.

#### spot collisions

In [3]:
mapped_ids = [ing_mapping[k] for k in ing_mapping]

#build dict to store collisions
collisions = {}

for m in tqdm.tqdm(ing_mapping) :
    if ing_mapping[m] not in collisions.keys() :
        collisions[ing_mapping[m]] = [m]
        
    else :
        collisions[ing_mapping[m]].append(m)

100%|██████████| 4448/4448 [00:00<00:00, 760828.03it/s]


#### find representative name for each group

In [52]:
import collections


item_number = 43
proportion = 0.5

no_representative_keys = []
representative_keys = []

for i, c in tqdm.tqdm(enumerate(collisions)) :
    
    all_items = " ".join(collisions[c]).split(" ")
    counter = collections.Counter()
    counter.update(all_items)

    #find common names
    common_names = [x[0] for x in counter.most_common() if x[1] > len(collisions[c])*proportion]

    #order them if multiple
    if len(common_names) == 0 :
        no_representative_keys.append(c)

    elif len(common_names) == 1 :
        representative_keys.append((c, common_names[0]))

    else :
        #determine order

        #case 1, there exist an entry with only wanted words
        exact_match = [x.split(" ") for x in collisions[c] if (len(set(common_names).difference(set(x.split(" "))))== 0)]
        if len(exact_match) != 0 :
            representative_keys.append((c, " ".join(exact_match[0])))

        #case 2, no exact match
        else :

            all_words_collisions = [x.split(" ") for x in collisions[c] if set(common_names).issubset(set(x.split(" ")))]

            index_tuples = [ (word, index) for collision in all_words_collisions for index, word in enumerate(collision) if (word in common_names)]

            index_counts = np.array([0]*len(common_names))

            #average the relative indices
            for it in index_tuples :
                word_index = common_names.index(it[0])
                index_counts[word_index] += it[1]

            index_counts = index_counts

            common_names_ordered = " ".join([common_names[i] for i in np.argsort(index_counts)])
            representative_keys.append((c, common_names_ordered))



1156it [00:00, 60688.86it/s]


> Some ingredients may have been mapped to the same representative without having the same usda matching. Did it happened ?

In [69]:
con = collections.Counter()
con.update([x[1] for x in representative_keys])
len([c for c in con.most_common() if c[1] >=2])

39

In [70]:
representative_keys

[(2047, 'salt'),
 (11282, 'onion'),
 (1145, 'butter'),
 (14555, 'water'),
 (1123, 'egg'),
 (19334, 'sugar'),
 (2030, 'black pepper'),
 (4053, 'olive oil'),
 (11976, 'pepper'),
 (11215, 'garlic'),
 (20481, 'all-purpose flour'),
 (11695, 'tomato'),
 (1078, 'milk'),
 (4513, 'vegetable'),
 (11165, 'cilantro'),
 (11297, 'parsley'),
 (2050, 'vanilla'),
 (11216, 'ginger'),
 (9152, 'juice of lemon'),
 (16124, 'soy sauce'),
 (11124, 'carrot'),
 (19336, 'sugar'),
 (2014, 'cumin'),
 (2010, 'cinnamon'),
 (11292, 'green onion'),
 (2044, 'basil'),
 (1146, 'parmesan cheese'),
 (2027, 'oregano'),
 (11548, 'tomato'),
 (2049, 'thyme'),
 (5327, 'chicken breast'),
 (11362, 'potato'),
 (5277, 'chicken broth'),
 (14155, 'soda'),
 (9160, 'lime juice'),
 (2020, 'garlic powder'),
 (2009, 'chili powder'),
 (1056, 'sour cream'),
 (2004, 'bay'),
 (2028, 'spice'),
 (16117, 'flour'),
 (1270, 'cheddar cheese'),
 (9150, 'lemon'),
 (2031, 'cayenne pepper'),
 (4058, 'sesame oil'),
 (2025, 'nutmeg'),
 (11333, 'green bel

> It indeed happened, should we map them to the same usda entry ? let's look at the different ingredient sets

In [78]:
for c in con.most_common()[:4] :
    print("Group matching : ", c[0], " (", c[1], " times)")
    matching_keys = []
    for key in representative_keys :
        if key[1]== c[0] :
            print("\t ", item_describe[str(key[0])])
            
            for ing in collisions[key[0]] :
                print("\t\t ", ing)
            print("\n")
            
    print("\n")
    

Group matching :  tomato  ( 5  times)
	  Tomatoes, orange, raw
		  tomato
		  italian tomato
		  or tomato


	  Tomato powder
		  baking powder
		  cherry tomato
		  roma tomato
		  coriander powder
		  tomato purã©e
		  tomato soup
		  vine tomato
		  italian-style tomato
		  heirloom tomato
		  ounce tomato
		  rotel tomato
		  beefsteak tomato
		  fire tomato
		  teaspoon(s) baking powder
		  double-acting baking powder
		  muir glenâ® tomato
		  petite tomato
		  mexican-style tomato
		  ( oz) tomato
		  tomato pesto
		  san marzano tomato
		  oz tomato
		  calumet baking powder
		  tea powder
		  vine-ripe tomato
		  coarsely tomato
		  goldâ® tomato
		  double tomato
		  cream of tomato soup
		  cacao powder
		  -ounce tomato
		  sun tomato
		  teardrop tomato


	  Tomatoes, green, raw
		  green tomato
		  tomato salsa
		  campbell's tomato soup
		  tomato with green chiles
		  tomato with chiles


	  Tomatoes, yellow, raw
		  tomatoes -
		  yellow tomato
		  yellow cherry tomato

In [84]:
for ing in ing_mapping :
    if ing_mapping[ing] == 90480 :
        print(ing)

cane syrup
rice syrup


In [73]:
item_describe

{'2048': 'Vinegar, cider',
 '20481': 'Wheat flour, white, all-purpose, unenriched',
 '2050': 'Vanilla extract',
 '2049': 'Thyme, fresh',
 '2052': 'Vanilla extract, imitation, no alcohol',
 '2053': 'Vinegar, distilled',
 '2054': 'Capers, canned',
 '6150': 'Sauce, barbecue',
 '2055': 'Horseradish, prepared',
 '16398': 'Peanut butter, smooth style, without salt',
 '2063': 'Rosemary, fresh',
 '2064': 'Peppermint, fresh',
 '16399': 'Peanut butter with omega-3, creamy',
 '43026': 'Syrups, sugar free',
 '2039': 'Spices, savory, ground',
 '2068': 'Vinegar, red wine',
 '2069': 'Vinegar, balsamic',
 '14355': 'Beverages, tea, black, brewed, prepared with tap water',
 '2065': 'Spearmint, fresh',
 '43028': 'Jams and preserves, no sugar (with sodium saccharin), any flavor',
 '1226': 'Egg substitute, liquid or frozen, fat free',
 '2074': 'Seasoning mix, dry, sazon, coriander & annatto',
 '2075': 'Seasoning mix, dry, taco, original',
 '2076': 'Seasoning mix, dry, chili, original',
 '43031': 'Candies, 