In [42]:
import sys
print(sys.version)

2.7.13 |Anaconda 4.4.0 (x86_64)| (default, Dec 20 2016, 23:05:08) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


## 구현에 필요한 함수 모듈들 구축

#### load_embeddings(path), nemb[1:]
#### load_vocab(path), vocab[1:]
#### load_recipes(path), recipes
#### run_tsne(nemb,multicore), tsne.fit_transform(nemb)
#### build_food2cusine(recipes, vocab)
#### make_plot(name, points, labels, legend_labels, legend_order, legend_label_to_color, pretty_legend_label, publish)

In [43]:
import os
import sklearn.manifold
import matplotlib.pyplot as plt
import h5py
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
import numpy as np
import collections
import pandas as pd
import itertools
import seaborn as sns
import time
import json
import re
from MulticoreTSNE import MulticoreTSNE as TSNE
# %load_ext wurlitzer

offline.init_notebook_mode()

flatten = lambda l: [item for sublist in l for item in sublist]

def load_embeddings(path):
    f = h5py.File(path, 'r')
    nemb = f['nemb'][:]
    f.close()
    return nemb[1:]


def load_vocab(path):
    vocab = []
    with open(path, 'r') as f:
        for line in f.readlines():
            split = line.split(' ')
            vocab.append((split[0].replace('\'', ''), int(split[1].rstrip())))
    # ignore UNK at position 0
    return vocab[1:]

def load_recipes(path):
    recipes = []
    with open(path, 'r') as f:
        for line in f:
            if line[0] == '#':
                pass
            else:
                recipes.append(line.rstrip().split(','))
    return recipes

def run_tsne(nemb, multicore=True):
    if multicore:
        tsne = TSNE(n_jobs=4)
    else:
        tsne = sklearn.manifold.TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, verbose=1)
    return tsne.fit_transform(nemb)
    
def build_food2cuisine(recipes, vocab):
    foods = [tup[0] for tup in vocab]
    food_counters = {food: collections.Counter() for food in foods}
    cuisine_counter = collections.Counter()
    for line in recipes:
            cuisine = line[0]
            cuisine_counter.update([cuisine])
            for food in line[1:]:
                if food in foods:
                    food_counters[food].update([cuisine])
    food2cuisine = {}
    for food, food_counter in food_counters.items():
        for cuisine in cuisine_counter.keys():
            food_counter[cuisine] = np.float32(food_counter[cuisine]) / np.float32(cuisine_counter[cuisine])
        sorted_food_counter = sorted(food_counter.items(), key=lambda a: a[1])[::-1]
        print(food, sorted_food_counter[0:2])
        food2cuisine.update({food: sorted_food_counter[0][0]})
    return food2cuisine


# These are the "Tableau 20" colors as RGB.    
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),    
             (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),    
             (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),    
             (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),    
             (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]    
tableau20_rgb = ['rgb' + str(triplet) for triplet in tableau20]

# Prettify ingredients
pretty_food = lambda s: ' '.join(s.split('_')).capitalize().lstrip()
# Prettify cuisine names
pretty_cuisine = lambda s: ''.join(map(lambda x: x if x.islower() else " "+x, s)).lstrip()


def make_plot(name, points, labels, legend_labels, legend_order, legend_label_to_color, pretty_legend_label, publish):
    lst = zip(points, labels, legend_labels)
    full = sorted(lst, key=lambda x: x[2])
    traces = []
    for legend_label, group in itertools.groupby(full, lambda x: x[2]):
        group_points = []
        group_labels = []
        for tup in group:
            point, label, _ = tup
            group_points.append(point)
            group_labels.append(label)
        group_points = np.stack(group_points)
        traces.append(go.Scattergl(
            x = group_points[:, 0],
            y = group_points[:, 1],
            mode = 'markers',
            marker = dict(
                color = legend_label_to_color[legend_label],
                size = 8,
                opacity = 0.6,
                #line = dict(width = 1)
            ),
            text = ['{} ({})'.format(label, pretty_legend_label(legend_label)) for label in group_labels],
            hoverinfo = 'text',
            name = legend_label
        )
        )
    # order the legend
    ordered = [[trace for trace in traces if trace.name == lab] for lab in legend_order]
    traces_ordered = flatten(ordered)
    def _set_name(trace):
        trace.name = pretty_legend_label(trace.name)
        return trace
    traces_ordered = list(map(_set_name, traces_ordered))
    layout = go.Layout(
        xaxis=dict(
            autorange=True,
            showgrid=False,
            zeroline=False,
            showline=False,
            autotick=True,
            ticks='',
            showticklabels=False
        ),
        yaxis=dict(
            autorange=True,
            showgrid=False,
            zeroline=False,
            showline=False,
            autotick=True,
            ticks='',
            showticklabels=False
        )
    )
    fig = go.Figure(data=traces_ordered, layout=layout)
    if publish:
        plotter = py.iplot
    else:
        plotter = offline.plot
    plotter(fig, filename=name + '.html')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## Load Data

In [44]:
# path = '/home/jaan/fit/food2vec/2017-01-24'
path = '../fit/'

# 104534 embedded vectors - unique recipes?
nemb = load_embeddings(os.path.join(path, 'embeddings.h5'))

# 2088 vocabs - unique ingredients
vocab = load_vocab(os.path.join(path, 'vocab.txt'))

food2id = {tup[0]: i for i, tup in enumerate(vocab)}

for i in food2id:
    print i, food2id[i]

equal_sweetener 1306
roasted_tomatoes 672
mackerel 670
stone_ground_cornmeal 1235
tartar_sauce 1081
bucatini 1210
flanken_short_ribs 1751
linguini 1402
sweet_pickle_relish 1498
pizza_doughs 466
pancetta 373
chat_masala 1244
candied_cherries 1593
baking_powder 99
smoked_gouda 954
unsalted_dry_roast_peanuts 666
chocolate_chips 1184
old_el_paso_flour_tortillas 2018
serrano_chilies 564
baby_greens 1121
mild_green_chiles 1384
black_pepper 18
frozen_peaches 1589
flounder_fillets 1419
garden_peas 1179
gremolata 1995
peanut_butter 143
cooking_oil 211
pak_choi 1460
tilapia_fillets 587
low_sodium_taco_seasoning 1642
chambord 1770
idaho_potatoes 1219
jamaican_jerk_spice 1978
yellow_chives 1825
orzo 535
blackening_seasoning 1936
black_mustard_seed_oil 919
water_chestnuts 341
celery_oil 142
red_bean 864
ravva 1437
almond 80
bacon 63
lavender 559
millet 2019
soppressata 1185
chinese_chives 811
powdered_sugar 207
mascarpone 415
milk_fat 149
galangal 550
lima_beans 869
croutons 777
old_el_paso_taco_se

vegan_worcestershire_sauce 2079
mashed_potatoes 641
ravioli 1209
pace_chunky_salsa 1628
cooked_meatballs 1737
graham_cracker_crusts 1441
seasoning_salt 445
red_wine 114
squash 141
arrowroot 1253
pickling_spices 1068
elbow_pasta 1807
sweet_chili_sauce 475
chunky_pasta_sauce 1742
coco 1728
chocolate_shavings 1149
passion_fruit 1087
twists 1962
tortilla_wraps 1952
shredded_carrots 404
casings 1821
red_kidney_bean 660
frozen_mixed_berries 1434
dry_rub 1795
lumpia_wrappers 1199
mild_salsa 1863
dri_leav_rosemari 1869
bows 1877
macaroni 65
catfish 347
queso_asadero 1567
oil 93
canned_chopped_tomatoes 1580
merguez_sausage 1691
fruit 189
tortilla_shells 1521
taro 1378
salmon_roe 1298
southern_comfort 1859
brown_rice 254
flaked 1939
broth 516
tapenade 1311
veal 239
grappa 1546
curry_leaves 330
figs 853
blanched_almonds 524
fresh_green_peas 1503
posole 1835
old_el_paso_enchilada_sauce 1531
cockles 1585
other_vegetables 1946
steamer 1470
cabernet_sauvignon 1588
cilantro 25
rose 504
gram_flour 859


## Plot ingredients
Using tnse, dimention reduction

In [45]:
# don't plot UNK at position 0
low_dim_embs = run_tsne(nemb.astype(np.float64), multicore=False)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2087
[t-SNE] Computed conditional probabilities for sample 2000 / 2087
[t-SNE] Computed conditional probabilities for sample 2087 / 2087
[t-SNE] Mean sigma: 0.281192
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.272252
[t-SNE] Error after 200 iterations: 1.272252


In [46]:
recipes = load_recipes('../dat/kaggle_and_nature.csv')

for i in recipes[:10]:
    print i

['African', 'chicken', 'cinnamon', 'soy_sauce', 'onion', 'ginger']
['African', 'cane_molasses', 'ginger', 'cumin', 'garlic', 'tamarind', 'bread', 'coriander', 'vinegar', 'onion', 'beef', 'cayenne', 'parsley', 'wheat_bread', 'yogurt', 'vegetable_oil', 'egg']
['African', 'butter', 'pepper', 'onion', 'cardamom', 'cayenne', 'ginger', 'cottage_cheese', 'garlic', 'brassica']
['African', 'olive_oil', 'pepper', 'wheat', 'beef', 'onion', 'cardamom', 'cumin', 'garlic', 'rice', 'leek']
['African', 'honey', 'wheat', 'yeast']
['African', 'tomato', 'cilantro', 'lemon_juice', 'onion', 'cayenne', 'scallion']
['African', 'wheat', 'cassava']
['African', 'olive_oil', 'onion', 'potato', 'black_pepper', 'cumin', 'carrot', 'cabbage', 'turmeric']
['African', 'tomato', 'fenugreek', 'pepper', 'onion', 'potato', 'black_pepper', 'ginger', 'carrot', 'garlic', 'vegetable_oil', 'cabbage', 'turmeric']
['African', 'banana']


In [47]:
food2cuisine = build_food2cuisine(recipes, vocab)

for i in food2cuisine[:10]:
    print i
    

('equal_sweetener', [('SoutheastAsian', 0.00083986565), ('WesternEuropean', 0.00044286979)])
('roasted_tomatoes', [('LatinAmerican', 0.0031954255), ('African', 0.0025575447)])
('mackerel', [('EastAsian', 0.0056489576), ('WesternEuropean', 0.0011809862)])
('stone_ground_cornmeal', [('LatinAmerican', 0.00025227043), ('NorthAmerican', 0.00021813581)])
('tartar_sauce', [('WesternEuropean', 0.00059049309), ('LatinAmerican', 0.00033636056)])
('bucatini', [('SouthernEuropean', 0.0011285089), ('EastAsian', 0.0)])
('flanken_short_ribs', [('NorthernEuropean', 0.0027063598), ('EastAsian', 0.00067249493)])
('linguini', [('SouthernEuropean', 0.00063478627), ('SoutheastAsian', 0.00027995522)])
('sweet_pickle_relish', [('SoutheastAsian', 0.00055991043), ('WesternEuropean', 0.00014762327)])
('pizza_doughs', [('SouthernEuropean', 0.0075469036), ('LatinAmerican', 0.00058863102)])
('pancetta', [('SouthernEuropean', 0.011144026), ('WesternEuropean', 0.0014762327)])
('chat_masala', [('SouthAsian', 0.003316

('pear', [('EastAsian', 0.018426362), ('MiddleEastern', 0.010852713)])
('peas', [('NorthernEuropean', 0.0094722603), ('SouthAsian', 0.0093974574)])
('dashi', [('EastAsian', 0.016946873), ('LatinAmerican', 8.4090141e-05)])
('herbs', [('SoutheastAsian', 0.0058790594), ('SouthAsian', 0.0022111663)])
('brandy', [('WesternEuropean', 0.018600531), ('MiddleEastern', 0.017054264)])
('diced_pimentos', [('NorthAmerican', 0.00013088148), ('LatinAmerican', 8.4090141e-05)])
('light_mayonnaise', [('NorthernEuropean', 0.0027063598), ('LatinAmerican', 0.0018499832)])
('chili_paste', [('SoutheastAsian', 0.013157895), ('EastAsian', 0.0084734363)])
('bocconcini', [('SouthernEuropean', 0.0011990408), ('EastAsian', 0.0)])
('masa', [('LatinAmerican', 0.0021022537), ('EastAsian', 0.0)])
('giardiniera', [('LatinAmerican', 0.00042045073), ('SouthernEuropean', 0.00035265906)])
('golden_raisins', [('African', 0.04177323), ('NorthernEuropean', 0.01623816)])
('(10_oz.)_frozen_chopped_spinach', [('EastAsian', 0.0),

TypeError: unhashable type

In [48]:
with open('../dat/food2cuisine.json', 'w') as f:
    json.dump(food2cuisine, f, indent=2)

In [49]:
cuisines = list(set(food2cuisine.values()))
# np.random.seed(1234)
# tableau20_sample = np.random.choice(tableau20_rgb, len(cuisines), replace=False)
# cuisine2color = {cuisine: tableau20_sample[i] for i, cuisine in enumerate(cuisines)}
cuisine2color = {
    'African': sns.xkcd_rgb["grey"],
    'LatinAmerican': sns.xkcd_rgb["forest green"],
    'NorthAmerican': sns.xkcd_rgb["light pink"],
    'MiddleEastern': sns.xkcd_rgb["mustard yellow"],
    'EastAsian': sns.xkcd_rgb["orange"],
    'SouthAsian': sns.xkcd_rgb["magenta"],
    'SoutheastAsian': sns.xkcd_rgb["purple"],
    'NorthernEuropean': sns.xkcd_rgb["blue"],
    'EasternEuropean': sns.xkcd_rgb["deep blue"],
    'WesternEuropean': sns.xkcd_rgb["sky blue"],
    'SouthernEuropean': sns.xkcd_rgb["olive"],
}
food2color = {food: cuisine2color[food2cuisine[food]] for food in food2cuisine.keys()}

In [50]:
legend_order = [
'African',
'LatinAmerican',
'NorthAmerican',
'EastAsian',
'SouthAsian',
'SoutheastAsian',
'MiddleEastern',
'NorthernEuropean',
'EasternEuropean',
'WesternEuropean',
'SouthernEuropean',
]

In [51]:
labels = [item[0] for item in vocab]
legend_labels = [food2cuisine[food] for food in labels]
labels = [item[0] for item in vocab]
labels = map(pretty_food, labels)
# legend_order = cuisine2color.keys()
make_plot(name='food2vec_food_embeddings_tsne',
          points=low_dim_embs, 
          labels=labels, 
          legend_labels=legend_labels, 
          legend_order=legend_order, 
          legend_label_to_color=cuisine2color, 
          pretty_legend_label=pretty_cuisine,
          publish=False)

0:88: execution error: "file:///Users/asd/coding/food2vec/src/food2vec_food_embeddings_tsne.html"이(가) ‘open location’ 메시지를 인식하지 못합니다. (-1708)


In [None]:
len(vocab)

## Plot recipes
NB: TSNE Takes ~10-30 minutes on 50k recipes

In [None]:
def build_recipe_embedding(recipes, nemb, food2id):
    """Get the recipe embedding.
    
    A recipe's embedding is the mean of its ingredients' embeddings.
    
    Args:
        recipes: list of recipes in the form [cuisine, food1, food2, ...]
        nemb: normalized embeddings
        food2id: map from food string to index in normalized embeddings
    Returns:
        List of tuples, each tuple has form (cuisine, ingredients, recipe embedding)
    """
    recipe_embeddings = []
    for line in recipes:
        cuisine = line.pop(0)
        foods = line
        # check that we have learned the embeddings for all the ingredients
        filtered_foods = [food for food in foods if food in food2id]
        if len(filtered_foods) > 0:
            food_ids = list(map(lambda x: food2id[x], filtered_foods))
            embedding = np.mean(nemb[food_ids], axis=0)
            recipe_embeddings.append((cuisine, foods, embedding))
    return recipe_embeddings

In [None]:
recipe_embeddings = build_recipe_embedding(recipes, nemb, food2id)

In [None]:
# subset = np.random.choice(range(len(recipe_embeddings)), 2000, replace=False)
# small = [recipe_embeddings[idx] for idx in subset]

In [None]:
cuisine_labels, ingredients, embeddings = zip(*recipe_embeddings)
cuisine_labels = list(cuisine_labels)
recipe_nemb = np.vstack(embeddings)

In [None]:
cuisine_counter = collections.Counter(cuisine_labels)

In [None]:
cuisine_counter

In [None]:
recipe_emb_path = os.path.join(path, 'low_dim_recipe_embs.npz')

In [None]:
%load_ext wurlitzer
t0 = time.time()
low_dim_recipe_embs = run_tsne(recipe_nemb.astype(np.float64))
np.savez_compressed(recipe_emb_path, low_dim_recipe_embs)
print('time to run tsne on %d points: %.3f mins' % (len(recipe_nemb), (time.time() - t0) / 60.))

In [None]:
with open(recipe_emb_path, 'rb') as f:
    low_dim_recipe_embs = np.load(f)['arr_0']

In [None]:
# low_dim_recipe_embs = run_fast_tsne(embeddings)
# low_dim_recipe_embs = tsne.bh_sne(embeddings)
# t0 = time.time()
# low_dim_recipe_embs = bhtsne.run_bh_tsne(nemb, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False,initial_dims=50, use_pca=True, max_iter=1000)
# print 'time to run tsne on %d points: %.3f mins' % (len(recipe_nemb), (time.time() - t0) / 60.)
low_dim_recipe_embs_list = low_dim_recipe_embs.tolist()

In [None]:
# clean_string = lambda x: re.sub(r'([^\s\w]|_)+', '', x)
recipe_labels = [', '.join([pretty_food(food) for food in foods]).lower().capitalize() for foods in ingredients]

In [None]:
make_plot(name='food2vec_recipe_embeddings_tsne',
          points=low_dim_recipe_embs_list, 
          labels=recipe_labels, 
          legend_labels=cuisine_labels, 
          legend_order=legend_order, 
          legend_label_to_color=cuisine2color, 
          pretty_legend_label=pretty_cuisine,
          publish=False)

## Cuisine embeddings

In [None]:
# cuisine embedding as the average of recipe embeddings:
recipe_embeddings[0]
sorted_recipe_embeddings = sorted(recipe_embeddings, key=lambda x: x[0])
cuisine_embeddings = []
for cuisine_name, group in itertools.groupby(sorted_recipe_embeddings, lambda x: x[0]):
    cuisine_recipe_emb = []
    for tup in group:
        _, _, recipe_emb = tup
        cuisine_recipe_emb.append(recipe_emb)
    all_cuisine_recipe_emb = np.stack(cuisine_recipe_emb)
    cuisine_emb = np.mean(all_cuisine_recipe_emb, axis=0)
    cuisine_embeddings.append((cuisine_name, cuisine_emb))  


# cuisine embedding as the average of food embeddings with highest relative prevalence in that cuisine
# def reverse_dict(mydict):
#     reversed_dict = collections.defaultdict(list)
#     for key,value in mydict.iteritems():
#         reversed_dict[value].append(key)
#     return reversed_dict
# cuisine2foods = reverse_dict(food2cuisine)
# cuisine_embeddings = []
# for cuisine, foods in cuisine2foods.items():
#     food_ids = [food2id[food] for food in foods]
#     food_embs = nemb[food_ids]
#     cuisine_embeddings.append((cuisine, np.mean(food_embs, axis=0)))

In [None]:
t0 = time.time()
cuisine_names, cuisine_emb = zip(*cuisine_embeddings)
cuisine_emb = np.asarray(cuisine_emb)
low_dim_cuisine_embs = run_tsne(cuisine_emb, multicore=False)
print('time to run tsne on %d points: %.3f mins' % (len(cuisine_emb), (time.time() - t0) / 60.))

In [None]:
make_plot(name='food2vec_cuisine_embeddings_tsne',
          points=low_dim_cuisine_embs, 
          labels=cuisine_names, 
          legend_labels=cuisine_names, 
          legend_order=legend_order, 
          legend_label_to_color=cuisine2color, 
          pretty_legend_label=pretty_cuisine,
          publish=False)

## Write foods to json

In [None]:
foods = [tup[0] for tup in vocab]
food2prettyfood = [{"value": food, "text": pretty_food(food)} for food in foods]
food2prettyfood.append([{"value": tup[0], "text": pretty_cuisine(tup[0])} for tup in cuisine_embeddings])
with open(os.path.join(path, 'foods.json'), 'w') as f:
    json.dump(food2prettyfood, f, indent=4)

In [None]:
def write_to_js(words, embeddings, path):
    word_vecs = {}
    for word, embedding in zip(words, embeddings):
        word_vecs[word] = embedding.tolist()
    with open(path, 'w') as f:
        f.write('var wordVecs=')
        json.dump(word_vecs, f)
        f.write(';')
# lower precision, faster
# nemb = nemb.astype(np.float16)
words = [pretty_food(food) for food in foods] + [pretty_cuisine(tup[0]) for tup in cuisine_embeddings]
all_emb = np.vstack([nemb, cuisine_emb])
# '../../word2vecjson/data/foodVecs.js'
write_to_js(words, all_emb, path=os.path.join(path, 'foodVecs.js'))

In [None]:
# print list of foods for autocomplete in assets/js/initm.js
string = str({word: None for word in words})
with open(os.path.join(path, 'javascript_dict.txt'), 'w') as f:
    f.write(string.replace('None', 'null'))