In [1]:
import gensim
import os
import collections
import smart_open
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import itertools
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline
import time
import seaborn as sns
import numpy as np
import multiprocessing

def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [2]:
"""
Load basic ingredients and compounds data

"""

# Directory that holds the ingredient / compound TSV files.
path = 'data'
ingr_info = os.path.join(path, 'ingr_info.tsv')
comp_info = os.path.join(path, 'comp_info.tsv')
ingr_comp = os.path.join(path, 'ingr_comp.tsv')


# {ingredient_id: [ingredient_name, ingredient_category]}
def load_ingredients(path):
    """Load the ingredient table from a tab-separated file.

    Lines starting with '#' are treated as comments and blank lines are
    skipped. Every other line is split on tabs: the first field is the
    ingredient id and the remaining fields are kept as a list.

    Args:
        path: Path of the TSV file to read.

    Returns:
        dict mapping each ingredient id (str) to the list of the remaining
        fields, i.e. ``{ingredient_id: [ingredient_name, ingredient_category]}``.
    """
    ingredients = {}
    with open(path, 'r') as f:
        for line in f:
            record = line.rstrip()
            # Skip blank lines as well as comments; a blank line would
            # otherwise be stored under the empty-string key.
            if not record or record[0] == '#':
                continue
            fields = record.split('\t')
            ingredients[fields[0]] = fields[1:]
    return ingredients

# {compound_id: [compound_name, CAS_number]}
def load_compounds(path):
    """Load the compound table from a tab-separated file.

    Lines starting with '#' are treated as comments and blank lines are
    skipped. Every other line is split on tabs: the first field is the
    compound id and the remaining fields are kept as a list.

    Args:
        path: Path of the TSV file to read.

    Returns:
        dict mapping each compound id (str) to the list of the remaining
        fields, i.e. ``{compound_id: [compound_name, CAS_number]}``.
    """
    compounds = {}
    with open(path, 'r') as f:
        for line in f:
            record = line.rstrip()
            # Skip blank lines as well as comments; a blank line would
            # otherwise be stored under the empty-string key.
            if not record or record[0] == '#':
                continue
            fields = record.split('\t')
            compounds[fields[0]] = fields[1:]
    return compounds

# {ingredient_id: [compound_id1, compound_id2, ...] }
def load_relations(path):
    """Load the ingredient -> compound relations from a tab-separated file.

    Lines starting with '#' are treated as comments and blank lines are
    skipped. On every other line the first tab-separated field is the
    ingredient id and the second is a compound id.

    Args:
        path: Path of the TSV file to read.

    Returns:
        dict mapping each ingredient id (str) to the list of its compound
        ids, in file order: ``{ingredient_id: [compound_id1, compound_id2, ...]}``.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than two fields.
    """
    relations = {}
    with open(path, 'r') as f:
        for line in f:
            record = line.rstrip()
            # A blank line has no second field and used to raise IndexError.
            if not record or record[0] == '#':
                continue
            fields = record.split('\t')
            ingredient_id = fields[0]
            compound_id = fields[1]
            relations.setdefault(ingredient_id, []).append(compound_id)
    return relations

# Build the three lookup tables from the TSV paths set at the top of this cell.
ingredients = load_ingredients(ingr_info)
compounds = load_compounds(comp_info)
relations = load_relations(ingr_comp)

def ingredient_to_category(tag, ingredients):
    """Return the category of the first ingredient whose name equals ``tag``.

    Args:
        tag: Ingredient name to look up.
        ingredients: Mapping of ingredient id -> [name, category, ...].

    Returns:
        The second field of the first matching entry, or None when no
        entry's name matches.
    """
    for ingr_id in ingredients:
        info = ingredients[ingr_id]
        if info[0] != tag:
            continue
        return info[1]
    return None

# Sanity check: look up the category of one ingredient name.
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(ingredient_to_category('copaiba', ingredients))



plant derivative


In [3]:
"""
Load train data and build train_corpus for Doc2Vec

"""
# Directory and file name of the Doc2Vec training corpus.
path = 'data'
train_file = os.path.join(path, 'ingredient2vec')

def read_corpus(fname, tokens_only=False):
    """Stream documents from the space-separated training file.

    Each line is split at its first space: the part before it is used as
    the document tag (the ingredient) and the rest is tokenised with
    ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file; it is read as iso-8859-1.
        tokens_only: When True, yield only the preprocessed token list of
            every line (no tag). Otherwise yield tagged documents and skip
            blank lines.

    Yields:
        A list of tokens, or a ``gensim.models.doc2vec.TaggedDocument``
        whose single tag is the ingredient.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for line in f:
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
                continue
            # Drop the line terminator before splitting; otherwise a line
            # holding only an ingredient gets a tag ending in '\n'.
            record = line.rstrip('\r\n')
            if not record.strip():
                # Blank line: there is no ingredient to tag.
                continue
            ingredient, sep, compound_text = record.partition(' ')
            yield gensim.models.doc2vec.TaggedDocument(
                gensim.utils.simple_preprocess(compound_text), [ingredient])

# Corpus tag to index
def tag_to_index(tags, corpus):
    """Return the position of the first document whose first tag equals ``tags``.

    Args:
        tags: Tag value to search for.
        corpus: Sequence of documents exposing a ``tags`` list.

    Returns:
        The index of the first match, or None when no document matches.
    """
    for position, document in enumerate(corpus):
        if document.tags[0] == tags:
            return position
    return None
     

# Corpus index to tag                    
def index_to_tag(index, corpus):
    """Return the ``tags`` attribute (the whole tag list) of ``corpus[index]``."""
    document = corpus[index]
    return document.tags
    

# Materialise the generator so the corpus can be indexed and iterated repeatedly.
train_corpus = list(read_corpus(train_file))

# Sanity check of the two lookup helpers.
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(index_to_tag(0, train_corpus))
print(tag_to_index('ruta_chalepensis_oil', train_corpus))

['ruta_chalepensis_oil']
0


In [4]:
# Threshold: keep only the documents that have more than 10 words.
train_corpus_th10 = [doc for doc in train_corpus if len(doc.words) > 10]

In [5]:
"""
Load functions for plotting a graph
"""

# Prettify ingredients
def pretty_food(s):
    """Replace underscores with spaces, upper-case the first character,
    lower-case the rest, then strip leading whitespace."""
    spaced = s.replace('_', ' ')
    return spaced.capitalize().lstrip()
# Prettify legend names (CamelCase -> separate words)
def pretty_category(s):
    """Insert a space before every upper-case character and strip leading whitespace.

    Only upper-case characters start a new word. Testing ``not islower()``
    instead also matched spaces and '/', which turned 'plant derivative'
    into 'plant  derivative' and 'nut/seed/pulse' into 'nut /seed /pulse'.
    """
    return ''.join(" " + ch if ch.isupper() else ch for ch in s).lstrip()

def make_plot_simple(name, points, labels, publish):
    """Plot 2-D points as a single black scatter trace with hover labels.

    Args:
        name: Base name of the output; '.html' is appended for the filename.
        points: Array indexed as ``points[:, 0]`` (x) and ``points[:, 1]`` (y).
        labels: Hover texts, passed as the trace's ``text``.
        publish: If truthy the figure is plotted with ``py.iplot``,
            otherwise with ``offline.plot``.
    """
    marker_style = dict(
        color=sns.xkcd_rgb["black"],
        size=8,
        opacity=0.6,
    )
    scatter = go.Scattergl(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        marker=marker_style,
        text=labels,
        hoverinfo='text',
    )

    def _hidden_axis():
        # Same settings for both axes: no grid, zero line, axis line or tick labels.
        return dict(
            autorange=True,
            showgrid=False,
            zeroline=False,
            showline=False,
            autotick=True,
            ticks='',
            showticklabels=False,
        )

    layout = go.Layout(xaxis=_hidden_axis(), yaxis=_hidden_axis())
    fig = go.Figure(data=[scatter], layout=layout)

    plotter = py.iplot if publish else offline.plot
    plotter(fig, filename=name + '.html')

def make_plot(name, points, labels, legend_labels, legend_order, legend_label_to_color, pretty_legend_label, publish):
    """Plot 2-D points as one scatter trace per legend label.

    Args:
        name: Base name of the output; '.html' is appended for the filename.
        points: Sequence of 2-D points, zipped with ``labels`` and
            ``legend_labels``.
        labels: Hover label of each point.
        legend_labels: Group key of each point; one trace is built per
            distinct value.
        legend_order: Legend labels in the order their traces should appear.
            Traces whose label is not listed here are left out of the figure.
        legend_label_to_color: Mapping of legend label -> marker colour.
        pretty_legend_label: Callable that formats a legend label for the
            hover text and the trace name.
        publish: If truthy the figure is plotted with ``py.iplot``,
            otherwise with ``offline.plot``.
    """
    # Sort by legend label so groupby sees every label as one contiguous run.
    rows = sorted(zip(points, labels, legend_labels), key=lambda row: row[2])

    traces = []
    for legend_label, members in itertools.groupby(rows, lambda row: row[2]):
        members = list(members)
        xy = np.stack([member[0] for member in members])
        member_labels = [member[1] for member in members]
        traces.append(go.Scattergl(
            x=xy[:, 0],
            y=xy[:, 1],
            mode='markers',
            marker=dict(
                color=legend_label_to_color[legend_label],
                size=8,
                opacity=0.6,
            ),
            text=['{} ({})'.format(point_label, pretty_legend_label(legend_label))
                  for point_label in member_labels],
            hoverinfo='text',
            name=legend_label,
        ))

    # Order the traces as requested, then switch their names to the display form.
    traces_ordered = []
    for wanted in legend_order:
        traces_ordered.extend([trace for trace in traces if trace.name == wanted])
    for trace in traces_ordered:
        trace.name = pretty_legend_label(trace.name)

    def _bare_axis():
        # Same settings for both axes: axis and zero lines shown, no grid or tick labels.
        return dict(
            autorange=True,
            showgrid=False,
            zeroline=True,
            showline=True,
            autotick=True,
            ticks='',
            showticklabels=False,
        )

    layout = go.Layout(xaxis=_bare_axis(), yaxis=_bare_axis())
    fig = go.Figure(data=traces_ordered, layout=layout)

    plotter = py.iplot if publish else offline.plot
    plotter(fig, filename=name + '.html')

In [6]:
"""
Train Doc2Vec Model

"""
time_start = time.time()

cores = multiprocessing.cpu_count()

#dm/m,d50,n5,w5,mc5,s0.001,t3
#model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=5, iter=55)

# load pre-trained character embeddings of flavor compounds
load_name = 'embeddings' + os.sep + 'embeddings_flavor_compounds_50dim.bin'

model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=5, iter=55)
model.build_vocab(train_corpus_th10)

#print model.docvecs.index_to_doctag(2), model.docvecs[2]

model.intersect_word2vec_format(load_name, lockf=1.0, binary=True, encoding='utf8', unicode_errors='strict')

%time model.train(train_corpus_th10, total_examples=model.corpus_count, epochs=model.iter)

print "Corpus_count:", model.corpus_count
print 'Doc2Vec training done! Time elapsed: {} seconds'.format(time.time()-time_start)

CPU times: user 3.56 s, sys: 641 ms, total: 4.2 s
Wall time: 2.34 s
Corpus_count: 514
Doc2Vec training done! Time elapsed: 2.41093397141 seconds


In [None]:
"""
Check rank of inferred_vector

"""

ranks = []
second_ranks = []
# Iterate the thresholded corpus: the model was built and trained on
# train_corpus_th10 only, so looking up the tag of a filtered-out document
# in the similarity list would raise ValueError.
for doc in train_corpus_th10:
    inferred_vector = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # Position of the document's own tag in the similarity ranking.
    rank = [docid for docid, sim in sims].index(doc.tags[0])
    ranks.append(rank)

    # Second entry of the ranking, kept as a (tag, similarity) pair.
    second_ranks.append(sims[1])

In [None]:
"""
Pick a random document from the train corpus and infer a vector from the model,
then print its 10 most similar document vectors

"""
# randrange excludes its upper bound, so doc_id is always a valid index;
# randint(0, len(train_corpus)) could also return len(train_corpus).
doc_id = random.randrange(len(train_corpus))

print('Train Document ({}, {}): [{}]\n'.format(doc_id, train_corpus[doc_id].tags[0], ' '.join(train_corpus[doc_id].words)))

inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=10)
for sim in sims:
    print(sim)

In [None]:
# Pick a random document from the train corpus and infer a vector from the model.
# randrange excludes its upper bound, so doc_id is always a valid index;
# randint(0, len(train_corpus)) could also return len(train_corpus).
doc_id = random.randrange(len(train_corpus))
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Train Document ({}, {}): [{}]\n'.format(doc_id, train_corpus[doc_id].tags[0], ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('{} {}'.format(label, sims[index]))

In [None]:
# Pick a random document from the train corpus and infer a vector from the model.

# randrange excludes its upper bound, so doc_id is always a valid index;
# randint(0, len(train_corpus)) could also return len(train_corpus).
doc_id = random.randrange(len(train_corpus))

inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Train Document ({}, {}): [{}]\n'.format(doc_id, train_corpus[doc_id].tags[0], ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print('{} {}'.format(label, sims[index]))


In [7]:
"""
TSNE of Doc2Vec

"""
time_start = time.time()
# The document vectors of the trained model are the t-SNE input.
X = model.docvecs
# NOTE(review): no random_state is set, so the 2-D layout presumably differs
# between runs - confirm whether a fixed seed is wanted.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

# Single-argument print call: same output on Python 2 and 3.
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

t-SNE done! Time elapsed: 2.22845196724 seconds


In [15]:
# Inspect the type of one document vector and of the container itself.
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(type(X[0]))
print(type(X))

<type 'numpy.ndarray'>
<class 'gensim.models.doc2vec.DocvecsArray'>


In [251]:
# Document tags in model index order, and the category looked up for each tag.
labels = [model.docvecs.index_to_doctag(doc_id) for doc_id in range(len(model.docvecs))]
categories = [ingredient_to_category(label, ingredients) for label in labels]

# Distinct categories present in the model (order is arbitrary).
categories_color = list(set(categories))
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(categories_color)

# Marker colour of every ingredient category.
category2color = {
    'plant' :  sns.xkcd_rgb["purple"],
    'flower' : sns.xkcd_rgb["forest green"],
    'meat' : sns.xkcd_rgb["light pink"],
    'nut/seed/pulse' : sns.xkcd_rgb["mustard yellow"],
    'herb' : sns.xkcd_rgb["orange"],
    'alcoholic beverage' : sns.xkcd_rgb["magenta"],
    # Was "purple", the same colour as 'plant', which made the two
    # categories indistinguishable in the plot.
    'plant derivative' : sns.xkcd_rgb["brown"],
    'fruit' : sns.xkcd_rgb["blue"],
    'dairy' : sns.xkcd_rgb["deep blue"],
    'cereal/crop' : sns.xkcd_rgb["sky blue"],
    'vegetable' : sns.xkcd_rgb["olive"],
    'animal product' : sns.xkcd_rgb["red"],
    'fish/seafood' : sns.xkcd_rgb["yellow"],
    'spice' : sns.xkcd_rgb["black"],
}

# Order in which the category traces are listed in the legend.
category_order = [
    'plant',
    'flower',
    'meat',
    'nut/seed/pulse',
    'herb',
    'alcoholic beverage',
    'plant derivative',
    'fruit',
    'dairy',
    'cereal/crop',
    'vegetable',
    'animal product',
    'fish/seafood',
    'spice',
]

['plant', 'flower', 'meat', 'herb', 'alcoholic beverage', 'plant derivative', 'fruit', 'dairy', 'fish/seafood', 'vegetable', 'spice', 'animal product', 'cereal/crop', 'nut/seed/pulse']


In [252]:
# Plot the t-SNE coordinates of the document vectors, one trace per category.
# With publish=False make_plot uses offline.plot and the filename
# 'ingredient2vec_2.html'.
make_plot(name='ingredient2vec_2',
          points=X_tsne, 
          labels=labels, 
          legend_labels=categories, 
          legend_order=category_order, 
          legend_label_to_color=category2color, 
          pretty_legend_label=pretty_category,
          publish=False)

In [None]:
"""
compound-level plotting

""" 
# Vectors looked up for every entry of the model vocabulary, reduced to 2-D.
X_comp = model[model.wv.vocab]
tsne_comp = TSNE(n_components=2)
X_tsne_comp = tsne_comp.fit_transform(X_comp)

# Vocabulary entries in iteration order
# (presumably the same order as the rows of X_comp - TODO confirm).
labels_comp = list(model.wv.vocab)

make_plot_simple(
    name='food2vec_food_embeddings_tsne_comp',
    points=X_tsne_comp,
    labels=labels_comp,
    publish=False,
)