In [2]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [14]:
# Toy corpus of two-word documents. The trailing "# N" on each line is the
# document's index; the notes that follow refer to documents by it (e.g. "#4, #5").
# NOTE(review): the recorded outputs further down list only documents 0-7, so
# 'rice milk' (#8) appears to have been added after the last run - re-run to confirm.
documents = [    
    'eat apple',     # 0
    'eat orange',    # 1
    'eat rice',      # 2
    'drink juice',   # 3
    'orange juice',  # 4
    'apple juice',   # 5
    'drink milk',    # 6
    'drink water',   # 7
    'rice milk'      # 8
]

Word2vec infers that words are related, similar, or interchangeable by looking at the words that appear near them.
Here we expect to find similarities between:

* **orange, apple** because you can make **juice** out of them ( #4, #5 )
* **apple, orange, rice** because you can **eat** them ( #0, #1, #2)
* **juice, milk, water** because you can **drink** them ( #3, #6, #7 )
 
Side effects (unintended similarities that the shared contexts may also produce):
* **orange, apple** may be similar to **drink** because of (#3 vs #4, #5 ) 
* **juice** may be similar to **eat** because of (#4, #5 vs #0, #1 ) 
* **milk** may be similar to **eat** because of (#2 vs #8 )
* **rice** may be similar to **drink** because of (#6 vs #8 )


In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Turn each raw document string into its list of word tokens.
documents_tokenized = list(map(word_tokenize, documents))


In [16]:
documents_tokenized

[['eat', 'apple'],
 ['eat', 'orange'],
 ['eat', 'rice'],
 ['drink', 'juice'],
 ['orange', 'juice'],
 ['apple', 'juice'],
 ['drink', 'milk'],
 ['drink', 'water']]

In [17]:
class LabeledLineSentence(object):
    """Re-iterable corpus that pairs each tokenized text with a single tag.

    Iterating yields one ``TaggedDocument`` per (tag, text) pair, in order.
    Because ``__iter__`` starts a fresh generator on every call, the same
    instance can be walked more than once.
    """

    def __init__(self, texts, idxlist):
        """Store the corpus.

        texts   -- iterable of already-tokenized documents (lists of words)
        idxlist -- iterable of tags, matched to ``texts`` by position
        """
        self.doc_list = idxlist
        self.texts = texts

    def __iter__(self):
        """Yield ``TaggedDocument(words=<text>, tags=[<tag>])`` for every pair."""
        # zip stops at the shorter of the two inputs.
        for tag, tokens in zip(self.doc_list, self.texts):
            yield TaggedDocument(words=tokens, tags=[tag])

In [18]:
# Tag every tokenized document with its position in the corpus (0, 1, 2, ...).
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)

In [19]:
list(tagged_documents_iterator)


[TaggedDocument(words=['eat', 'apple'], tags=[0]),
 TaggedDocument(words=['eat', 'orange'], tags=[1]),
 TaggedDocument(words=['eat', 'rice'], tags=[2]),
 TaggedDocument(words=['drink', 'juice'], tags=[3]),
 TaggedDocument(words=['orange', 'juice'], tags=[4]),
 TaggedDocument(words=['apple', 'juice'], tags=[5]),
 TaggedDocument(words=['drink', 'milk'], tags=[6]),
 TaggedDocument(words=['drink', 'water'], tags=[7])]

In [68]:
# Doc2Vec configured for the tiny toy corpus:
#   size=10      -> 10-dimensional vectors
#   min_count=1  -> keep every word, even those seen once
#   sample=0     -> no down-sampling of frequent words
#   iter=200     -> number of training epochs
#   alpha=0.2    -> initial learning rate
#   seed=1, workers=1 -> fixed RNG seed and a single worker thread so that
#       repeated runs give the same vectors (multi-threaded training is not
#       deterministic; on Python 3, PYTHONHASHSEED must also be fixed for
#       reproducibility across interpreter launches).
# NOTE: gensim >= 4.0 renames `size` -> `vector_size` and `iter` -> `epochs`;
# the keywords below match the gensim version this notebook was run with.
model = Doc2Vec(size=10, min_count=1, sample=0, iter=200, alpha=0.2,
                seed=1, workers=1)

In [69]:
# Build the model's vocabulary from the tagged corpus (done before the training call below).
model.build_vocab(tagged_documents_iterator)

In [70]:
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f6f908905c0>

In [71]:
# Train on the tagged corpus, using the document count and epoch count stored on the model.
model.train(tagged_documents_iterator, total_examples=model.corpus_count, epochs=model.iter)

4800

In [72]:
keyedVector = model.wv

In [73]:
keyedVector.most_similar(positive=['apple'])

[('orange', 0.8001600503921509),
 ('rice', 0.625898540019989),
 ('drink', 0.5313394069671631),
 ('milk', 0.31874215602874756),
 ('water', 0.13868723809719086),
 ('eat', -0.16934148967266083),
 ('juice', -0.3660101592540741)]

In [74]:
keyedVector.most_similar(positive=['juice'])

[('water', 0.751163125038147),
 ('milk', 0.5993781685829163),
 ('eat', 0.4183531701564789),
 ('rice', 0.358722448348999),
 ('orange', -0.13135085999965668),
 ('apple', -0.3660100996494293),
 ('drink', -0.4774039089679718)]

In [58]:
# NOTE(review): this cell and the two after it carry execution counts (58, 66, 67)
# lower than the training cell (71), so their recorded outputs come from an earlier
# model - e.g. the 'orange' cell reports apple at 0.649 while the 'apple' cell
# reports orange at 0.800. Restart the kernel and run all cells in order.
keyedVector.most_similar(positive=['drink'])

[('apple', 0.7836894989013672),
 ('orange', 0.7691080570220947),
 ('juice', 0.744430661201477),
 ('eat', 0.7242857217788696),
 ('milk', 0.5349156260490417),
 ('water', 0.5061863660812378),
 ('rice', 0.4724278450012207)]

In [66]:
keyedVector.most_similar(positive=['orange'])

[('rice', 0.874415397644043),
 ('apple', 0.6485577821731567),
 ('drink', 0.3787643313407898),
 ('milk', 0.35988014936447144),
 ('water', -0.017235398292541504),
 ('juice', -0.11514444649219513),
 ('eat', -0.3552086651325226)]

In [67]:
keyedVector.most_similar(positive=['eat'])

[('water', 0.6505352258682251),
 ('juice', 0.3340483605861664),
 ('milk', 0.1099378913640976),
 ('drink', 0.04004232585430145),
 ('apple', 0.0076531171798706055),
 ('rice', -0.03144672140479088),
 ('orange', -0.35520869493484497)]