In [75]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [76]:
# Toy corpus of nine two-word phrases. The trailing comment on each entry is
# its index in the list, which the notes below refer to as #0 ... #8.
documents = [
    "eat apple",     # 0
    "eat orange",    # 1
    "eat rice",      # 2
    "drink juice",   # 3
    "orange juice",  # 4
    "apple juice",   # 5
    "drink milk",    # 6
    "drink water",   # 7
    "rice milk",     # 8
]

Word2vec detects that words are related, similar, or interchangeable by looking at the words that appear near them.
Here we expect to find similarities between:

* **orange, apple** because you can make **juice** out of them ( #4, #5 )
* **apple, orange, rice** because you can **eat** them ( #0, #1, #2)
* **juice, milk, water** because you can **drink** them ( #3, #6, #7 )
 
Side effects (words that share a neighbour without being interchangeable):
* **orange, apple** may be similar to **drink** because of ( #3 vs #4, #5 )
* **juice** may be similar to **eat** because of ( #4, #5 vs #0, #1 )
* **milk** may be similar to **eat** because of ( #2 vs #8 )
* **rice** may be similar to **drink** because of ( #6 vs #8 )


In [77]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Split every phrase into word tokens with NLTK's tokenizer, keeping corpus order.
documents_tokenized = list(map(word_tokenize, documents))


In [78]:
# Display the tokenized corpus: one list of word tokens per document.
documents_tokenized

[['eat', 'apple'],
 ['eat', 'orange'],
 ['eat', 'rice'],
 ['drink', 'juice'],
 ['orange', 'juice'],
 ['apple', 'juice'],
 ['drink', 'milk'],
 ['drink', 'water'],
 ['rice', 'milk']]

In [79]:
class LabeledLineSentence(object):
    """Iterable that pairs tokenized texts with document tags.

    Every pass over an instance yields one ``TaggedDocument`` per text. The
    instance can be iterated more than once as long as the inputs it wraps
    can be (lists, ``range``); one-shot generators would be exhausted after
    the first pass.

    Parameters
    ----------
    texts : iterable
        Tokenized documents; each item is passed unchanged as the ``words``
        of a ``TaggedDocument`` (in this notebook, a list of word strings).
    idxlist : iterable, optional
        One tag per document, in the same order as ``texts``. When omitted,
        each document is tagged with its position in ``texts`` (0, 1, 2, ...).

    Notes
    -----
    When ``idxlist`` is given, the two inputs are combined with ``zip``, so
    iteration stops at the end of the shorter one.
    """

    def __init__(self, texts, idxlist=None):
        self.texts = texts
        self.doc_list = idxlist

    def __iter__(self):
        """Yield ``TaggedDocument(words=text, tags=[tag])`` for each document."""
        if self.doc_list is None:
            # No explicit tags supplied: fall back to positional tags.
            tagged_texts = enumerate(self.texts)
        else:
            tagged_texts = zip(self.doc_list, self.texts)

        for tag, words in tagged_texts:
            # Each document carries exactly one tag, wrapped in a list.
            yield TaggedDocument(words=words, tags=[tag])

In [80]:
# Tag each tokenized document with its position in the corpus.
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)

In [81]:
# Materialize one pass over the iterable to check the (words, tags) pairs.
list(tagged_documents_iterator)


[TaggedDocument(words=['eat', 'apple'], tags=[0]),
 TaggedDocument(words=['eat', 'orange'], tags=[1]),
 TaggedDocument(words=['eat', 'rice'], tags=[2]),
 TaggedDocument(words=['drink', 'juice'], tags=[3]),
 TaggedDocument(words=['orange', 'juice'], tags=[4]),
 TaggedDocument(words=['apple', 'juice'], tags=[5]),
 TaggedDocument(words=['drink', 'milk'], tags=[6]),
 TaggedDocument(words=['drink', 'water'], tags=[7]),
 TaggedDocument(words=['rice', 'milk'], tags=[8])]

In [98]:
# Deliberately tiny model for the nine-phrase toy corpus.
model = Doc2Vec(
    size=5,       # dimensionality of the learned vectors
    min_count=1,  # keep every word, including those seen only once
    sample=0,     # no down-sampling of frequent words
    iter=200,     # number of passes; read back as model.iter when training
    alpha=0.2,    # initial learning rate
)

In [99]:
# First pass over the tagged documents: build the vocabulary before training.
model.build_vocab(tagged_documents_iterator)

In [100]:
# Inspect the model's word-vector store (displayed as a KeyedVectors object).
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f6fa93bea90>

In [101]:
# Train on the same iterable that built the vocabulary. The example count and
# number of passes are passed explicitly; model.iter is the value given when
# the model was constructed.
model.train(tagged_documents_iterator, total_examples=model.corpus_count, epochs=model.iter)

5400

In [102]:
# Short alias for the trained word vectors, used by the queries that follow.
keyedVector = model.wv

In [107]:
# Ask which word fits least with the others; apple and orange are expected to
# be similar to each other, leaving 'juice' as the odd one out.
keyedVector.doesnt_match(['apple', 'orange', 'juice'])

'juice'

In [110]:
# Show the raw vectors behind the comparison of apple, orange and juice.
tuple(keyedVector.word_vec(word) for word in ('apple', 'orange', 'juice'))

(array([-2.20925665, -0.54110104, -1.6180892 ,  3.34981203, -3.73651361], dtype=float32),
 array([ 1.91637743, -2.4059031 , -0.88463014,  3.49807501, -2.15746617], dtype=float32),
 array([ 1.58826435, -0.5545249 , -0.43745354,  1.90960145,  5.19667912], dtype=float32))

In [103]:
# Rank the other vocabulary words by similarity to 'apple'. The hypotheses
# above predict orange and rice, with drink as a possible side effect.
keyedVector.most_similar(positive=['apple'])

[('rice', 0.8115032911300659),
 ('drink', 0.6313644647598267),
 ('orange', 0.6110762357711792),
 ('water', 0.20079126954078674),
 ('eat', -0.07000157237052917),
 ('milk', -0.2079600691795349),
 ('juice', -0.465804785490036)]

In [104]:
# Rank the other vocabulary words by similarity to 'juice'. The hypotheses
# above predict milk and water, with eat as a possible side effect.
keyedVector.most_similar(positive=['juice'])

[('milk', 0.8516817688941956),
 ('water', 0.60219407081604),
 ('eat', 0.5291734337806702),
 ('orange', 0.007711499929428101),
 ('rice', -0.19368654489517212),
 ('drink', -0.30208927392959595),
 ('apple', -0.46580472588539124)]

In [105]:
# Rank the other vocabulary words by similarity to 'drink'. The side effects
# listed above predict orange, apple and rice near the top.
keyedVector.most_similar(positive=['drink'])

[('rice', 0.7943306565284729),
 ('orange', 0.7691922187805176),
 ('apple', 0.6313644647598267),
 ('water', 0.02526184916496277),
 ('eat', -0.16449116170406342),
 ('juice', -0.30208921432495117),
 ('milk', -0.3158668875694275)]

In [96]:
# Rank the other vocabulary words by similarity to 'orange'. The hypotheses
# above predict apple and rice, with drink as a possible side effect.
# NOTE(review): this cell's execution count (96) is lower than that of the cell
# constructing the model (98), and its apple/orange score differs from the one
# shown for 'apple' -- the displayed output appears to come from an earlier
# training run. Re-run after training to confirm.
keyedVector.most_similar(positive=['orange'])

[('apple', 0.9975391626358032),
 ('rice', 0.759171187877655),
 ('milk', 0.7426793575286865),
 ('juice', 0.6638700366020203),
 ('water', 0.5512731671333313),
 ('drink', 0.24622079730033875),
 ('eat', -0.6274670362472534)]

In [97]:
# Rank the other vocabulary words by similarity to 'eat'. The side effects
# listed above predict juice and milk near the top.
# NOTE(review): this cell's execution count (97) is lower than that of the cell
# constructing the model (98), so the displayed output appears to come from an
# earlier training run. Re-run after training to confirm.
keyedVector.most_similar(positive=['eat'])

[('water', 0.30373555421829224),
 ('juice', 0.16575008630752563),
 ('milk', 0.055409371852874756),
 ('rice', 0.030456997454166412),
 ('apple', -0.5713309645652771),
 ('orange', -0.6274670362472534),
 ('drink', -0.9091671109199524)]

In [106]:
# Rank the other vocabulary words by similarity to 'water'. The hypotheses
# above predict the other drinkable words, juice and milk.
keyedVector.most_similar(positive=['water'])

[('milk', 0.8553852438926697),
 ('eat', 0.8403401374816895),
 ('juice', 0.6021940112113953),
 ('orange', 0.2924272418022156),
 ('apple', 0.20079129934310913),
 ('rice', 0.11420619487762451),
 ('drink', 0.02526184916496277)]