In [10]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk.tokenize import  word_tokenize

class LabeledLineSentence(object):
    """Iterable that turns parallel sequences of texts and ids into TaggedDocuments.

    Each call to ``__iter__`` starts a fresh pass over the stored sequences,
    so the object can be consumed more than once as long as ``texts`` and
    ``idxlist`` themselves can be iterated repeatedly.
    """

    def __init__(self, texts, idxlist):
        """Store the inputs as given.

        texts   -- iterable whose items are passed as ``words`` to TaggedDocument
        idxlist -- iterable of ids, paired position-by-position with ``texts``
        """
        self.doc_list = idxlist
        self.texts = texts

    def __iter__(self):
        """Yield one TaggedDocument per (id, text) pair, tagged with that single id."""
        # zip() stops at the shorter of the two sequences.
        pairs = zip(self.doc_list, self.texts)
        for doc_tag, tokens in pairs:
            yield TaggedDocument(words=tokens, tags=[doc_tag])
            
def createModel(tagged_documents_iterator, size=5, min_count=1, sample=0,
                epochs=200, alpha=0.2, **doc2vec_kwargs):
    """Build and train a Doc2Vec model on a stream of tagged documents.

    The defaults reproduce the previously hard-coded configuration, so
    ``createModel(docs)`` behaves as before.

    Parameters
    ----------
    tagged_documents_iterator : iterable of TaggedDocument
        Consumed twice (vocabulary scan, then training), so it must be
        re-iterable; a one-shot generator will not work.
    size, min_count, sample, alpha
        Forwarded unchanged to the ``Doc2Vec`` constructor under the same
        keyword names.
    epochs : int
        Number of training passes. Passed to the constructor as ``iter``
        (the keyword the original call used) and to ``train`` as ``epochs``.
    **doc2vec_kwargs
        Any further ``Doc2Vec`` constructor options, e.g. ``seed`` and
        ``workers`` when repeatable results are wanted.

    Returns
    -------
    The trained ``Doc2Vec`` model.
    """
    model = Doc2Vec(size=size, min_count=min_count, sample=sample,
                    iter=epochs, alpha=alpha, **doc2vec_kwargs)
    model.build_vocab(tagged_documents_iterator)
    # Use the local epoch count instead of reading it back through the
    # model's `iter` attribute; the value is the one given to the constructor.
    model.train(tagged_documents_iterator,
                total_examples=model.corpus_count,
                epochs=epochs)
    return model

In [2]:
# Toy corpus of nine short sentences; the trailing comment on each line
# is that sentence's index in the list.
documents = [
    'I eat an apple',        # 0
    'I eat an orange',       # 1
    'I eat rice',            # 2
    'I drink juice',         # 3
    'I drink orange juice',  # 4
    'I drink apple juice',   # 5
    'I drink milk',          # 6
    'I drink water',         # 7
    'I drink rice milk',     # 8
]

In [5]:
# Lower-case every sentence, then split each one into word tokens.
documents_tokenized = list(map(word_tokenize, map(str.lower, documents)))


In [6]:
documents_tokenized

[['i', 'eat', 'an', 'apple'],
 ['i', 'eat', 'an', 'orange'],
 ['i', 'eat', 'rice'],
 ['i', 'drink', 'juice'],
 ['i', 'drink', 'orange', 'juice'],
 ['i', 'drink', 'apple', 'juice'],
 ['i', 'drink', 'milk'],
 ['i', 'drink', 'water'],
 ['i', 'drink', 'rice', 'milk']]

In [9]:
# Tag each tokenized document with its position in the list.
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)

In [11]:
model = createModel(tagged_documents_iterator)

In [12]:
keyedVector = model.wv

In [13]:
keyedVector.vocab

{'an': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105278>,
 'apple': <gensim.models.keyedvectors.Vocab at 0x7ff8b11052e8>,
 'drink': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105320>,
 'eat': <gensim.models.keyedvectors.Vocab at 0x7ff8b11052b0>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105240>,
 'juice': <gensim.models.keyedvectors.Vocab at 0x7ff8b11053c8>,
 'milk': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105400>,
 'orange': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105358>,
 'rice': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105390>,
 'water': <gensim.models.keyedvectors.Vocab at 0x7ff8b1105438>}

In [16]:
# For every vocabulary word, print a header line followed by the result of
# most_similar() for that word.
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    # Unpack the result so each entry is its own print() argument; with a
    # single argument, sep='\n' has nothing to separate and is a no-op.
    print(*keyedVector.most_similar(positive=[word]), sep='\n')

[('juice', 0.22862210869789124), ('water', -0.019702313467860222), ('milk', -0.03684799373149872), ('eat', -0.35792040824890137), ('apple', -0.35831838846206665), ('rice', -0.4083351194858551), ('orange', -0.4374846816062927), ('an', -0.5614527463912964), ('drink', -0.9800489544868469)]
[('milk', 0.7289857864379883), ('juice', 0.43223679065704346), ('drink', 0.31230786442756653), ('an', 0.10171344876289368), ('orange', 0.055613890290260315), ('water', -0.05191237851977348), ('apple', -0.2622596025466919), ('i', -0.35792040824890137), ('rice', -0.6519403457641602)]
[('rice', 0.49069511890411377), ('drink', 0.42149654030799866), ('juice', 0.18334756791591644), ('eat', 0.10171344876289368), ('water', -0.08065951615571976), ('orange', -0.3188819885253906), ('apple', -0.4349587857723236), ('milk', -0.48578357696533203), ('i', -0.5614527463912964)]
[('orange', 0.7424255609512329), ('drink', 0.49762195348739624), ('rice', 0.3083212375640869), ('water', 0.16873648762702942), ('milk', 0.1501055

In [25]:
from nltk.corpus import stopwords


# Load the English stop-word list once and keep it as a set: the original
# expression re-evaluated stopwords.words('english') for every token and
# did a linear scan of the resulting list each time.
english_stopwords = set(stopwords.words('english'))

# Lower-case and tokenize each sentence, dropping stop words.
documents_tokenized = [
    [token for token in word_tokenize(document.lower())
     if token not in english_stopwords]
    for document in documents
]


In [26]:
documents_tokenized

[['eat', 'apple'],
 ['eat', 'orange'],
 ['eat', 'rice'],
 ['drink', 'juice'],
 ['drink', 'orange', 'juice'],
 ['drink', 'apple', 'juice'],
 ['drink', 'milk'],
 ['drink', 'water'],
 ['drink', 'rice', 'milk']]

In [27]:
# Rebuild the tagged-document stream from the current token lists, tagging
# each document with its position in the list.
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)

In [34]:
model = createModel(tagged_documents_iterator)
keyedVector = model.wv

In [35]:
# For every vocabulary word, print a header line followed by the result of
# most_similar() for that word.
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    # Unpack the result so each entry is its own print() argument; with a
    # single argument, sep='\n' has nothing to separate and is a no-op.
    print(*keyedVector.most_similar(positive=[word]), sep='\n')

[('juice', 0.5720855593681335), ('milk', 0.2762449383735657), ('drink', -0.013058863580226898), ('water', -0.03299867361783981), ('rice', -0.204393208026886), ('orange', -0.41102057695388794), ('apple', -0.5341901779174805)]
[('rice', 0.7592123746871948), ('orange', 0.4552444815635681), ('water', 0.4116889536380768), ('drink', 0.11775501072406769), ('milk', 0.015080234035849571), ('juice', -0.06365883350372314), ('eat', -0.5341902375221252)]
[('water', 0.5209260582923889), ('apple', 0.4552444815635681), ('milk', 0.2392314225435257), ('rice', 0.21330499649047852), ('drink', -0.006312195211648941), ('juice', -0.2098483145236969), ('eat', -0.41102057695388794)]
[('apple', 0.7592123746871948), ('water', 0.44820624589920044), ('orange', 0.21330496668815613), ('juice', 0.0617765337228775), ('drink', 0.04753674566745758), ('milk', -0.1992708295583725), ('eat', -0.2043931782245636)]
[('apple', 0.11775501072406769), ('rice', 0.04753677546977997), ('orange', -0.006312195211648941), ('eat', -0.01