In [57]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk.tokenize import  word_tokenize
from nltk.corpus import stopwords

class LabeledLineSentence(object):
    """Corpus wrapper that pairs each text with a single-element tag list.

    Iterating an instance yields one ``TaggedDocument`` per (tag, text) pair.
    Every ``for`` loop over the instance starts a fresh pass over the stored
    sequences.
    """

    def __init__(self, texts, idxlist):
        """Store the texts and the parallel sequence of tags.

        Args:
            texts: Sequence of documents; each item is passed unchanged as the
                ``words`` of its ``TaggedDocument``.
            idxlist: Sequence of tags, paired positionally with ``texts``.
        """
        self.texts = texts
        self.doc_list = idxlist

    def __iter__(self):
        """Yield a ``TaggedDocument`` for every (tag, text) pair, in order."""
        # zip() stops at the shorter of the two sequences.
        for tag, tokens in zip(self.doc_list, self.texts):
            yield TaggedDocument(words=tokens, tags=[tag])
            
def createModel(tagged_documents_iterator, min_count=1, iter=200):
    """Build and train an 8-dimensional Doc2Vec model on a tagged corpus.

    Args:
        tagged_documents_iterator: Iterable of tagged documents. It is passed
            to both ``build_vocab`` and ``train``, so it must support being
            iterated more than once (a one-shot generator is not enough).
        min_count: Forwarded to ``Doc2Vec`` as ``min_count``.
        iter: Number of training epochs. The name shadows the ``iter``
            builtin inside this function; it is kept so that existing
            keyword callers keep working.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # `vector_size` / `epochs` are the current names of the deprecated
    # `size` / `iter` keywords, which newer gensim releases reject.
    model = Doc2Vec(vector_size=8, min_count=min_count, sample=0, epochs=iter, alpha=0.2)
    model.build_vocab(tagged_documents_iterator)
    # `model.epochs` replaces the deprecated `model.iter` attribute.
    model.train(tagged_documents_iterator, total_examples=model.corpus_count, epochs=model.epochs)
    return model

In [41]:
# Toy corpus: one short sentence per document. The trailing number is the
# document's position in the list.
documents = [
    'I eat an apple',                    # 0
    'I eat an orange',                   # 1
    'He eats rice',                      # 2
    'I drink juice',                     # 3
    'I drink orange juice',              # 4
    'I drink apple juice',               # 5
    'He drinks milk',                    # 6
    'She drinks water',                  # 7
    'I drink rice milk',                 # 8
    'Apple are fruits',                  # 9
    'Oranges are also fruits',           # 10
    'Rice is food',                      # 11
    'Fruits are food',                   # 12
    'A juice is a drink',                # 13
    'Rice milk is a drink',              # 14
    'Water is a drink',                  # 15
    'You can make juice out of fruits',  # 16
]

In [42]:
# Build the stop-word set once instead of calling stopwords.words('english')
# for every token; a set also gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))

# Lower-case each document, tokenise it and drop English stop words.
documents_tokenized = [
    [token for token in word_tokenize(document.lower()) if token not in english_stopwords]
    for document in documents
]

In [43]:
# Show the tokenised, stop-word-filtered documents for inspection.
documents_tokenized

[['eat', 'apple'],
 ['eat', 'orange'],
 ['eats', 'rice'],
 ['drink', 'juice'],
 ['drink', 'orange', 'juice'],
 ['drink', 'apple', 'juice'],
 ['drinks', 'milk'],
 ['drinks', 'water'],
 ['drink', 'rice', 'milk'],
 ['apple', 'fruits'],
 ['oranges', 'also', 'fruits'],
 ['rice', 'food'],
 ['fruits', 'food'],
 ['juice', 'drink'],
 ['rice', 'milk', 'drink'],
 ['water', 'drink'],
 ['make', 'juice', 'fruits']]

In [44]:
# Tag every tokenised document with its position in the list, then train a
# model on the tagged corpus.
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)
model = createModel(tagged_documents_iterator=tagged_documents_iterator)

# Keep a handle on the model's `wv` attribute for the similarity queries.
keyedVector = model.wv

In [45]:
# For every vocabulary word, print a header and then its nearest neighbours.
# NOTE(review): `.vocab` is the gensim 3.x vocabulary mapping; confirm the
# installed gensim version still provides it.
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    # print() only inserts `sep` between separate arguments, so the result
    # list is unpacked to put each (neighbour, similarity) pair on its own line.
    print(*keyedVector.most_similar(positive=[word]), sep='\n')

[('eats', 0.5637511014938354), ('food', 0.5146613717079163), ('oranges', 0.48135727643966675), ('drinks', 0.46551451086997986), ('juice', 0.4203437566757202), ('make', 0.4202868938446045), ('water', 0.2883213460445404), ('milk', 0.25521615147590637), ('fruits', 0.23042337596416473), ('drink', 0.19983536005020142)]
[('make', 0.7851319909095764), ('food', 0.654241681098938), ('oranges', 0.5355478525161743), ('orange', 0.5175880789756775), ('also', 0.4624544382095337), ('eats', 0.3209429383277893), ('milk', 0.305752158164978), ('water', 0.2883443832397461), ('drink', 0.04518195241689682), ('drinks', 0.03469877690076828)]
[('apple', 0.5175880193710327), ('drinks', 0.48081889748573303), ('rice', 0.47761738300323486), ('make', 0.4104869067668915), ('oranges', 0.3590584993362427), ('fruits', 0.33224278688430786), ('water', 0.14942654967308044), ('also', 0.10489951819181442), ('food', 0.08603176474571228), ('drink', 0.058224476873874664)]
[('food', 0.8568220734596252), ('make', 0.7178252935409

In [33]:
# Import only the name that is used; a wildcard import hides where names
# come from and can silently shadow existing ones.
from nltk.stem.porter import PorterStemmer

# Stemmer applied to every token in the stemmed variant of the corpus.
stemmer = PorterStemmer()

In [49]:
# Build the stop-word set once instead of calling stopwords.words('english')
# for every token; a set also gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))

# Tokenise each document, lower-case every token once, drop English stop
# words and stem whatever remains.
documents_tokenized = [
    [
        stemmer.stem(token)
        for token in map(str.lower, word_tokenize(document))
        if token not in english_stopwords
    ]
    for document in documents
]

In [50]:
# Show the stemmed, stop-word-filtered tokens for inspection.
documents_tokenized 

[['eat', 'appl'],
 ['eat', 'orang'],
 ['eat', 'rice'],
 ['drink', 'juic'],
 ['drink', 'orang', 'juic'],
 ['drink', 'appl', 'juic'],
 ['drink', 'milk'],
 ['drink', 'water'],
 ['drink', 'rice', 'milk'],
 ['appl', 'fruit'],
 ['orang', 'also', 'fruit'],
 ['rice', 'food'],
 ['fruit', 'food'],
 ['juic', 'drink'],
 ['rice', 'milk', 'drink'],
 ['water', 'drink'],
 ['make', 'juic', 'fruit']]

In [51]:
# Re-tag the current `documents_tokenized` by list position and train a new
# model on it.
tagged_documents_iterator = LabeledLineSentence(
    texts=documents_tokenized,
    idxlist=range(len(documents_tokenized)),
)
model = createModel(tagged_documents_iterator=tagged_documents_iterator)

# Keep a handle on the model's `wv` attribute for the similarity queries.
keyedVector = model.wv

In [52]:
# For every vocabulary word, print a header and then its nearest neighbours.
# NOTE(review): `.vocab` is the gensim 3.x vocabulary mapping; confirm the
# installed gensim version still provides it.
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    # print() only inserts `sep` between separate arguments, so the result
    # list is unpacked to put each (neighbour, similarity) pair on its own line.
    print(*keyedVector.most_similar(positive=[word]), sep='\n')

[('food', 0.7528905272483826), ('make', 0.5923361778259277), ('also', 0.5741664171218872), ('milk', 0.4150867462158203), ('drink', 0.3817022442817688), ('water', 0.25823140144348145), ('fruit', 0.2361714094877243), ('appl', 0.06418837606906891), ('juic', -0.010045230388641357), ('orang', -0.09701062738895416)]
[('make', 0.6203815937042236), ('orang', 0.5152387619018555), ('also', 0.4901844561100006), ('food', 0.36059388518333435), ('rice', 0.3116469383239746), ('water', 0.2118978202342987), ('milk', 0.10418567806482315), ('eat', 0.06418837606906891), ('juic', 0.04690834879875183), ('drink', -0.004105009138584137)]
[('rice', 0.7357353568077087), ('appl', 0.5152387619018555), ('make', 0.3480257987976074), ('water', 0.32528382539749146), ('fruit', 0.16470105946063995), ('drink', 0.011485062539577484), ('juic', -0.020891964435577393), ('food', -0.03689652681350708), ('also', -0.05759987235069275), ('milk', -0.07765514403581619)]
[('orang', 0.7357353568077087), ('water', 0.5544337034225464)

In [58]:

# Retrain on the same tagged corpus with min_count=2 and 1000 epochs.
model = createModel(tagged_documents_iterator, min_count=2, iter=1000)
keyedVector = model.wv

# For every vocabulary word, print a header and then its nearest neighbours.
# NOTE(review): `.vocab` is the gensim 3.x vocabulary mapping; confirm the
# installed gensim version still provides it.
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    # print() only inserts `sep` between separate arguments, so the result
    # list is unpacked to put each (neighbour, similarity) pair on its own line.
    print(*keyedVector.most_similar(positive=[word]), sep='\n')

[('fruit', 0.6078680157661438), ('food', 0.5505356788635254), ('milk', 0.2614019513130188), ('juic', 0.22861997783184052), ('drink', 0.1867920309305191), ('water', 0.17487840354442596), ('appl', 0.1271774172782898), ('rice', 0.08494648337364197), ('orang', -0.22146911919116974)]
[('food', 0.45917606353759766), ('milk', 0.2810928523540497), ('orang', 0.2672695815563202), ('water', 0.17115092277526855), ('eat', 0.1271774172782898), ('drink', 0.11658059060573578), ('fruit', 0.04279692843556404), ('rice', -0.011791020631790161), ('juic', -0.17154553532600403)]
[('food', 0.4880123436450958), ('appl', 0.2672696113586426), ('milk', 0.26385942101478577), ('rice', 0.12734302878379822), ('juic', 0.12708032131195068), ('water', 0.12559743225574493), ('drink', -0.1480497121810913), ('eat', -0.22146911919116974), ('fruit', -0.4769309163093567)]
[('fruit', 0.5309787392616272), ('juic', 0.49488186836242676), ('water', 0.46315085887908936), ('orang', 0.12734302878379822), ('milk', 0.08881418406963348)