In [1]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk.tokenize import  word_tokenize

class LabeledLineSentence(object):
    def __init__(self, texts, idxlist):
        self.texts = texts
        self.doc_list = idxlist

    def __iter__(self):
        for idx, text in zip(self.doc_list, self.texts):
            wtok = text
            tags = [idx]

            yield TaggedDocument(words=wtok, tags=tags)
            
def createModel(tagged_documents_iterator):
    model = Doc2Vec(size=5,  min_count=1, sample = 0, iter=200, alpha=0.2)
    model.build_vocab(tagged_documents_iterator)
    model.train(tagged_documents_iterator, total_examples=model.corpus_count, epochs=model.iter)
    return model    

Using TensorFlow backend.


In [2]:
documents = [    
    'I eat an apple.',     # 0
    'I eat an orange.',    # 1
    'I eat rice.',      # 2
    'I drink juice.',   # 3
    'I drink orange juice.',  # 4
    'I drink apple juice.',   # 5
    'I drink milk.',    # 6
    'I drink water.',   # 7
    'I drink rice milk.'      # 8
]

In [3]:
documents_tokenized = [word_tokenize(document.lower()) for document in documents] 


In [4]:
documents_tokenized

[['i', 'eat', 'an', 'apple', '.'],
 ['i', 'eat', 'an', 'orange', '.'],
 ['i', 'eat', 'rice', '.'],
 ['i', 'drink', 'juice', '.'],
 ['i', 'drink', 'orange', 'juice', '.'],
 ['i', 'drink', 'apple', 'juice', '.'],
 ['i', 'drink', 'milk', '.'],
 ['i', 'drink', 'water', '.'],
 ['i', 'drink', 'rice', 'milk', '.']]

In [5]:
tagged_documents_iterator = LabeledLineSentence(documents_tokenized, range(len(documents_tokenized)) )

In [6]:
model = createModel(tagged_documents_iterator)

In [7]:
keyedVector = model.wv

In [8]:
keyedVector.vocab

{'.': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a3048>,
 'an': <gensim.models.keyedvectors.Vocab at 0x7f3c2d897ef0>,
 'apple': <gensim.models.keyedvectors.Vocab at 0x7f3c2d897f60>,
 'drink': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a30f0>,
 'eat': <gensim.models.keyedvectors.Vocab at 0x7f3c2d897dd8>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7f3c2d897e10>,
 'juice': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a3128>,
 'milk': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a3160>,
 'orange': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a3080>,
 'rice': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a30b8>,
 'water': <gensim.models.keyedvectors.Vocab at 0x7f3c2d8a3198>}

In [9]:
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    print(keyedVector.most_similar(positive=[word]), sep='\n')

[('juice', 0.3201598525047302), ('milk', 0.12499019503593445), ('an', 0.09457343071699142), ('orange', 0.04307600483298302), ('rice', -0.0021706614643335342), ('water', -0.18114060163497925), ('apple', -0.2120579034090042), ('.', -0.3113723397254944), ('eat', -0.5972154140472412), ('drink', -0.8786622285842896)]
[('apple', 0.4504159986972809), ('milk', 0.4376160800457001), ('drink', 0.3160449266433716), ('water', 0.23222467303276062), ('.', 0.0488227978348732), ('orange', 0.038920704275369644), ('juice', -0.2445802241563797), ('i', -0.5972153544425964), ('an', -0.6440686583518982), ('rice', -0.7150197625160217)]
[('water', 0.4968716502189636), ('rice', 0.41322988271713257), ('juice', 0.3466440439224243), ('drink', 0.1880747526884079), ('i', 0.09457340091466904), ('.', 0.0297878235578537), ('milk', -0.041588518768548965), ('orange', -0.5435489416122437), ('eat', -0.6440686583518982), ('apple', -0.7428863644599915)]
[('orange', 0.9041184186935425), ('eat', 0.4504159986972809), ('juice', 

In [13]:
from nltk.corpus import stopwords
import string

documents_tokenized = [[x for x in word_tokenize(document.lower()) if x not in stopwords.words('english') + list(string.punctuation)]
                       for document in documents  ] 


In [14]:
documents_tokenized

[['eat', 'apple'],
 ['eat', 'orange'],
 ['eat', 'rice'],
 ['drink', 'juice'],
 ['drink', 'orange', 'juice'],
 ['drink', 'apple', 'juice'],
 ['drink', 'milk'],
 ['drink', 'water'],
 ['drink', 'rice', 'milk']]

In [27]:
tagged_documents_iterator = LabeledLineSentence(documents_tokenized, range(len(documents_tokenized)) )

In [34]:
model = createModel(tagged_documents_iterator)
keyedVector = model.wv

In [35]:
for word in keyedVector.vocab:
    print('========== {} ========= '.format(word))
    print(keyedVector.most_similar(positive=[word]), sep='\n')

[('juice', 0.5720855593681335), ('milk', 0.2762449383735657), ('drink', -0.013058863580226898), ('water', -0.03299867361783981), ('rice', -0.204393208026886), ('orange', -0.41102057695388794), ('apple', -0.5341901779174805)]
[('rice', 0.7592123746871948), ('orange', 0.4552444815635681), ('water', 0.4116889536380768), ('drink', 0.11775501072406769), ('milk', 0.015080234035849571), ('juice', -0.06365883350372314), ('eat', -0.5341902375221252)]
[('water', 0.5209260582923889), ('apple', 0.4552444815635681), ('milk', 0.2392314225435257), ('rice', 0.21330499649047852), ('drink', -0.006312195211648941), ('juice', -0.2098483145236969), ('eat', -0.41102057695388794)]
[('apple', 0.7592123746871948), ('water', 0.44820624589920044), ('orange', 0.21330496668815613), ('juice', 0.0617765337228775), ('drink', 0.04753674566745758), ('milk', -0.1992708295583725), ('eat', -0.2043931782245636)]
[('apple', 0.11775501072406769), ('rice', 0.04753677546977997), ('orange', -0.006312195211648941), ('eat', -0.01