## Doc2Vec
In this notebook we demonstrate how to train a doc2vec model on a custom corpus.

In [None]:
import warnings
warnings.filterwarnings('ignore')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
nltk.download('punkt')
import time

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Toy corpus: each string is one document.
data = [
    "dog bites man",
    "man bites dog",
    "dog eats meat",
    "man eats food",
]

# Wrap every document as a TaggedDocument: lower-cased word tokens plus a
# single tag holding the document's position in `data` as a string.
tagged_data = []
for doc_id, sentence in enumerate(data):
    tokens = word_tokenize(sentence.lower())
    tagged_data.append(TaggedDocument(words=tokens, tags=[str(doc_id)]))


In [None]:
# Display the tagged corpus to check each document's tokens and tag.
tagged_data

[TaggedDocument(words=['dog', 'bites', 'man'], tags=['0']),
 TaggedDocument(words=['man', 'bites', 'dog'], tags=['1']),
 TaggedDocument(words=['dog', 'eats', 'meat'], tags=['2']),
 TaggedDocument(words=['man', 'eats', 'food'], tags=['3'])]

In [None]:
# Distributed Bag of Words (DBOW) variant of doc2vec, selected with dm=0.
# Handing the corpus to the constructor trains the model in the same call.
dbow_params = dict(vector_size=20, min_count=1, epochs=2, dm=0)

start = time.time()
model_dbow = Doc2Vec(tagged_data, **dbow_params)
end = time.time()

# Wall-clock training time, converted from seconds to hours for the report.
elapsed_hours = (end - start) / 3600.0
print("DBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format(elapsed_hours))

DBOW Model Training Complete.
Time taken for training is:0.00 hrs 


In [None]:
# Feature vector the DBOW model infers for the tokenized document "man eats food".
query_tokens = ['man', 'eats', 'food']
print(model_dbow.infer_vector(query_tokens))

[ 1.8149382e-02  7.0132050e-03  5.7288590e-03 -1.2641006e-02
  5.9683891e-03 -2.1047054e-02  2.9954344e-05  2.1688925e-02
 -1.2210529e-02  2.2105617e-02  7.8588845e-03  5.8149495e-03
 -1.4130631e-02  1.8869119e-02 -1.8085594e-03 -1.9677080e-02
 -1.4478344e-02  8.0275619e-03  1.9846799e-02 -1.5995914e-02]


In [None]:
# Top 5 words most similar to "man" according to the DBOW model's word vectors.
# NOTE(review): model_dbow was built with dm=0 and without dbow_words=1; per the
# gensim docs that configuration trains document vectors only, so these scores
# may come from untrained word vectors -- confirm before relying on them.
model_dbow.wv.most_similar(positive=["man"], topn=5)

[('meat', 0.31173616647720337),
 ('eats', 0.29588598012924194),
 ('food', 0.11899435520172119),
 ('dog', 0.06447122991085052),
 ('bites', -0.07118572294712067)]

In [None]:
 model_dbow.wv.n_similarity(["dog"],["man"])

0.06447123

In [None]:
# Distributed Memory (DM) variant of doc2vec, selected with dm=1.
start = time.time()
model_dm = Doc2Vec(tagged_data, min_count=1, vector_size=20, epochs=2,dm=1)
end = time.time()
# The label names the DM model: this cell trains model_dm, not the DBOW model.
print("DM Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

# Vector inferred for the tokenized document "man eats food".
print("Inference Vector of man eats food\n ",model_dm.infer_vector(['man','eats','food']))

# Word-level queries against the DM model's word vectors.
print("Most similar words to man in our corpus\n",model_dm.wv.most_similar("man",topn=5))
print("Similarity between man and dog: ",model_dm.wv.n_similarity(["dog"],["man"]))

DBOW Model Training Complete.
Time taken for training is:0.00 hrs 
Inference Vector of man eats food
  [ 1.8149441e-02  7.0133386e-03  5.7287663e-03 -1.2640938e-02
  5.9683323e-03 -2.1047115e-02  3.0079760e-05  2.1689001e-02
 -1.2210527e-02  2.2105699e-02  7.8588324e-03  5.8150296e-03
 -1.4130564e-02  1.8869029e-02 -1.8085958e-03 -1.9677067e-02
 -1.4478297e-02  8.0276234e-03  1.9846911e-02 -1.5995845e-02]
Most similar words to man in our corpus
 [('meat', 0.31173616647720337), ('eats', 0.29588598012924194), ('food', 0.11899435520172119), ('dog', 0.06447122991085052), ('bites', -0.07118572294712067)]
Similarity between man and dog:  0.06447123


What happens when we compare words that are not in the vocabulary?

In [None]:
# 'covid' never appears in the training corpus, so the model has no vector for it.
# Run the lookup inside try/except so the notebook shows the outcome instead of
# aborting: gensim releases that reject unknown keys raise KeyError here, while
# releases that skip unknown keys simply return a similarity value.
try:
    print(model_dm.wv.n_similarity(['covid'],['man']))
except KeyError as err:
    print("Out-of-vocabulary word:", err)