In [47]:
# import libraries
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [4]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hawan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [48]:
# 标记我们的数据

data = ["The process of searching for a job can be very stressful, but it doesn’t have to be. Start with a\
        well-written resume that has appropriate keywords for your occupation. Next, conduct a targeted job search\
        for positions that meet your needs.",
        "Gardening in mixed beds is a great way to get the most productivity from a small space. Some investment\
        is required, to purchase materials for the beds themselves, as well as soil and compost. The\
        investment will likely pay-off in terms of increased productivity.",
        "Looking for a job can be very stressful, but it doesn’t have to be. Begin by writing a good resume with\
        appropriate keywords for your occupation. Second, target your job search for positions that match your\
        needs."]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]


In [49]:
print(tagged_data)

[TaggedDocument(words=['the', 'process', 'of', 'searching', 'for', 'a', 'job', 'can', 'be', 'very', 'stressful', ',', 'but', 'it', 'doesn', '’', 't', 'have', 'to', 'be', '.', 'start', 'with', 'a', 'well-written', 'resume', 'that', 'has', 'appropriate', 'keywords', 'for', 'your', 'occupation', '.', 'next', ',', 'conduct', 'a', 'targeted', 'job', 'search', 'for', 'positions', 'that', 'meet', 'your', 'needs', '.'], tags=['0']), TaggedDocument(words=['gardening', 'in', 'mixed', 'beds', 'is', 'a', 'great', 'way', 'to', 'get', 'the', 'most', 'productivity', 'from', 'a', 'small', 'space', '.', 'some', 'investment', 'is', 'required', ',', 'to', 'purchase', 'materials', 'for', 'the', 'beds', 'themselves', ',', 'as', 'well', 'as', 'soil', 'and', 'compost', '.', 'the', 'investment', 'will', 'likely', 'pay-off', 'in', 'terms', 'of', 'increased', 'productivity', '.'], tags=['1']), TaggedDocument(words=['looking', 'for', 'a', 'job', 'can', 'be', 'very', 'stressful', ',', 'but', 'it', 'doesn', '’', '

In [50]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=80)

In [51]:
model.build_vocab(tagged_data)

In [67]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [53]:
model.save("d2v.model")

In [54]:
# 从模型中加载
gensim.models.Doc2Vec.load("d2v.model")

<gensim.models.doc2vec.Doc2Vec at 0x236322b5c40>

In [57]:
# 计算相似度
similar_doc = model.docvecs.most_similar('2')

print(similar_doc[0])

('0', 0.9393066167831421)


  similar_doc = model.docvecs.most_similar('2')


In [59]:
model.docvecs.similarity('0', '1')

  model.docvecs.similarity('0', '1')


0.91996104

In [60]:
model.docvecs.similarity('0', '2')

  model.docvecs.similarity('0', '2')


0.9393066

In [61]:
model.docvecs.similarity('1', '2')

  model.docvecs.similarity('1', '2')


0.93899655

In [26]:
model.docvecs.similarity('1', '3')

  model.docvecs.similarity('1', '3')


KeyError: "Key '3' not present"

In [62]:
test_data = word_tokenize(
    "Gardening in mixed beds is a great way to get the most productivity from a small space. Some investment is required, to purchase materials for the beds themselves, as well as soil and compost. The investment will likely pay-off in terms of increased productivity.".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [-0.17298579  0.127903    0.14399172  0.0474085  -0.03293706 -0.01273019
 -0.10772055 -0.04456817 -0.54189944 -0.06661257  0.09414992 -0.03048338
 -0.12387884 -0.47155198 -0.02964431 -0.26211956  0.11188516 -0.0056281
 -0.21184742 -0.14939156  0.15217604  0.02204626 -0.1879678   0.12445168
  0.31978175  0.11734581  0.10158727 -0.00155555  0.05369437 -0.25901806]


In [65]:
# to find most similar doc to test data
similar_doc = model.docvecs.most_similar([v1])

  similar_doc = model.docvecs.most_similar(v1)


In [66]:
print(similar_doc)

[('0', 0.9774290323257446), ('2', 0.9693566560745239), ('1', 0.958945631980896)]
