In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# Sample text data: a toy corpus of five short English sentences.
# Each list element is one raw (untokenized) sentence string.
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The dog is loyal and friendly",
    "Foxes are wild animals",
    "Dogs are domestic animals",
    "The quick dog runs fast"
]

In [3]:
# Tokenize the text data: each sentence becomes a list of word tokens.
tokenized_corpus = [simple_preprocess(sentence) for sentence in corpus]

# Training parameters shared by both models, so that the training algorithm
# (the `sg` flag) is the only difference between them.
w2v_params = dict(
    vector_size=50,  # dimensionality of each word vector
    window=3,        # context window size
    min_count=1,     # keep every word; the corpus is tiny
    seed=1,          # pin the seed explicitly (1 is gensim's default value)
    workers=1,       # single worker thread: multi-threaded training is not
                     # guaranteed to be reproducible run-to-run
)
# NOTE(review): gensim's documentation additionally mentions setting
# PYTHONHASHSEED for fully deterministic runs -- confirm for the installed version.

# CBOW: sg=0 (the default); Skip-gram: sg=1
cbow_model = Word2Vec(sentences=tokenized_corpus, sg=0, **w2v_params)
skipgram_model = Word2Vec(sentences=tokenized_corpus, sg=1, **w2v_params)

In [4]:
# Inspect the trained result: print the embedding vector of one word under
# each model. The label and the lookup key are derived from the same variable
# so the printed heading always names the word that is actually shown
# (the headings previously said 'vector' while the code looked up 'dog').
target_word = 'dog'

print(f"CBOW - 单词'{target_word}'的嵌入向量：")
print(cbow_model.wv[target_word])

print(f"\nSkip-gram - 单词'{target_word}'的嵌入向量：")
print(skipgram_model.wv[target_word])

CBOW - 单词'vector'的嵌入向量：
[-0.01632377  0.00899723 -0.0082776   0.00164806  0.0169978  -0.00892552
  0.00903828 -0.01357125 -0.00709999  0.01879458 -0.00315838  0.00064812
 -0.0082836  -0.01537097 -0.00301849  0.00493323 -0.00177727  0.01106436
 -0.00548781  0.0045157   0.01091042  0.01668961 -0.00289765 -0.01840967
  0.00873266  0.00114314  0.01488784 -0.00162453 -0.00528064 -0.01750128
 -0.00170723  0.00564999  0.01079871  0.01409719 -0.01140061  0.00371827
  0.01217871 -0.00959487 -0.00621711  0.01360083  0.00326821  0.00037685
  0.00694213  0.00044185  0.01924861  0.01011406 -0.01783598 -0.01408106
  0.0018089   0.01278117]

Skip-gram - 单词'vector'的嵌入向量：
[-0.0163211   0.00899741 -0.00827723  0.00165442  0.01699344 -0.00893559
  0.00903771 -0.01357584 -0.00710076  0.01878385 -0.0031562   0.00064877
 -0.00827279 -0.0153748  -0.00301885  0.00493467 -0.00177791  0.01105483
 -0.00549633  0.00451217  0.01090997  0.01669871 -0.00289925 -0.01841382
  0.00873159  0.00113915  0.01489131 -0.0016

In [5]:
# Find the words most similar to "dog" under each model.
# Each entry pairs the heading to print with the model to query.
similarity_reports = (
    ("\nCBOW - 与 'dog' 最相似的单词：", cbow_model),
    ("\nSkip-gram - 与 'dog' 最相似的单词：", skipgram_model),
)

for heading, model in similarity_reports:
    print(heading)
    # most_similar returns (word, similarity score) pairs, highest score first.
    print(model.wv.most_similar('dog'))


CBOW - 与 'dog' 最相似的单词：
[('wild', 0.2297576367855072), ('loyal', 0.16089019179344177), ('domestic', 0.14870333671569824), ('are', 0.12487586587667465), ('runs', 0.08053992688655853), ('brown', 0.07405462116003036), ('the', 0.042374081909656525), ('is', 0.018290217965841293), ('friendly', 0.01149215828627348), ('quick', 0.011124671436846256)]

Skip-gram - 与 'dog' 最相似的单词：
[('wild', 0.2297777384519577), ('loyal', 0.16113805770874023), ('domestic', 0.14879682660102844), ('are', 0.12487754970788956), ('runs', 0.08048326522111893), ('brown', 0.07418747246265411), ('the', 0.042442936450242996), ('is', 0.018418842926621437), ('friendly', 0.01174618024379015), ('quick', 0.011074387468397617)]
