利用gensim训练word2vec

In [1]:
import jieba
from gensim import models

#### 载入文本数据

In [2]:
with open('./data/in_the_name_of_people.txt') as f:
    document = f.read()

#### 分词处理

In [3]:
# 添加强制分词
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('田国富', True)
jieba.suggest_freq('高育良', True)
jieba.suggest_freq('侯亮平', True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/rk/nnl9yhm55kb6325ckffkn6hw0000gn/T/jieba.cache
Loading model cost 0.666 seconds.
Prefix dict has been built succesfully.


1

In [5]:
# 对于word2vec需谨慎做stopwords，因为其训练是依赖先后词的，但是一些符号也需要考虑进行去处
# 这里为了方便，未做停用词处理
document_cut = jieba.lcut(document)
document_cut[:10]

['\ufeff', ' ', '\n', ' ', '\n', ' ', '人民', '的', '名义', ' ']

In [6]:
document_sent = ' '.join(document_cut)
document_sent[:100]

'\ufeff   \n   \n   人民 的 名义   \n   周梅森   \n   \n   \n   \u3000 © 中文 在线 数字 出版 集团股份 有限公司 ， 2016 - 2017   \n   \u3000 数字 版图 书 '

In [11]:
with open('./data/in_the_name_of_people_segment.txt', 'w') as f2:
        f2.write(document_sent)

#### 训练模型

在gensim中，word2vec 相关的API都在包gensim.models.word2vec中。和算法有关的参数都在类gensim.models.word2vec.Word2Vec中。

相关参数的说明：<br>
sentences: 语料，可以是一个列表，或者从文件中遍历读出。 <br>
size: 词向量的维度，默认100  <br>
window: 词向量上下文最大距离，默认值为5， 对于一般的语料这个值推荐在5~10之间。 <br>
sg: word2vec两个模型的选择。如果是0， 则是CBOW模型，是1则是Skip-Gram模型，默认是0即CBOW模型。 <br>
hs: word2vec两个解法的选择。如果是0， 则是Negative Sampling，是1的话并且负采样个数negative大于0， 则是Hierarchical Softmax。默认是0即Negative Sampling。<br>
negative: Negative Sampling时负采样的个数，默认是5。推荐在3~10。<br>
cbow_mean: 仅用于CBOW在做投影的时候，若为0用上下文的词向量之和，若为1为上下文的词向量的平均值。<br>
min_count: 需要计算词向量的最小词频。这个值可以去掉一些很生僻的低频词，默认是5。如果是小语料，可以调低这个值。<br>
iter: 随机梯度下降法中迭代的最大次数，默认是5。对于大语料，可以增大这个值。<br>
alpha: 在随机梯度下降法中迭代的初始步长。<br>
min_alpha: 由于算法支持在迭代的过程中逐渐减小步长，min_alpha给出了最小的迭代步长值。对于大语料，需要对alpha, min_alpha,iter一起调参，来选择合适的三个值。<br>
worker：线程数，默认1

In [12]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = models.word2vec.LineSentence('./data/in_the_name_of_people_segment.txt')   # 载入数据
model = models.word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3,size=100)

2019-07-23 18:03:36,964 : INFO : collecting all words and their counts
2019-07-23 18:03:36,966 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-07-23 18:03:37,026 : INFO : collected 17878 word types from a corpus of 161343 raw words and 2311 sentences
2019-07-23 18:03:37,027 : INFO : Loading a fresh vocabulary
2019-07-23 18:03:37,068 : INFO : effective_min_count=1 retains 17878 unique words (100% of original 17878, drops 0)
2019-07-23 18:03:37,069 : INFO : effective_min_count=1 leaves 161343 word corpus (100% of original 161343, drops 0)
2019-07-23 18:03:37,122 : INFO : deleting the raw counts dictionary of 17878 items
2019-07-23 18:03:37,125 : INFO : sample=0.001 downsamples 38 most-common words
2019-07-23 18:03:37,126 : INFO : downsampling leaves estimated 120578 word corpus (74.7% of prior 161343)
2019-07-23 18:03:37,137 : INFO : constructing a huffman tree from 17878 words
2019-07-23 18:03:37,533 : INFO : built huffman tree with maximum node depth 17


#### 模型的应用

##### 近似词

In [15]:
model.wv.similar_by_word('沙瑞金', topn =5)

2019-07-24 09:17:14,161 : INFO : precomputing L2-norms of word weight vectors


[('高育良', 0.9673596620559692),
 ('田国富', 0.9436954855918884),
 ('易学习', 0.9399048089981079),
 ('李达康', 0.9391453266143799),
 ('咱', 0.9371181726455688)]

In [16]:
req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn =100):
    if len(key[0])==3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break

高育良 0.9673596620559692
田国富 0.9436954855918884
易学习 0.9399048089981079
李达康 0.9391453266143799
陆亦可 0.9217689037322998


##### 两个词的相似度

In [17]:
print(model.wv.similarity('沙瑞金','高育良'))

0.9673596


##### 找不不同类的词

In [18]:
print(model.wv.doesnt_match(u"沙瑞金 高育良 李达康 刘庆祝".split()))

刘庆祝


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


##### 最相似的词

In [20]:
model.wv.most_similar('沙瑞金')[0]

('高育良', 0.9673596620559692)

##### 集合间的余弦相似度

In [35]:
# 注意：当出现某个词语不在这个训练集合中的时候，会报错！
list1 = ['沙瑞金','今天','政府'] 
list2 = ['高育良', '举报'] 
list_sim1 = model.wv.n_similarity(list1, list2)
print(list_sim1)

0.97881687


#### 模型保存和载入

In [22]:
# 方法一：直接save，保存的文件不能利用文本编辑器查看但是保存了训练的全部信息，可以在读取后追加训练
model.save('./word2vec_by_gensim')

2019-07-24 09:30:05,068 : INFO : saving Word2Vec object under ./word2vec_by_gensim, separately None
2019-07-24 09:30:05,069 : INFO : not storing attribute vectors_norm
2019-07-24 09:30:05,071 : INFO : not storing attribute cum_table
2019-07-24 09:30:05,484 : INFO : saved ./word2vec_by_gensim


In [23]:
# 重新载入模型
model2 = models.Word2Vec.load('./word2vec_by_gensim')
model2

2019-07-24 09:30:18,998 : INFO : loading Word2Vec object from ./word2vec_by_gensim
2019-07-24 09:30:19,273 : INFO : loading wv recursively from ./word2vec_by_gensim.wv.* with mmap=None
2019-07-24 09:30:19,274 : INFO : setting ignored attribute vectors_norm to None
2019-07-24 09:30:19,275 : INFO : loading vocabulary recursively from ./word2vec_by_gensim.vocabulary.* with mmap=None
2019-07-24 09:30:19,276 : INFO : loading trainables recursively from ./word2vec_by_gensim.trainables.* with mmap=None
2019-07-24 09:30:19,277 : INFO : setting ignored attribute cum_table to None
2019-07-24 09:30:19,278 : INFO : loaded ./word2vec_by_gensim


<gensim.models.word2vec.Word2Vec at 0x1a260d4f98>

In [None]:
# 追加训练
# model2.train(more_sentences)

In [None]:
# 方法二：使用save_word2vec_format，可以设定用binary=True/Fasle存为二进制或纯文本
# 方法保存为word2vec文本格式但是保存时丢失了词汇树等部分信息，不能追加训练
# model.save_word2vec_format('/tmp/mymodel.txt',binary = False)
# model.save_word2vec_format('/tmp/mymodel.bin.gz',binary = True)