In [1]:
# import modules & set up logging
import logging
import os

import numpy as np

import gensim
from gensim.models import word2vec

import jieba.analyse
import jieba

In [2]:
# Configure root logging so that INFO-level progress messages are shown,
# each prefixed with a timestamp and the severity level.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [3]:
# Every file this configuration cell names lives under one data directory.
DATA_DIR = './datas'

# Raw novel text (read by the tokenisation cell) and the space-separated,
# character-level text that the same cell writes out.
sentence_file_path = DATA_DIR + '/in_the_name_of_people.txt'
word_file_path = DATA_DIR + '/cut_chars_of_in_the_name_of_people.txt'

# Targets for the three persistence approaches demonstrated later on.
model_file_path1 = DATA_DIR + '/gensim_char2vec1.w2v'
model_file_path2 = DATA_DIR + '/gensim_char2vec2.bin'
# '{}' is a placeholder that callers fill in via str.format(<array name>).
model_file_path3 = DATA_DIR + '/gensim_char2vec3_{}.npy'

## 一、分词

In [4]:
# Tokenise the novel "In the Name of the People" at character level.

# jieba attaches its own DEBUG-level handler, so once root logging is
# configured every jieba message is printed twice. Raising jieba's logger to
# INFO removes that duplicated start-up noise.
jieba.setLogLevel(logging.INFO)

# Character names and domain terms that jieba should keep as single tokens.
# NOTE(review): the text below is split per character and jieba's tokenizer is
# never called in this cell, so these hints do not influence the written file;
# they only matter if a word-level segmentation is used instead.
CUSTOM_WORDS = (
    '沙瑞金', '田国富', '高育良', '侯亮平', '钟小艾', '陈岩石',
    '欧阳菁', '易学习', '王大路', '蔡成功', '孙连城', '季昌明',
    '丁义珍', '郑西坡', '赵东来', '高小琴', '赵瑞龙', '林华华',
    '陆亦可', '刘新建', '刘庆祝', '京州市', '副市长', '赵德汉',
)
for custom_word in CUSTOM_WORDS:
    # Second positional argument is `tune`: True adjusts the word frequency.
    jieba.suggest_freq(custom_word, True)

# Read the input BEFORE opening the output: opening with 'w' truncates the
# target immediately, so a missing/unreadable source file would otherwise
# leave an empty output file behind.
with open(sentence_file_path, 'r', encoding='utf-8') as reader:
    # Load the whole text into memory.
    content = reader.read()

# Tokenise: every single character becomes its own "word".
characters = list(content)

# Join the tokens with spaces (the whitespace-separated layout that
# word2vec.LineSentence reads).
result = ' '.join(characters)

# Write the tokenised text.
with open(word_file_path, 'w', encoding='utf-8') as writer:
    writer.write(result)
print("Done!!!")

Building prefix dict from the default dictionary ...
2023-09-15 22:25:57,557 : DEBUG : Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\HP\AppData\Local\Temp\jieba.cache
2023-09-15 22:25:57,569 : DEBUG : Loading model from cache C:\Users\HP\AppData\Local\Temp\jieba.cache
Loading model cost 2.334 seconds.
2023-09-15 22:25:59,902 : DEBUG : Loading model cost 2.334 seconds.
Prefix dict has been built successfully.
2023-09-15 22:25:59,905 : DEBUG : Prefix dict has been built successfully.


Done!!!


## 二、Gensim Word2Vec构建

#### 训练方式一

In [5]:
# Report which tokenised file is about to be read.
print(f"文件路径:{word_file_path}")

# LineSentence streams the file line by line; each line is one sentence whose
# tokens are separated by whitespace.
sentences = word2vec.LineSentence(word_file_path)

# Training approach 1: hand the corpus to the constructor, which builds the
# vocabulary and trains the model in a single call.
#
# Constructor reference -- gensim.models.word2vec.Word2Vec(
#     sentences=None, corpus_file=None,
#     vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
#     sample=0.001, seed=1, workers=3, min_alpha=0.0001,
#     sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
#     hashfxn=<built-in function hash>, epochs=5,
#     null_word=0, trim_rule=None, sorted_vocab=1,
#     batch_words=10000, compute_loss=False,
#     callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True)
#
#   sg:       1 = Skip-gram, 0 = CBOW
#   hs:       1 = hierarchical softmax, 0 = negative sampling
#   negative: number of negative samples drawn when hs is 0;
#             0 disables negative sampling
#
# NOTE(review): hs=1 is passed while `negative` keeps its default of 5; per the
# gensim documentation negative sampling is active whenever negative > 0, so
# both objectives appear to be trained together -- pass negative=0 if
# hierarchical softmax alone is intended.
model = word2vec.Word2Vec(
    sentences,
    vector_size=100,
    window=3,
    min_count=1,
    hs=1,
)

2023-09-15 22:26:00,033 : INFO : collecting all words and their counts
2023-09-15 22:26:00,037 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


文件路径:./datas/cut_chars_of_in_the_name_of_people.txt


2023-09-15 22:26:00,290 : INFO : collected 3269 word types from a corpus of 258973 raw words and 2311 sentences
2023-09-15 22:26:00,291 : INFO : Creating a fresh vocabulary
2023-09-15 22:26:00,330 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 3269 unique words (100.00% of original 3269, drops 0)', 'datetime': '2023-09-15T22:26:00.330931', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'prepare_vocab'}
2023-09-15 22:26:00,332 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 258973 word corpus (100.00% of original 258973, drops 0)', 'datetime': '2023-09-15T22:26:00.332765', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'prepare_vocab'}
2023-09-15 22:26:00,459 : INFO : deleting the raw counts dictionary of 3269 items
2023-09-15 22:26:00,462 :

#### 训练方式二

In [6]:
# Training approach 2: the same pipeline as approach 1, spelled out in
# explicit steps (create model -> build vocabulary -> train).

# Stream the tokenised corpus, one sentence per line.
sentences = word2vec.LineSentence(word_file_path)

# Create an untrained model (no corpus is passed to the constructor).
model = word2vec.Word2Vec(
    vector_size=100,
    window=9,
    min_count=1,
    hs=1,
)

# Scan the corpus once to build the vocabulary.
model.build_vocab(sentences)

# Train; total_examples is the sentence count recorded by build_vocab and
# model.epochs is the constructor default (5). Kept as the last expression so
# the notebook displays train()'s return value.
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-09-15 22:26:05,272 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-09-15T22:26:05.272851', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'created'}
2023-09-15 22:26:05,286 : INFO : collecting all words and their counts
2023-09-15 22:26:05,294 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-09-15 22:26:05,536 : INFO : collected 3269 word types from a corpus of 258973 raw words and 2311 sentences
2023-09-15 22:26:05,539 : INFO : Creating a fresh vocabulary
2023-09-15 22:26:05,588 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 3269 unique words (100.00% of original 3269, drops 0)', 'datetime': '2023-09-15T22:26:05.588623', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'eve

(1018845, 1294865)

## 三、Word2Vec应用

### 0. 获取Word2Vec模型相关属性

In [7]:
# Vocabulary size and embedding dimensionality of the trained model.
vocab_size = len(model.wv.key_to_index)
embedding_dim = model.wv.vector_size
print(f"【词汇数目】: {vocab_size}")
print(f"【转换的稠密的特征向量维度数目,每个单词转换的向量维度大小】: {embedding_dim}")
# Uncomment to dump the full token -> id mapping (long output):
# print("【单词到id的映射关系】: \n{}".format(model.wv.key_to_index))

【词汇数目】: 3269
【转换的稠密的特征向量维度数目,每个单词转换的向量维度大小】: 100


### 1. 获取相似度最高的K个字

In [8]:
# Cosine similarity: print the characters closest to '沙' with their scores.
# Request exactly as many neighbours as are printed, instead of fetching 100
# candidates and counting down to a `break`.
top_k = 5
for neighbour, score in model.wv.similar_by_word('沙', topn=top_k):
    print(neighbour, score)

龙 0.6806393265724182
音 0.5905642509460449
棺 0.5317568182945251
讯 0.5231453776359558
赞 0.4964260756969452


### 2. 获取单词之间的相似度

In [9]:
# Cosine similarity between the vectors of two individual characters.
similarity_score = model.wv.similarity('沙', '瑞')
print(similarity_score)

-0.1568283


### 3. 获取单词的词向量

In [10]:
# Look up the embedding of one character, then show its shape followed by
# its values (each on its own line).
v1 = model.wv.get_vector("提")
print(v1.shape, v1, sep="\n")

(100,)
[-0.1412568  -0.18312506  0.28488874 -0.19689868 -0.04151535 -0.12202216
 -0.39421114  0.57379025 -0.03881591 -0.47904474 -0.5288144  -0.45011202
  1.0888281   0.6845412   0.36703792 -0.26092514 -0.63417363 -0.87472093
 -0.2707515  -0.47596595 -0.22023484  0.22903512 -0.39029452  0.12058085
  0.21551585 -0.87997866  0.30429983 -0.51313174  0.1837985   0.49339062
 -0.36594316 -1.0341128   0.42795333  0.7066203  -0.44232622 -0.46763116
  0.18078515  0.72454214  0.4951955   0.54927534  2.058531    0.17165852
  0.07457943  0.7987704  -0.3157886  -0.41275406 -0.3614257   0.2369887
  0.15158649 -0.3007517   0.399728    1.3008378   0.7315833  -0.5965188
 -0.727052    0.44002172 -0.04997028  1.3276641   0.7014737  -0.2800626
  0.50441384  0.47367907  0.21706422  0.53943616 -0.60093004 -0.13698621
 -0.5334136   0.39392334  0.6514524  -0.9085511  -0.32391208 -0.5555952
 -0.11478419  0.8660509  -0.14987792 -0.3460935   0.92810845  0.3718172
 -0.10933036  0.1174156   0.5322972  -0.5126592  

In [11]:
# Index-style lookup of the same character's vector; as the cell's last
# expression, the result is displayed by the notebook.
model.wv['提']

array([-0.1412568 , -0.18312506,  0.28488874, -0.19689868, -0.04151535,
       -0.12202216, -0.39421114,  0.57379025, -0.03881591, -0.47904474,
       -0.5288144 , -0.45011202,  1.0888281 ,  0.6845412 ,  0.36703792,
       -0.26092514, -0.63417363, -0.87472093, -0.2707515 , -0.47596595,
       -0.22023484,  0.22903512, -0.39029452,  0.12058085,  0.21551585,
       -0.87997866,  0.30429983, -0.51313174,  0.1837985 ,  0.49339062,
       -0.36594316, -1.0341128 ,  0.42795333,  0.7066203 , -0.44232622,
       -0.46763116,  0.18078515,  0.72454214,  0.4951955 ,  0.54927534,
        2.058531  ,  0.17165852,  0.07457943,  0.7987704 , -0.3157886 ,
       -0.41275406, -0.3614257 ,  0.2369887 ,  0.15158649, -0.3007517 ,
        0.399728  ,  1.3008378 ,  0.7315833 , -0.5965188 , -0.727052  ,
        0.44002172, -0.04997028,  1.3276641 ,  0.7014737 , -0.2800626 ,
        0.50441384,  0.47367907,  0.21706422,  0.53943616, -0.60093004,
       -0.13698621, -0.5334136 ,  0.39392334,  0.6514524 , -0.90

In [12]:
# Raises an exception: the word "小明" does not exist in the vocabulary
# (left commented out so the notebook still runs top to bottom).
# model.wv.get_vector("小明")

In [13]:
# Check membership first: print the vector when the token is known,
# otherwise report that it is missing instead of raising.
word = "明"
# word = "康"
if word not in model.wv:
    print("【单词不存在】!!!")
else:
    print(f"【向量】:\n{model.wv[word]}")

【向量】:
[-1.6461818  -1.0586804  -0.05233782  1.0272888   0.7208985   0.6484954
  2.0570333   0.23740542  1.7734475  -0.4766108   2.0938504   3.5287266
 -0.6909583  -1.6923838  -2.118905   -1.47622     2.1244147   0.24693976
  0.3159762  -1.0810101   1.6064044  -1.7139945   0.623651   -0.60801345
  1.9721653   0.5068213   1.3088527  -1.939875   -3.0105808   0.6512234
 -0.0119812   2.3553255  -1.4937325  -1.4910252  -0.54609555  0.08085161
 -1.518993   -3.2818222  -2.1248455   2.0163708  -1.8022631   0.42989305
  2.5546057   1.7766382   4.0128226  -2.3182504  -0.7139472  -1.8510693
  0.43365258 -0.71715367  1.9800317  -0.42742136  0.18735449  0.4497837
 -1.4427707  -2.838277    1.9472058   0.33534107 -1.0588503  -1.1663908
 -0.16970505 -0.74356574 -0.05802837 -3.4141722  -2.341562    0.51540446
 -0.22367801  1.1214468   0.38174674 -1.8347814  -0.8771254   0.4472925
 -1.5586258  -0.7575157   0.42089006 -0.0642156  -1.079893   -1.1801511
 -2.0549808   0.04978392 -0.42684153  1.0748101   0.6

## 四、模型持久化&模型恢复加载

### 方式一：
直接使用save API进行模型持久化

#### 持久化

In [14]:
# Persist the whole Word2Vec object with gensim's own save API.
model.save(model_file_path1)

2023-09-15 22:26:11,682 : INFO : Word2Vec lifecycle event {'fname_or_handle': './datas/gensim_char2vec1.w2v', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-09-15T22:26:11.682231', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'saving'}
2023-09-15 22:26:11,684 : INFO : not storing attribute cum_table
2023-09-15 22:26:11,758 : INFO : saved ./datas/gensim_char2vec1.w2v


#### 加载

In [15]:
# Restore the full Word2Vec object straight from the saved path.
model2 = word2vec.Word2Vec.load(model_file_path1)
print(model2)

# Sanity check: look up one character in the restored model and show the
# vector's shape followed by its values.
v1 = model2.wv.get_vector("提")
print(v1.shape, v1, sep="\n")

2023-09-15 22:26:11,773 : INFO : loading Word2Vec object from ./datas/gensim_char2vec1.w2v
2023-09-15 22:26:11,805 : INFO : loading wv recursively from ./datas/gensim_char2vec1.w2v.wv.* with mmap=None
2023-09-15 22:26:11,808 : INFO : setting ignored attribute cum_table to None
2023-09-15 22:26:11,882 : INFO : Word2Vec lifecycle event {'fname': './datas/gensim_char2vec1.w2v', 'datetime': '2023-09-15T22:26:11.882003', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'loaded'}


Word2Vec<vocab=3269, vector_size=100, alpha=0.025>
(100,)
[-0.1412568  -0.18312506  0.28488874 -0.19689868 -0.04151535 -0.12202216
 -0.39421114  0.57379025 -0.03881591 -0.47904474 -0.5288144  -0.45011202
  1.0888281   0.6845412   0.36703792 -0.26092514 -0.63417363 -0.87472093
 -0.2707515  -0.47596595 -0.22023484  0.22903512 -0.39029452  0.12058085
  0.21551585 -0.87997866  0.30429983 -0.51313174  0.1837985   0.49339062
 -0.36594316 -1.0341128   0.42795333  0.7066203  -0.44232622 -0.46763116
  0.18078515  0.72454214  0.4951955   0.54927534  2.058531    0.17165852
  0.07457943  0.7987704  -0.3157886  -0.41275406 -0.3614257   0.2369887
  0.15158649 -0.3007517   0.399728    1.3008378   0.7315833  -0.5965188
 -0.727052    0.44002172 -0.04997028  1.3276641   0.7014737  -0.2800626
  0.50441384  0.47367907  0.21706422  0.53943616 -0.60093004 -0.13698621
 -0.5334136   0.39392334  0.6514524  -0.9085511  -0.32391208 -0.5555952
 -0.11478419  0.8660509  -0.14987792 -0.3460935   0.92810845  0.371817

### 方式二：
保存为二进制词向量

#### 持久化

In [16]:
# Export the word vectors (model.wv) in the binary word2vec format.
model.wv.save_word2vec_format(model_file_path2,binary=True)

2023-09-15 22:26:11,913 : INFO : storing 3269x100 projection weights into ./datas/gensim_char2vec2.bin


#### 加载

In [17]:
# Load the binary word2vec file; the result is a KeyedVectors object, so
# vectors are queried on it directly (no `.wv` attribute in between).
model2 = gensim.models.KeyedVectors.load_word2vec_format(
    model_file_path2,
    binary=True,
)
print(model2)

# Use the loaded vectors: shape first, then the values.
v1 = model2.get_vector("提")
print(v1.shape, v1, sep="\n")

2023-09-15 22:26:11,986 : INFO : loading projection weights from ./datas/gensim_char2vec2.bin
2023-09-15 22:26:12,102 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3269, 100) matrix of type float32 from ./datas/gensim_char2vec2.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-09-15T22:26:12.102288', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'load_word2vec_format'}


KeyedVectors<vector_size=100, 3269 keys>
(100,)
[-0.1412568  -0.18312506  0.28488874 -0.19689868 -0.04151535 -0.12202216
 -0.39421114  0.57379025 -0.03881591 -0.47904474 -0.5288144  -0.45011202
  1.0888281   0.6845412   0.36703792 -0.26092514 -0.63417363 -0.87472093
 -0.2707515  -0.47596595 -0.22023484  0.22903512 -0.39029452  0.12058085
  0.21551585 -0.87997866  0.30429983 -0.51313174  0.1837985   0.49339062
 -0.36594316 -1.0341128   0.42795333  0.7066203  -0.44232622 -0.46763116
  0.18078515  0.72454214  0.4951955   0.54927534  2.058531    0.17165852
  0.07457943  0.7987704  -0.3157886  -0.41275406 -0.3614257   0.2369887
  0.15158649 -0.3007517   0.399728    1.3008378   0.7315833  -0.5965188
 -0.727052    0.44002172 -0.04997028  1.3276641   0.7014737  -0.2800626
  0.50441384  0.47367907  0.21706422  0.53943616 -0.60093004 -0.13698621
 -0.5334136   0.39392334  0.6514524  -0.9085511  -0.32391208 -0.5555952
 -0.11478419  0.8660509  -0.14987792 -0.3460935   0.92810845  0.3718172
 -0.1093

In [18]:
# Load a second binary word2vec file.
# NOTE(review): no visible cell of this notebook writes this file --
# presumably vectors trained elsewhere; confirm where it comes from.
external_vectors_path = './datas/vectors.bin'
model2 = gensim.models.KeyedVectors.load_word2vec_format(
    external_vectors_path,
    binary=True,
)
print(model2)

# Use the loaded vectors: shape first, then the values.
v1 = model2.get_vector("酒")
print(v1.shape, v1, sep="\n")

2023-09-15 22:26:12,130 : INFO : loading projection weights from ./datas/vectors.bin
2023-09-15 22:26:12,391 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (7942, 128) matrix of type float32 from ./datas/vectors.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-09-15T22:26:12.391249', 'gensim': '4.2.0', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'load_word2vec_format'}


KeyedVectors<vector_size=128, 7942 keys>
(128,)
[ 3.50566626e-01 -1.04986653e-01 -7.67363831e-02  1.02968253e-01
  1.18330494e-01  5.92405088e-02  1.43829891e-02 -2.13755772e-01
  3.01811416e-02  6.38461784e-02  9.29202810e-02 -9.80767310e-02
  3.37989390e-01  1.62496209e-01 -1.00853711e-01  1.86467111e-01
  1.23709984e-01  4.02765274e-02  1.66892633e-01 -1.33713201e-01
  1.33725271e-01 -7.69479200e-02 -4.04792249e-01  1.34307100e-02
 -4.08627130e-02  1.60763144e-01 -2.02138210e-03 -2.05629498e-01
  1.40110895e-01  1.38469696e-01  5.83793372e-02 -6.10959306e-02
  2.09262501e-02  2.13176370e-01 -6.33254573e-02 -1.41293630e-01
  8.06461945e-02  8.95849839e-02  8.80930126e-02 -9.46233943e-02
  5.21726757e-02 -2.96247043e-02 -4.45053317e-02 -1.41549101e-02
 -1.64876487e-02  5.51286805e-03 -1.92858249e-01  1.42684672e-03
  6.35003224e-02 -1.57157220e-02 -1.80648953e-01  1.67108551e-01
 -9.37882364e-02 -4.28168513e-02  4.31317948e-02  2.22156458e-02
 -2.61552483e-01 -2.04422385e-01 -1.230919

### 方式三：
直接使用NumPy API保存词向量信息

#### 持久化

In [19]:
# Pull the embedding matrices out of the trained model: the normalised
# vectors returned by get_normed_vectors() and the raw vectors.
keyed_vectors = model.wv
norm_word_embeddings = keyed_vectors.get_normed_vectors()
word_embeddings = keyed_vectors.vectors

# Vocabulary as a list of (token, index) pairs.
vocab_2_index = list(keyed_vectors.key_to_index.items())
print(np.shape(norm_word_embeddings), np.shape(word_embeddings), np.shape(vocab_2_index))

# Save each array to its own .npy file; the name fills the '{}' placeholder
# of the path template.
arrays_to_save = (
    ("norm_embedding", norm_word_embeddings),
    ("embedding", word_embeddings),
    ("vocab_2_index", vocab_2_index),
)
for array_name, array_data in arrays_to_save:
    np.save(model_file_path3.format(array_name), array_data)

(3269, 100) (3269, 100) (3269, 2)


#### 加载

In [20]:
# Load the three arrays back from their .npy files.
norm_word_embeddings = np.load(model_file_path3.format("norm_embedding"))
word_embeddings = np.load(model_file_path3.format("embedding"))
vocab_2_index = np.load(model_file_path3.format("vocab_2_index"))

# Rebuild the token -> index dictionary from the (token, index) rows; the
# index is converted back to a Python int.
vocab_2_index = {row[0]: int(row[1]) for row in vocab_2_index}

# Look up one character: token -> row index -> embedding vector, then show
# the vector's shape followed by its values.
word = "提"
index = vocab_2_index[word]
v1 = word_embeddings[index]
print(v1.shape, v1, sep="\n")

(100,)
[-0.1412568  -0.18312506  0.28488874 -0.19689868 -0.04151535 -0.12202216
 -0.39421114  0.57379025 -0.03881591 -0.47904474 -0.5288144  -0.45011202
  1.0888281   0.6845412   0.36703792 -0.26092514 -0.63417363 -0.87472093
 -0.2707515  -0.47596595 -0.22023484  0.22903512 -0.39029452  0.12058085
  0.21551585 -0.87997866  0.30429983 -0.51313174  0.1837985   0.49339062
 -0.36594316 -1.0341128   0.42795333  0.7066203  -0.44232622 -0.46763116
  0.18078515  0.72454214  0.4951955   0.54927534  2.058531    0.17165852
  0.07457943  0.7987704  -0.3157886  -0.41275406 -0.3614257   0.2369887
  0.15158649 -0.3007517   0.399728    1.3008378   0.7315833  -0.5965188
 -0.727052    0.44002172 -0.04997028  1.3276641   0.7014737  -0.2800626
  0.50441384  0.47367907  0.21706422  0.53943616 -0.60093004 -0.13698621
 -0.5334136   0.39392334  0.6514524  -0.9085511  -0.32391208 -0.5555952
 -0.11478419  0.8660509  -0.14987792 -0.3460935   0.92810845  0.3718172
 -0.10933036  0.1174156   0.5322972  -0.5126592  