In [2]:
#词向量
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

# Load the training set.
data = pd.read_csv('train.csv')
# Drop rows whose comment is missing before converting to str: with a plain
# .astype(str) a NaN cell becomes the literal text "nan", which would then be
# tokenized and trained on as if it were a real review.
comments = data['comment'].dropna().astype(str)
# Punctuation stripped from every comment before segmentation. A single
# translate() pass deletes exactly the characters that were previously removed
# one at a time with chained .replace(ch, "") calls.
punctuation_table = str.maketrans('', '', '，!！。~；？?【】#')
# Segment each cleaned comment with jieba; corpus is a list of token lists,
# which is what gets passed to Word2Vec as the training sentences.
corpus = [jieba.lcut(comment.translate(punctuation_table)) for comment in comments]
# Train the word-vector model (sg=0 selects CBOW in gensim; sg=1 is Skip-Gram).
# NOTE(review): with workers=4 the training is presumably not bit-for-bit
# reproducible between runs — confirm against the gensim docs if the exact
# similarity numbers matter.
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)
# Show a one-line summary of the trained model.
print('模型参数：',model,'\n')

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\ASUS\AppData\Local\Temp\jieba.cache
Loading model cost 0.616 seconds.
Prefix dict has been built successfully.


模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [3]:
# Analogy-style query on the CBOW model: rank words by similarity to the
# combination of the positive words minus the negative word, then print the
# returned (word, score) list.
analogy_matches = model.wv.most_similar(
    positive=['点赞', '不错'],
    negative=['难吃'],
)
print('最匹配的词是：', analogy_matches, '\n')

最匹配的词是： [('无可挑剔', 0.9501499533653259), ('小弟', 0.9250810742378235), ('好找', 0.9245438575744629), ('位置', 0.9227657914161682), ('高', 0.9213833808898926), ('值得', 0.9210136532783508), ('推荐', 0.9201816320419312), ('不太多', 0.920057475566864), ('丰富', 0.919941246509552), ('团购', 0.9184621572494507)] 



In [4]:
# Semantic similarity between two words according to the CBOW model.
pair_similarity = model.wv.similarity('推荐', '好吃')
print('相似度为=', pair_similarity, '\n')

相似度为= 0.79698277 



In [5]:
# Print the embedding vector the CBOW model learned for one word.
# Index the keyed vectors with [] instead of calling __getitem__ by name:
# it is the same lookup, written the way the later cells of this notebook do.
print(model.wv['地道'])

[ 0.01515754  0.12122083  0.02125829  0.06513201 -0.05409003 -0.07920684
  0.09245599  0.2838612   0.00596973 -0.04420139 -0.00893647 -0.11640155
 -0.04183127 -0.013441   -0.12929633 -0.04604026  0.11381388  0.01194128
  0.0580782  -0.06746592 -0.05784685 -0.01581733  0.00994955  0.03357759
  0.08334368 -0.03370211 -0.16457234  0.04987547 -0.03284347 -0.10280792
  0.09617968 -0.04506683  0.01032164 -0.00339418 -0.07639625  0.01526439
  0.08088066 -0.15389821  0.05236846  0.0371066  -0.06480203  0.0306374
  0.04744416 -0.1123687   0.07131327  0.10418451  0.04830883 -0.00558341
 -0.00157815  0.09564119  0.03341889 -0.02476439 -0.03741483  0.03946341
 -0.02811478  0.10681877  0.04779937  0.00955203  0.03504727  0.01948178
 -0.05023565 -0.03612199  0.00670813  0.06092405 -0.03826466  0.07093644
  0.00426834  0.06559232 -0.10468188 -0.05464513 -0.00101409  0.0404491
  0.11337322 -0.13016574  0.04898093  0.04541457 -0.08166546  0.01641415
 -0.0658983   0.05721446 -0.08911959 -0.11743167  0.0

In [6]:
# Train a second Word2Vec model on the same corpus, this time with Skip-Gram
# (sg=1). All other hyper-parameters match the model trained earlier.
sg_model = Word2Vec(
    corpus,
    sg=1,
    vector_size=300,
    window=5,
    min_count=3,
    workers=4,
)
print('Skip-Gram模型参数：', sg_model)

Skip-Gram模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025>


In [7]:
# Look up the Skip-Gram embedding for one word, then show the vector itself
# followed by its shape.
sg_vectors = sg_model.wv
env_vector = sg_vectors['环境']
print('"环境"的词向量：\n', env_vector)
print('词向量形状：', env_vector.shape)


"环境"的词向量：
 [ 0.13340367  0.16936424  0.07651863  0.10819194 -0.09856046 -0.05432662
 -0.07375523  0.4024046  -0.20093371 -0.13338636 -0.05668922 -0.2189701
  0.01582613  0.02897725 -0.16035953  0.05236843  0.2910225  -0.03649396
  0.14283232 -0.38659957 -0.13003832 -0.07331298 -0.14828515 -0.04436906
 -0.09088524  0.09658511 -0.01995522  0.13304913 -0.00464731 -0.01588379
  0.21853267 -0.01250989  0.0683793   0.12507118 -0.20488146  0.01966229
  0.01001965 -0.17003694  0.05728401 -0.06954454  0.0393966  -0.01141219
  0.30830133  0.06105363  0.04270295  0.23049471  0.11580871 -0.17194985
  0.05795365  0.10008541  0.09714536  0.00343734 -0.14413324  0.12554495
  0.13426574  0.08147489 -0.2539026  -0.1822817   0.04460553 -0.19042422
 -0.06653395  0.04307622 -0.03006247  0.01372835 -0.1617231   0.13635314
  0.11328761 -0.04806114 -0.26303807  0.04107632 -0.02724481 -0.1828341
 -0.00955008 -0.19835608  0.23683177  0.14862718 -0.05336776 -0.03125753
 -0.14533237  0.11015563 -0.13976692 -0.22

In [8]:
# Ask the Skip-Gram model for the three nearest neighbours of the query word
# and print each neighbour with its similarity rounded to four decimals.
similar_words = sg_model.wv.most_similar('好吃', topn=3)
print('与"好吃"最相似的3个词：')
for neighbour, score in similar_words:
    print(f'{neighbour}: {score:.4f}')

与"好吃"最相似的3个词：
入味: 0.8407
棒: 0.8405
好看: 0.8389


In [9]:
# Compare the same anchor word against two other words with the Skip-Gram
# model, printing one similarity per line.
for other_word in ('美味', '蟑螂'):
    print(f'"好吃"和"{other_word}"的相似度:', sg_model.wv.similarity('好吃', other_word))

"好吃"和"美味"的相似度: 0.8100944
"好吃"和"蟑螂"的相似度: 0.29636574


In [10]:
# Vector arithmetic on the Skip-Gram embeddings: add the two positive words,
# subtract the negative one, and keep only the single best match.
result = sg_model.wv.most_similar(
    positive=['餐厅', '聚会'],
    negative=['安静'],
    topn=1,
)
# most_similar returns (word, score) pairs; only the word is reported.
top_word, top_score = result[0]
print('\n向量运算"餐厅+聚会-安静="最相关结果:', top_word)


向量运算"餐厅+聚会-安静="最相关结果: 家庭聚会
