In [1]:
#词向量
import pandas as pd
import jieba
from gensim.models.word2vec import Word2Vec

In [2]:
# Load the training set; the review text lives in the 'comment' column.
data = pd.read_csv('train.csv')
# Drop missing comments BEFORE casting to str: astype(str) would otherwise turn
# NaN into the literal text "nan", which would then be tokenised and trained on
# as if it were a real word.
comments = data['comment'].dropna().astype(str)
# Translation table that deletes the punctuation marks stripped before
# segmentation (full-width and ASCII variants, brackets and '#').
PUNCT_TABLE = str.maketrans('', '', "，!！。~；？?【】#")
# Segment every comment with jieba; the result is a list of token lists, which
# is the sentence format passed to Word2Vec below.
corpus = [jieba.lcut(text.translate(PUNCT_TABLE)) for text in comments]
# Train the word-vector model. sg=0 selects CBOW (sg=1 is used for the
# Skip-Gram model further down); words seen fewer than min_count=3 times are
# left out of the vocabulary.
# NOTE(review): with workers=4 training is multi-threaded, so similarity scores
# may differ slightly between runs -- confirm whether exact reproducibility
# matters here (gensim documents workers=1 as a requirement for that).
model = Word2Vec(corpus, sg=0, vector_size=300, window=5, min_count=3, workers=4)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DELL\AppData\Local\Temp\jieba.cache
Loading model cost 0.612 seconds.
Prefix dict has been built successfully.


In [3]:
# Show the trained model's one-line summary (vocab size, vector size, alpha).
# print() would call str() on the model anyway; doing it explicitly names the value.
model_summary = str(model)
print('模型参数：', model_summary, '\n')

模型参数： Word2Vec<vocab=4036, vector_size=300, alpha=0.025> 



In [4]:
# Best matches: words closest to "点赞" + "不错" with "难吃" subtracted.
analogy_hits = model.wv.most_similar(
    positive=['点赞', '不错'],
    negative=['难吃'],
)
print('最匹配的词是：', analogy_hits, '\n')

最匹配的词是： [('位置', 0.9433055520057678), ('推荐', 0.9426226019859314), ('高', 0.937842845916748), ('值得', 0.9336845278739929), ('挺', 0.933626651763916), ('好找', 0.9324604868888855), ('好量', 0.9265896081924438), ('饭店', 0.9264900088310242), ('太撑', 0.9251110553741455), ('足下', 0.9245920181274414)] 



In [6]:
# Odd one out: ask the model which of the candidate words fits the group least.
candidate_words = "点赞 好吃 支持 难吃".split()
odd_one_out = model.wv.doesnt_match(candidate_words)
print('最不匹配的词是：', odd_one_out, '\n')

最不匹配的词是： 难吃 



In [7]:
# Semantic similarity between the two words "推荐" and "好吃".
pair_similarity = model.wv.similarity('推荐', '好吃')
print('相似度为=', pair_similarity, '\n')

相似度为= 0.7473329 



In [8]:
# Print the embedding coordinates of the word "地道" from the first (sg=0) model.
# Subscript the keyed vectors instead of calling the dunder __getitem__ by hand;
# this is the same lookup and matches how the Skip-Gram vector is read later on.
print(model.wv['地道'])

[ 0.01639573  0.1423366   0.00484476  0.06311934 -0.05236612 -0.0744183
  0.0964752   0.2781693   0.01370335 -0.04073065 -0.0077712  -0.1268021
 -0.02371065 -0.00526464 -0.13130118 -0.05384277  0.10931876  0.00820958
  0.04908003 -0.05276758 -0.0570082  -0.024044    0.02530711  0.029896
  0.08163864 -0.0269992  -0.16081272  0.04238123 -0.05263458 -0.11677747
  0.09745053 -0.05579699  0.016636   -0.01702799 -0.07922065  0.02912837
  0.08799836 -0.14476739  0.03795531  0.03725912 -0.07166268  0.03401773
  0.03076508 -0.12101959  0.07795402  0.11617716  0.05323258  0.00661292
 -0.01237189  0.09156105  0.02553621 -0.01437179 -0.0458756   0.04771264
 -0.03732842  0.11147879  0.04967964  0.00485415  0.02720474  0.00760455
 -0.0615216  -0.04450239  0.00947841  0.06204944 -0.02145944  0.06796355
 -0.00853082  0.06972022 -0.0945304  -0.06714966  0.00634782  0.06057703
  0.1188779  -0.11897719  0.03672024  0.04527516 -0.08049383  0.00953019
 -0.05021407  0.06924316 -0.0989574  -0.12125184  0.032

In [10]:
# Retrain on the same tokenised corpus, this time as a Skip-Gram model (sg=1).
# All other hyperparameters are the same as for the first model.
skipgram_params = dict(sg=1, vector_size=300, window=5, min_count=3, workers=4)
model_sg = Word2Vec(corpus, **skipgram_params)
print("模型参数:", model_sg)

模型参数: Word2Vec<vocab=4036, vector_size=300, alpha=0.025>


In [18]:
# Fetch the Skip-Gram word vector for "环境".
sg_vectors = model_sg.wv
env_vector = sg_vectors["环境"]
print("词向量:", env_vector)

词向量: [ 1.36850864e-01  1.38914481e-01 -1.57420486e-02  1.85716853e-01
 -9.44452062e-02 -1.06157232e-02  9.77250934e-03  4.44461793e-01
 -2.61064798e-01 -1.25700101e-01  4.98755015e-02 -2.53804654e-01
 -2.98335459e-02  3.86836077e-03 -1.69614419e-01  1.00009277e-01
  4.44056302e-01  8.59805569e-02  2.34806672e-01 -3.38534921e-01
 -1.30110756e-01 -1.67100132e-01 -5.94661236e-02 -8.49702805e-02
  1.77422464e-02  9.24162716e-02 -1.89725515e-02  2.05799509e-02
  7.14596882e-02 -1.35169670e-01  2.16653049e-01  2.15221215e-02
  7.38002211e-02  2.02548027e-01 -2.24760607e-01  2.14141645e-02
  5.09568416e-02 -2.08418936e-01 -2.93234538e-04 -5.55077195e-02
  1.69621021e-01 -9.33158845e-02  3.12664837e-01  7.87834600e-02
  4.89683971e-02  2.37023145e-01 -1.12854755e-02 -2.33773887e-01
  3.67305391e-02  1.39474496e-01  4.95625027e-02  5.28468601e-02
 -1.76031753e-01  1.76909104e-01 -4.23732437e-02  1.28770038e-01
 -9.61274579e-02 -1.73050329e-01  5.99052273e-02 -1.68178022e-01
 -8.71756077e-02  5.

In [15]:
# Report the shape of the "环境" vector.
env_shape = env_vector.shape
print("形状:", env_shape)

形状: (300,)


In [12]:
# Find the 3 words nearest to "好吃" in the Skip-Gram embedding space.
similar_words = model_sg.wv.most_similar(
    positive="好吃",
    topn=3,
)
print("最接近的词:", similar_words)

最接近的词: [('入味', 0.8370994329452515), ('棒', 0.8257178664207458), ('正', 0.825619101524353)]


In [17]:
# Compute similarity of "好吃" against two other words: "美味" -> sim1 and
# "蟑螂" -> sim2. Only sim1 is printed here; sim2 is printed by a later cell.
sim1, sim2 = (
    model_sg.wv.similarity("好吃", other_word)
    for other_word in ("美味", "蟑螂")
)
print("好吃 vs 美味:", sim1)

好吃 vs 美味: 0.79342425


In [16]:
# Report the second similarity score (sim2, "好吃" vs "蟑螂").
sim2_label = "好吃 vs 蟑螂:"
print(sim2_label, sim2)

好吃 vs 蟑螂: 0.29344454


In [14]:
# Vector analogy: "餐厅" + "聚会" - "安静", keeping only the single best hit.
result = model_sg.wv.most_similar(
    positive=["餐厅", "聚会"],
    negative=["安静"],
    topn=1,
)
# Each hit is a (word, score) pair; only the word is shown.
top_word, _top_score = result[0]
print("餐厅 + 聚会 - 安静 =", top_word)

餐厅 + 聚会 - 安静 = 亲人
