In [3]:
#  使用自定义的文档文本，通过fasttext训练word2vec训练词向量模型，并计算词汇间的相关度。
# （尝试tensorboard绘制词向量可视化图）

In [4]:
# 导包
import jieba
import fasttext
import os.path as os
import torch
from torch.utils.tensorboard import SummaryWriter

In [5]:
def get_comments_file(oldFile, newFile):
    # 文档分词器
    tokenizer = lambda x: jieba.lcut(x)

    # 加载原文档
    with open(oldFile, 'r', encoding='utf-8') as f:
        text = f.read()
    # 分词
    words = tokenizer(text)
    # 保存分词后的文档
    with open(newFile, 'w', encoding='utf-8') as f:
        for word in words:
            f.write(word + ' ')

In [6]:
# 1、分词
oldFile = "红楼梦.txt"
newFile = "hongloumeng.txt"
if not os.exists(newFile):
    get_comments_file(oldFile, newFile)

# 2、训练模型
model = fasttext.train_unsupervised(newFile, epoch=20, dim=300)

# 3、计算相似度
print(f"黛玉的近似词：{model.get_nearest_neighbors('黛玉')}")
print(f"宝玉的近似词：{model.get_nearest_neighbors('宝玉')}")
print(f"宝钗的近似词：{model.get_nearest_neighbors('宝钗')}")

黛玉的近似词：[(0.4779816269874573, '黛玉来'), (0.4568304121494293, '和黛玉'), (0.43497174978256226, '见黛玉'), (0.42635077238082886, '黛玉见'), (0.4196794629096985, '宝钗'), (0.4122893512248993, '黛玉忙'), (0.41128337383270264, '宝玉'), (0.4035329222679138, '黛玉因'), (0.40101882815361023, '宝钗来'), (0.40032148361206055, '向黛玉')]
宝玉的近似词：[(0.5280993580818176, '。'), (0.5150509476661682, '，'), (0.4845907986164093, '了'), (0.46670985221862793, '”'), (0.46418628096580505, '袭人'), (0.45743152499198914, '笑'), (0.4502829313278198, '你'), (0.43950289487838745, '：'), (0.43924883008003235, '说'), (0.4354904592037201, '“')]
宝钗的近似词：[(0.5118346810340881, '宝钗方'), (0.5078913569450378, '宝钗来'), (0.4707845151424408, '宝钗心'), (0.46038201451301575, '宝钗正'), (0.4363897442817688, '宝钗姊妹'), (0.42999184131622314, '宝钗见'), (0.42250800132751465, '宝钗忙'), (0.41997650265693665, '见宝钗'), (0.4196794033050537, '黛玉'), (0.4190636873245239, '宝钗进')]


In [7]:
# 词向量可视化
writer = SummaryWriter()
meta_data = model.words
embeddings = []
for word in meta_data:
    embeddings.append(model.get_word_vector(word))

writer.add_embedding(torch.tensor(embeddings), metadata=meta_data)

  writer.add_embedding(torch.tensor(embeddings), metadata=meta_data)
