In [1]:
import gensim
import numpy as np
import pandas as pd
import jieba
import os

# Folder that holds the barrage (danmu) text files loaded later on.
folder_path = 'danmu_text'
# Load the pre-trained word2vec model from disk.
W2V_model_path = './W2V_model/model_from_zh.model'
W2V_model = gensim.models.Word2Vec.load(W2V_model_path)
# Vocabulary of the model (list of words); used to check whether a word
# has a vector in the model.
W2V_dic = W2V_model.wv.index_to_key
# Dimensionality of the word vectors. Read it from the model instead of
# probing the vector of W2V_dic[1], which raises IndexError when the
# vocabulary holds fewer than two words.
word_vec_dim = W2V_model.wv.vector_size

print("dim =", word_vec_dim)

dim = 70


In [2]:
# Check whether a few sample words are present in the word2vec vocabulary.
for sample_word in ('人口', '傻瓜', '炸药', '西瓜刀'):
    print(sample_word in W2V_dic)
# The output below is the word vector that corresponds to '人口'.
print(W2V_model.wv['人口'])


True
True
True
True
[ 0.1383596  -0.17468986  0.26738378  0.27664915  0.7140228   0.30336457
 -0.32623005  0.14275849  0.20718439 -1.3750342  -0.15256792  0.5628893
 -0.12664007 -1.671796   -0.41980577  0.16698277  0.8566149   1.6146973
  1.32098     0.518146   -0.6968868   1.0875909   0.35259685 -1.1078038
 -0.06966351  0.65688205 -0.34248763  0.8551295  -1.3682519  -0.6258048
  0.03248164  0.35811904 -0.02586001  1.4380642  -0.97641194  0.8970954
 -0.7824455  -0.24037059 -0.11944317 -0.8639166   0.3388526  -0.25791183
 -1.2384986   0.15366787 -0.5133913  -0.571598    0.5292653  -0.41370398
  0.01003916  0.42355835 -0.41143772 -1.1394283   0.79049635 -0.3699302
  0.63536614 -0.12875251 -0.21281405  0.32406306 -0.33010438  0.4743404
 -2.3396661  -0.04518685 -0.9380626  -0.86265945  0.21989642 -0.7452054
  0.27972785 -0.09746973 -0.47074002 -0.53538364]


In [3]:
# Maximum number of words kept per barrage (danmu); longer ones are truncated.
max_len = 15

# Step 1: read every .csv file in folder_path and tokenise it line by line.
splited_barrages = []

# NOTE: os.listdir returns entries in arbitrary order, so the order of the
# barrages follows whatever order the file system reports.
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Read the whole file as UTF-8 text.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Each line is treated as one barrage; jieba.lcut splits it into a list
    # of word strings. Empty lines yield empty lists and are kept so that
    # the number and order of entries matches the lines of the files.
    for barr in content.split('\n'):
        splited_barrages.append(jieba.lcut(barr))

input_data_len = len(splited_barrages)
print("data length =", input_data_len)

# Step 2: replace every word by its word vector.
# key_to_index is a dict, so the membership test is O(1) instead of a linear
# scan over the vocabulary list for every single word.
known_words = W2V_model.wv.key_to_index
# Padding vectors use the same dtype as the model's vectors so that all
# entries of a barrage are homogeneous.
vector_dtype = W2V_model.wv.vectors.dtype
# Report progress roughly every 10%; at least 1 so that the modulo below
# never divides by zero when there are fewer than 10 barrages.
progress_step = max(1, input_data_len // 10)

for i in range(input_data_len):
    # Truncate to max_len words, then map each word to its vector;
    # words missing from the model become an all-zero vector.
    splited_barrages[i] = [
        W2V_model.wv[word] if word in known_words
        else np.zeros(word_vec_dim, dtype=vector_dtype)
        for word in splited_barrages[i][:max_len]
    ]

    # Progress output.
    if i % progress_step == 0 and i != 0:
        print("\r" + str(i) + " sentences done", end = " ")

# Sample output: every word of this barrage is now a word vector (array).
# Guarded so the cell does not fail on data sets with fewer than 4 barrages.
if input_data_len > 3:
    print(splited_barrages[3])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\W\AppData\Local\Temp\jieba.cache
Loading model cost 0.888 seconds.
Prefix dict has been built successfully.


data length = 18408
18400 sentences done [array([ 0.03197099,  0.02076509,  0.04331821,  0.2895819 ,  0.07996122,
       -0.26565647,  0.46942583,  0.37751997,  0.07064417,  0.042102  ,
       -0.68054587,  0.9575885 ,  0.03065967,  0.079841  ,  0.30089092,
        0.56399053, -0.25683773, -0.28134635, -0.21052301,  0.05743065,
        0.18602918, -0.03401988,  1.0644392 , -0.45597133,  0.33868688,
        0.450915  , -0.05492989, -0.14464886,  0.00943762, -0.6850067 ,
        0.5171871 , -0.61662304, -0.7443479 ,  0.3323419 , -0.36739692,
        0.19481586, -0.5392843 , -0.3662116 ,  1.1013052 , -0.31878966,
        0.65755635, -0.5920154 ,  0.3570136 ,  0.1280854 ,  0.39543876,
       -0.38758707, -0.11152277, -0.4025489 , -0.44007415, -0.19346775,
       -0.5798335 , -0.56083345,  0.68451935, -0.07009499,  0.37999117,
        1.408985  ,  0.5672634 , -0.9305955 ,  1.0411708 ,  0.02497896,
       -0.52714616, -0.0410047 , -0.17762129,  0.15767409, -0.62192285,
        0.04068066,  0