In [1]:
import pandas as pd
from gensim.models import Word2Vec
import jieba

In [2]:
# 读取数据集
train = pd.read_csv('../../../Other/datasets/cnews/train.tsv', sep='\t', header=None, names=['label', 'content'])
val = pd.read_csv('../../../Other/datasets/cnews/dev.tsv', sep='\t', header=None, names=['label', 'content'])
test = pd.read_csv('../../../Other/datasets/cnews/test.tsv', sep='\t', header=None, names=['label', 'content'])

In [3]:
train.head()

Unnamed: 0,label,content
0,体育,马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有...
1,体育,商瑞华首战复仇心切 中国玫瑰要用美国方式攻克瑞典多曼来了，瑞典来了，商瑞华首战求3分的信心也...
2,体育,冠军球队迎新欢乐派对 黄旭获大奖张军赢下PK赛新浪体育讯12月27日晚，“冠军高尔夫球队迎新...
3,体育,辽足签约危机引注册难关 高层威逼利诱合同笑里藏刀新浪体育讯2月24日，辽足爆发了集体拒签风波...
4,体育,揭秘谢亚龙被带走：总局电话骗局 复制南杨轨迹体坛周报特约记者张锐北京报道 谢亚龙已经被公安...


In [4]:
def sentence_cut(x):
    x = jieba.lcut(x)
    x = " ".join(x)
    return x

In [5]:
train['content'] = train['content'].map(lambda x: sentence_cut(x))
val['content'] = val['content'].map(lambda x: sentence_cut(x))
test['content'] = test['content'].map(lambda x: sentence_cut(x))
train.head()

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\duanm\AppData\Local\Temp\jieba.cache
Loading model cost 1.006 seconds.
Prefix dict has been built successfully.


Unnamed: 0,label,content
0,体育,马晓旭 意外 受伤 让 国奥 警惕 无奈 大雨 格外 青睐 殷家 军 记者 傅亚雨 沈阳...
1,体育,商瑞华 首战 复仇 心切 中国 玫瑰 要 用 美国 方式 攻克 瑞典 多曼来 了 ， 瑞...
2,体育,冠军 球队 迎新 欢乐 派对 黄旭获 大奖 张军 赢 下 PK 赛 新浪 体育讯 12 ...
3,体育,辽足 签约 危机 引 注册 难关 高层 威逼利诱 合同 笑里藏刀 新浪 体育讯 2 月 ...
4,体育,揭秘 谢亚龙 被 带走 ： 总局 电话 骗局 复制 南杨 轨迹 体坛周报 特约记者 张锐...


In [6]:
sentences_train = [document.split(' ') for document in train['content'].values]

### 训练word2vec

In [7]:
model = Word2Vec(sentences=sentences_train,
                 # Dimensionality of the word vectors.
                 vector_size=200,  # 默认:100
                 # Maximum distance between the current and predicted word within a sentence.
                 window=5,  # 默认:5
                 # Ignores all words with total frequency lower than this.
                 min_count=2,  # 默认5
                 # Training algorithm: 1 for skip-gram; otherwise CBOW.
                 sg=0,  # 默认:0
                 # If 1, hierarchical softmax will be used for model training.
                 # If 0, and `negative` is non-zero, negative sampling will be used.
                 hs=0,  # 默认0
                 # If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
                 # should be drawn (usually between 5-20).
                 # If set to 0, no negative sampling is used.
                 negative=5,  # 默认:5
                 # The initial learning rate.
                 alpha=0.025,  # 默认:0.025
                 # Learning rate will linearly drop to `min_alpha` as training progresses.
                 min_alpha=0.0001,  # 默认:0.0001,
                 # The threshold for configuring which higher-frequency words are randomly downsampled,
                 # useful range is (0, 1e-5).
                 sample=0.001,  # 默认:1e-3
                 # Number of iterations (epochs) over the corpus.
                 epochs=5,  # 默认:5
                 # If True, computes and stores loss value which can be retrieved using :meth: `~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
                 compute_loss=True,  # 默认:False
                 workers=8,
                 seed=2018
                 )

In [8]:
# Save the model. This saved model can be loaded again using load(), which supports online training and getting vectors for vocabulary words.
model.save('cnew_200')  # full model

In [9]:
# Store the input-hidden weight matrix in the same format used by the original C word2vec-tool, for compatibility.
model.wv.save_word2vec_format("cnew_200.txt")  # KeyedVectors(不支持进一步训练)

In [10]:
# 词表
model.wv.key_to_index

{'，': 0,
 '的': 1,
 '。': 2,
 '': 3,
 '在': 4,
 '、': 5,
 '了': 6,
 '是': 7,
 '“': 8,
 '”': 9,
 '和': 10,
 '：': 11,
 '也': 12,
 '有': 13,
 '》': 14,
 '《': 15,
 ')': 16,
 '(': 17,
 '我': 18,
 '他': 19,
 '都': 20,
 '\xa0': 21,
 '中': 22,
 '将': 23,
 '月': 24,
 '就': 25,
 '我们': 26,
 '对': 27,
 '年': 28,
 '为': 29,
 '上': 30,
 '一个': 31,
 '与': 32,
 '不': 33,
 '-': 34,
 '到': 35,
 '这': 36,
 '而': 37,
 '中国': 38,
 '会': 39,
 '但': 40,
 '说': 41,
 '人': 42,
 '可以': 43,
 '基金': 44,
 '日': 45,
 '等': 46,
 '你': 47,
 '要': 48,
 '很': 49,
 '还': 50,
 '？': 51,
 '没有': 52,
 '被': 53,
 '让': 54,
 '从': 55,
 '自己': 56,
 '后': 57,
 '并': 58,
 '他们': 59,
 '市场': 60,
 '这个': 61,
 '更': 62,
 '；': 63,
 '已经': 64,
 '以': 65,
 '时': 66,
 '表示': 67,
 '多': 68,
 '3': 69,
 '个': 70,
 '公司': 71,
 '美国': 72,
 '2': 73,
 '能': 74,
 '—': 75,
 '1': 76,
 '记者': 77,
 '学生': 78,
 '时间': 79,
 '游戏': 80,
 '分': 81,
 '就是': 82,
 '北京': 83,
 '可能': 84,
 '进行': 85,
 '地': 86,
 '投资': 87,
 '最': 88,
 '她': 89,
 '好': 90,
 '来': 91,
 '大': 92,
 '！': 93,
 '目前': 94,
 '5': 95,
 '新': 96,
 '问题': 97,
 '如

In [11]:
# get numpy vector of a word
model.wv['方案']

array([ 5.9322506e-01, -1.4075698e+00,  8.6693859e-01, -9.6600515e-01,
       -4.7815621e-01,  3.1963637e-01,  9.2254274e-02,  7.2466379e-01,
       -3.8719168e-01, -1.0834543e+00,  2.9158115e-01, -2.3868439e-01,
        7.5923443e-01,  1.0866537e-03,  2.7507706e+00, -1.1225307e+00,
        8.3004880e-01,  6.8823677e-01,  1.6812716e-01, -1.2630341e+00,
       -3.2953227e-01, -1.8835351e+00, -8.0532562e-03, -2.8475627e-02,
       -2.4536330e-01,  6.8327111e-01,  9.6777427e-01, -4.4113743e-01,
        3.1353557e+00,  2.5795727e+00, -1.0177273e+00,  2.3666617e-01,
       -7.9416895e-01,  1.9296849e-01,  2.3040051e+00,  3.7113893e-01,
       -1.3672287e+00,  1.5595134e+00,  1.0433791e+00,  3.0832997e-01,
        2.0558093e+00, -2.9601581e+00,  7.8482151e-01, -5.5296445e-01,
        2.7337825e-01, -1.7974573e+00,  6.9916807e-02, -1.7238606e+00,
       -1.7326745e-01,  1.1214123e+00, -2.2292212e-01, -1.4287496e+00,
       -8.7346584e-01, -1.5866026e+00, -4.0783310e-01,  2.3133293e-01,
      

In [12]:
# Find the top-N most similar keys
sims = model.wv.most_similar('方案', topn=10)
sims

[('细则', 0.6278690099716187),
 ('计划', 0.6158345341682434),
 ('预案', 0.6070582270622253),
 ('解决方案', 0.6036876440048218),
 ('设计方案', 0.5521835088729858),
 ('规划', 0.5518771409988403),
 ('举措', 0.5470309257507324),
 ('时间表', 0.5459707379341125),
 ('新规', 0.5441619753837585),
 ('步骤', 0.5423238277435303)]

In [13]:
# Compute cosine distance between two keys
model.wv.distance("方案", "计划")

0.3841654062271118

### 增量学习

In [14]:
sentences_val = [document.split(' ') for document in val['content'].values]

# Load a previously saved Word2Vec model.
model_new = Word2Vec.load("cnew_200")

# Build vocabulary from a sequence of sentences (can be a once-only generator stream).
model_new.build_vocab(sentences_val,
                      #  If true, the new words in `sentences` will be added to model's vocab.
                      update=True)
# Update the model's neural weights from a sequence of sentences
trained_word_count, raw_word_count = model_new.train(corpus_iterable=sentences_val, total_examples=model.corpus_count, epochs=model.epochs)

In [15]:
# 增量学习成功
model_new.wv.most_similar('方案', topn=10)

[('细则', 0.6041954159736633),
 ('预案', 0.5758479237556458),
 ('计划', 0.5653831362724304),
 ('时间表', 0.5599968433380127),
 ('解决方案', 0.5414493680000305),
 ('实施方案', 0.533754825592041),
 ('政策措施', 0.5297344326972961),
 ('步骤', 0.5195085406303406),
 ('设计方案', 0.5184329748153687),
 ('新规', 0.5169488191604614)]