# 基于tensorflow实现word2vec

In [1]:
import collections
import os
import random
import urllib
import zipfile
import numpy as np
import tensorflow as tf

In [2]:
# 微调参数
learning_rate = 0.1
batch_size = 128
num_steps = 1000000
display_step = 20000
eval_step = 100000

# 测试样例
eval_words = ['nine', 'of', 'going', 'girl', 'love']

# Word2Vec参数
embedding_size = 200 # 词向量维度
max_vocabulary_size = 50000  # 语料库词语数
min_occurrence = 10 # 最小词频，少于这个数的不要
skip_window = 3     # 左右窗口大小，两边是3*2，加一个训练词，窗口是7
num_skips = 2     # 一次制作多少个输入输出对
num_sampled = 64     # 负采样


In [3]:
# 加载训练数据，什么数据都可以
data_path = "./data/test.txt"
with open(data_path, 'rb') as f:
    text_words = f.read().lower().split()

In [4]:
len(text_words)

1367297

In [5]:
# 创建一个计数器，计算每个词出现了多少次
count = [('UNK', -1)]
# 基于词频返回max_vocabulary_size个常用词
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size-1))

In [6]:
count[:10]

[('UNK', -1),
 (b'the', 57024),
 (b'to', 41289),
 (b'and', 39944),
 (b'of', 35950),
 (b'a', 28947),
 (b'i', 28097),
 (b'in', 22601),
 (b'that', 17328),
 (b'he', 17089)]

In [7]:
# 剔除掉出现次数少于min_occurrence
for i in range(len(count)-1, -1, -1):    # 从start到end每次step多少
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # 判断是，从小到大排序得，所以跳出得时候剩下得都是满足条件的
        break

In [8]:
len(count)

9178

# 词-ID映射

In [9]:
# 计算语料库大小
vocabulary_size = len(count)
# 每个词都分配一个id
word2id = dict()
for i, (word, _) in enumerate(count):
    word2id[word] = i

In [10]:
word2id

{'UNK': 0,
 b'the': 1,
 b'to': 2,
 b'and': 3,
 b'of': 4,
 b'a': 5,
 b'i': 6,
 b'in': 7,
 b'that': 8,
 b'he': 9,
 b'was': 10,
 b'his': 11,
 b'with': 12,
 b'it': 13,
 b'had': 14,
 b'as': 15,
 b'her': 16,
 b'my': 17,
 b'you': 18,
 b'for': 19,
 b'have': 20,
 b'at': 21,
 b'she': 22,
 b'be': 23,
 b'not': 24,
 b'is': 25,
 b'but': 26,
 b'on': 27,
 b'mr.': 28,
 b'which': 29,
 b'said': 30,
 b'this': 31,
 b'by': 32,
 b'so': 33,
 b'all': 34,
 b'me': 35,
 b'from': 36,
 b'if': 37,
 b'him': 38,
 b'would': 39,
 b'when': 40,
 b'been': 41,
 b'what': 42,
 b'we': 43,
 b'an': 44,
 b'no': 45,
 b'very': 46,
 b'were': 47,
 b'or': 48,
 b'who': 49,
 b'will': 50,
 b'are': 51,
 b'little': 52,
 b'any': 53,
 b'your': 54,
 b'they': 55,
 b'than': 56,
 b'there': 57,
 b'one': 58,
 b'more': 59,
 b'could': 60,
 b'some': 61,
 b'into': 62,
 b'am': 63,
 b'out': 64,
 b'should': 65,
 b'never': 66,
 b'do': 67,
 b'upon': 68,
 b'much': 69,
 b'such': 70,
 b'up': 71,
 b'about': 72,
 b'their': 73,
 b'good': 74,
 b'know': 75,
 b'has

# 所有词转换成ID

In [11]:
word2id.get(b'easily', 0)

591

In [12]:
data = list()
unk_count = 0
for word in text_words:
    # 全部转换成id
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])

Words count: 1367297
Unique words: 76390
Vocabulary size: 9178
Most common words: [('UNK', 135224), (b'the', 57024), (b'to', 41289), (b'and', 39944), (b'of', 35950), (b'a', 28947), (b'i', 28097), (b'in', 22601), (b'that', 17328), (b'he', 17089)]


# 构建所需训练数据

In [13]:
data_index = 0

def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # get window size (words left and right + current one)
    span = 2 * skip_window + 1  # 7为窗口，左3右3中间1
    buffer = collections.deque(maxlen=span)   # 创建一个长度为7的队列
    if data_index + span > len(data):       # 如果数据被滑完一遍了，就置为0，重新开始
        data_index = 0
    buffer.extend(data[data_index:data_index+span])      # 队列存的是当前窗口，例如deque（[28, 75, 37, 6, 195, 2, 343], maxlen=7）,deque存的是id
    data_index += span
    for i in range(batch_size // num_skips):       # num_skip表示取多少组不同的词作为输出，此例为2
        context_words = [w for w in range(span) if w != skip_window]  # 上下文就是[0,1,2,4,5,6],每次循环，只是里面的数变了
        words_to_use = random.sample(context_words, num_skips)   # 在上下文里随机选2个候选词
        for j, context_words in enumerate(words_to_use): # 遍历每一个候选词，用其当作输出也就是标签
            batch[i * num_skips + j] = buffer[skip_window]   # 输入都为当前窗口的中间此，即3
            labels[i * num_skips + j, 0] = buffer[context_words]   # 用当前候选词当作标签
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])  # 之前已经传入7歌词了，窗口要右移了，例如[28, 75, 37, 6, 195, 2, 343]-》[75, 37, 6, 195, 2, 343，124]
    
   # (7+10000-2)%10000，净搞这种原本很简单，加上之后让人看不懂的写法
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [14]:
random.sample([0,1,2,4,5,6], 2) 

[0, 6]

In [15]:
vocabulary_size, embedding_size

(9178, 200)

In [16]:
with tf.device('/cpu:0'):
    embedding = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))   # 维度(9178, 200)
    nce_weights = tf.Variable(tf.random.normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# 通过tf.nn.embedding_lookup函数将索引转换成词向量

In [17]:
def get_embedding(x):
    with tf.device('/cpu:0'):
        # 将x对应的embedding向量从embedding里面拿出来
        x_embed = tf.nn.embedding_lookup(embedding, x)
        return x_embed

# 损失函数定义
先分别计算出正样本和采样出的负样本对应的output和label\
再通过sigmoid cross entropy 来极端output和label的loss

In [18]:
def nce_loss(x_embed, y):
    with tf.device('/cpu:0'):
        y = tf.cast(y, tf.int64)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=y,
                           inputs=x_embed,
                           num_sampled=num_sampled,   # 采样出多少个负样本
                           num_classes=vocabulary_size
                          )
        )

        return loss

# 测试观察模块

In [19]:
def evaluate(x_embed):
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed, tf.float32)
        x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed)))   # 归一化
        embedding_norm = embedding / tf.sqrt(tf.reduce_mean(tf.square(embedding), 1, keepdims=True), tf.float32)  # 全部向量的
        cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)  # 计算余弦相似度
        return cosine_sim_op
    
    # SGD
optimizer = tf.optimizers.SGD(learning_rate)

In [20]:
# 迭代优化
def run_oprimization(x, y):
    with tf.device('/gpu:0'):
        with tf.GradientTape() as g:
            emb = get_embedding(x)
            loss = nce_loss(emb, y)
            
        # 计算梯度
        gradients = g.gradient(loss, [embedding, nce_weights, nce_biases])
        
        # 更新
        optimizer.apply_gradients(zip(gradients, [embedding, nce_weights, nce_biases]))


In [21]:
# 待测试的几个词
x_test = np.array([word2id[w.encode('utf-8')] for w in eval_words])

# 训练
for step in range(1, num_steps + 1):
    batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
    run_oprimization(batch_x, batch_y)
    
    if step % display_step == 0 or step == 1:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))
        
    if step % eval_step == 0 or step == 1:
        print("Evaluation...")
        sim = evaluate(get_embedding(x_test)).numpy()
        for i in range(len(eval_words)):
            topk = 8  # 返回前8个最相似的
            nearest = (-sim[i, :]).argsort()[1:topk + 1]
            log_str = '"%s" nearst neighbors:' % eval_words[i]
            for k in range(topk):
                log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                
            print(log_str)

step: 1, loss: 411.873840
Evaluation...
"nine" nearst neighbors: b'believes', b'combined', b'harsh', b'refused', b'\xe2\x80\x98all', b'fact,', b'going.', b'request',
"of" nearst neighbors: b'distance', b'chamber,', b'happy', b'jarndyce,', b'hospital,', b'disposition', b'begged', b'(as',
"going" nearst neighbors: b'general,', b'drawers', b'except', b'nephew,', b'sore', b'tea', b'horrible', b'stayed',
"girl" nearst neighbors: b'adore', b'flite,', b'degrees,', b'become', b'other."', b'room.', b'\xe2\x80\x9ci', b'hinder',
"love" nearst neighbors: b'thwackum,', b'garden', b'northerton', b'a-going', b"'my", b'trial', b'case.', b'aside',
step: 20000, loss: 0.207698
step: 40000, loss: 0.204800
step: 60000, loss: 0.148185
step: 80000, loss: 0.166082
step: 100000, loss: 0.095348
Evaluation...
"nine" nearst neighbors: b'believes', b'combined', b'harsh', b'refused', b'\xe2\x80\x98all', b'fact,', b'going.', b'request',
"of" nearst neighbors: b'distance', b'chamber,', b'happy', b'jarndyce,', b'hospi

In [None]:
1、云会计模型配置沟通，并在模型平台上配置了3个行业6个财务维度共计18个模型。
2、定时任务相关问题沟通讲解，就git拉入服务器导致的一些跑批任务bug进行修复。
3、