In [43]:
import numpy as np
import jieba
import tensorflow as tf
import random
import math
import time
from scipy.sparse import csr_matrix
from collections import Counter
from scipy.spatial.distance import cosine

#### 数据处理

##### 检测英文

In [4]:
def check_word(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False
print(check_word('田东亚'))
print(check_word('math'))

True
False


##### 加载数据并且去掉词频小于3和词频高于阈值的词

In [5]:
with open("wiki_temp.txt",'r',encoding = 'utf-8') as f0:
    words = []
    t = 1e-3
    for line in f0.readlines():
        temp_line = line.split(' ')
        words += temp_line
        
    print(len(words))
    
    word_counts = Counter(words)
    temp_word = [word for word in words if word_counts[word] > 3 and len(word) > 0 and check_word(word)]
    print(len(temp_word))
    print(temp_word[:10])
    
    result_word = []
    temp_counts = Counter(temp_word)
    for word in temp_word:
        p_w = 1 - np.sqrt(t / (temp_counts[word]/len(temp_word)))
        #print(p_w)
        if p_w <= 0.8:
            result_word.append(word)
    print(len(result_word))
    print(result_word[:10])

43848
15508
['世纪', '的', '古希腊', '数学家', '现在', '被', '认为', '是', '几何', '之']
13799
['世纪', '古希腊', '数学家', '现在', '被', '认为', '是', '几何', '之', '为']


##### 构建映射（词和index以及index和词）

In [7]:
V = set(result_word)
word_index = {word: index for index, word in enumerate(V)}
index_word = {index: word for index, word in enumerate(V)}

train_word = [word_index[w] for w in result_word]
print(len(train_word))
print(train_word[:10])

13799
[654, 926, 708, 439, 536, 548, 809, 37, 531, 362]


##### 构建标签

In [36]:
def get_targets(words, index, window_size=2):
    targets = []
    forword = []
    backword = []
    if index - window_size > 0 and index + window_size < len(words):
        forword = words[index-window_size : index]
        backword = words[index+1 : index+window_size+1]
        targets = forword + backword
    
    return targets

In [29]:
lable1 = get_targets(train_word,5,2)
print(lable1)

[536, 809, 37, 439]


In [35]:
lable1 = get_targets(train_word,5,2)
print(train_word[5-1])
print(lable1)

536
[439, 536, 809, 37]


##### 构建数据集

In [19]:
def get_train_data(train_word,batch_size):
    x = []
    y = []
    train_data = []
    index = random.sample(range(0,len(train_word)),batch_size)
    #print(index[:10])
    for i in index:
        lable = get_targets(train_word,i,2)
        x = [train_word[i]] * len(lable)
        y = (lable)
        train_data.append([x,y])
    return train_data

In [22]:
train_data = get_train_data(train_word,10)
print(train_data[3])

[[177, 177, 177, 177], [144, 453, 958, 903]]


#### 构建神经网络

In [25]:
#创建图
train_graph = tf.Graph()
with train_graph.as_default():
    #占位inputs和labels是训练数据
    inputs = tf.placeholder(tf.int32, shape=[None], name='inputs')
    labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')
    tests = tf.placeholder(tf.int32,shape = [None],name = 'tests')
    
    #构建权重矩阵
    vocab_size = len(train_word)
    embedding_size = 100 # 嵌入维度
    embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)
    
    #构建softmax
    n_sampled = 5
    softmax_w = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(vocab_size))
    
    # 计算negative sampling下的损失
    #loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, vocab_size)
    cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, vocab_size))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    saver = tf.train.Saver()

##### 创建会话

In [38]:
with tf.Session(graph=train_graph) as sess:
    epochs = 10
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    print("开始训练：")
    for e in range(1, epochs+1):
        batches = get_train_data(train_word, 300)
        start = time.time()
        for x, y in batches:
            #print(x.toarray()[0])
            #print(y.toarray()[0])
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            loss += train_loss
            
            if iteration % 300 == 0: 
                end = time.time()
                print("Epoch {}/{}".format(e, epochs),"Iteration: {}".format(iteration),"Avg. Training loss: {:.4f}".format(loss/300),"{:.4f} sec/batch".format((end-start)/300))
                loss = 0
                start = time.time()
            
            iteration += 1
        #归一化
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
        normalized_embedding = embedding / norm
    print("训练结束！")
    embed_mat = sess.run(normalized_embedding)
    save_path = saver.save(sess, "checkpoints/tdy.ckpt")

开始训练：
Epoch 1/10 Iteration: 300 Avg. Training loss: nan 0.0252 sec/batch
Epoch 2/10 Iteration: 600 Avg. Training loss: 2.8024 0.0248 sec/batch
Epoch 3/10 Iteration: 900 Avg. Training loss: 2.7043 0.0255 sec/batch
Epoch 4/10 Iteration: 1200 Avg. Training loss: 2.6733 0.0249 sec/batch
Epoch 5/10 Iteration: 1500 Avg. Training loss: 2.6583 0.0233 sec/batch
Epoch 6/10 Iteration: 1800 Avg. Training loss: 2.5222 0.0277 sec/batch
Epoch 7/10 Iteration: 2100 Avg. Training loss: 2.5819 0.0255 sec/batch
Epoch 8/10 Iteration: 2400 Avg. Training loss: 2.6101 0.0249 sec/batch
Epoch 9/10 Iteration: 2700 Avg. Training loss: 2.5356 0.0243 sec/batch
Epoch 10/10 Iteration: 3000 Avg. Training loss: 2.3549 0.0258 sec/batch
训练结束！


##### 结果&计算相似度

In [45]:
def sim(s1,s2):
    with tf.Session(graph=train_graph) as sess:
        model_file=tf.train.latest_checkpoint('checkpoints/')
        saver.restore(sess,model_file)
        valid_embedding = tf.nn.embedding_lookup(normalized_embedding,word_index[s1])
        #print(valid_embedding.eval())
        valid_embedding1 = tf.nn.embedding_lookup(normalized_embedding,word_index[s2])
        #print(valid_embedding1.eval())
        vec1 = valid_embedding.eval()
        vec2 = valid_embedding1.eval()
        print("{} & {} 之间的相似度为：{}".format(s1,s2,cosine(vec1, vec2)))

In [47]:
sim("数学家","哲学家")

INFO:tensorflow:Restoring parameters from checkpoints/tdy.ckpt
数学家 & 哲学家 之间的相似度为：0.843086913228035
