In [1]:
import tensorflow as tf
import numpy as np
import math
import collections
import pickle as pkl
from pprint import pprint
import re
import os.path as path
import os
import pandas as pd

learn_rate = 0.05      # step size actually passed to the SGD optimizer in build_graph
embedding_size = 16    # default embedding dimensionality for word2vec.__init__
window_size = 4        # default one-sided context window length for word2vec.__init__


class word2vec():
    """Skip-gram style embedding model trained with an NCE loss (TensorFlow 1.x graph API).

    The model owns its own ``tf.Graph`` and ``tf.Session``.  It can be built
    from a vocabulary list, or restored from a directory previously written
    by ``save_model``.
    """

    # Attributes persisted to / restored from ``params.pkl``.
    _PARAM_NAMES = (
        'vocab_size',          # model parameters
        'vocab_list',
        'learning_rate',
        'word2id',
        'embedding_size',
        'logdir',
        'win_len',
        'num_sampled',
        'train_words_num',     # training counters
        'train_sents_num',
        'train_times_num',
        'train_loss_records',  # recent losses
        'train_loss_k10',
    )

    def __init__(self,
                 vocab_list=None,
                 embedding_size=embedding_size,
                 win_len=window_size,  # one-sided window length
                 num_sampled=1000,
                 learning_rate=1.0,
                 logdir='/tmp/simple_word2vec',
                 model_path=None
                 ):
        """Build a new model, or restore one from ``model_path``.

        Args:
            vocab_list: list of vocabulary items; position in the list is the
                item's id.  Required when ``model_path`` is None.
            embedding_size: dimensionality of each embedding vector.
            win_len: number of context positions taken on each side of the
                centre word.
            num_sampled: number of negative samples passed to the NCE loss.
            learning_rate: stored on the instance and saved with the model.
                NOTE(review): the optimizer in ``build_graph`` reads the
                module-level ``learn_rate`` instead, so this value currently
                has no effect on training.
            logdir: directory the TensorBoard summary writer writes to.
            model_path: directory written by ``save_model``; when given, all
                other arguments are ignored and read from disk instead.
        """
        # Batch dimension of the placeholders; None lets every call feed a
        # different number of training pairs.
        self.batch_size = None
        if model_path is not None:
            self.load_model(model_path)
        else:
            # model parameters
            assert isinstance(vocab_list, list)
            self.vocab_list = vocab_list
            self.vocab_size = len(vocab_list)
            self.embedding_size = embedding_size
            self.win_len = win_len
            self.num_sampled = num_sampled
            self.learning_rate = learning_rate
            self.logdir = logdir

            # word => id mapping (on duplicates the last position wins)
            self.word2id = {word: idx for idx, word in enumerate(vocab_list)}

            # training counters
            self.train_words_num = 0  # number of (input, label) pairs trained
            self.train_sents_num = 0  # number of sentences trained
            self.train_times_num = 0  # number of training calls (one call may hold several sentences)

            # recent training losses
            self.train_loss_records = collections.deque(maxlen=10)  # last 10 loss values
            self.train_loss_k10 = 0

        self.build_graph()
        self.init_op()
        if model_path is not None:
            # Variables were just initialised by init_op(); overwrite them
            # with the saved checkpoint.
            tf_model_path = os.path.join(model_path, 'tf_vars')
            self.saver.restore(self.sess, tf_model_path)

    def init_op(self):
        """Create the session, initialise variables and open the summary writer."""
        self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init)
        self.summary_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)

    def build_graph(self):
        """Define placeholders, variables, NCE loss, train op and similarity op."""
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
            self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
            self.embedding_dict = tf.Variable(
                tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0)
            )
            self.nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embedding_size],
                                                              stddev=1.0/math.sqrt(self.embedding_size)))
            self.nce_biases = tf.Variable(tf.zeros([self.vocab_size]))

            # Look up the embedding vector of every input id.
            embed = tf.nn.embedding_lookup(self.embedding_dict, self.train_inputs)

            # NCE loss averaged over the batch.
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(
                    weights=self.nce_weight,
                    biases=self.nce_biases,
                    labels=self.train_labels,
                    inputs=embed,
                    num_sampled=self.num_sampled,
                    num_classes=self.vocab_size
                )
            )

            # TensorBoard: record the loss.
            tf.summary.scalar('loss', self.loss)

            # Training op: plain SGD on the NCE loss.
            # NOTE(review): uses the module-level `learn_rate`, not
            # self.learning_rate -- kept as-is so existing runs behave the same.
            self.train_op = tf.train.GradientDescentOptimizer(learning_rate=learn_rate).minimize(self.loss)

            # Similarity between a set of query ids and the whole vocabulary.
            self.test_word_id = tf.placeholder(tf.int32, shape=[None])
            vec_l2_model = tf.sqrt(  # L2 norm of every embedding vector
                tf.reduce_sum(tf.square(self.embedding_dict), 1, keep_dims=True)
            )

            avg_l2_model = tf.reduce_mean(vec_l2_model)
            tf.summary.scalar('avg_vec_model', avg_l2_model)

            # Dot products of L2-normalised vectors, i.e. cosine similarity.
            self.normed_embedding = self.embedding_dict / vec_l2_model
            test_embed = tf.nn.embedding_lookup(self.normed_embedding, self.test_word_id)
            self.similarity = tf.matmul(test_embed, self.normed_embedding, transpose_b=True)

            # Variable initialisation.
            self.init = tf.global_variables_initializer()

            self.merged_summary_op = tf.summary.merge_all()

            self.saver = tf.train.Saver()

    def train_by_sentence(self, input_sentence=[]):
        """Run one training step on every (centre, context) pair of the given sentences.

        Args:
            input_sentence: list of sentences, each a sequence of vocabulary
                items, e.g. ``[['a', 'b', 'c'], ...]``.  The default list is
                never mutated.

        Items missing from the vocabulary are skipped, both as centre word and
        as context word.  If no pair can be formed the call returns without
        running the graph and without touching the training counters.
        """
        batch_inputs = []
        batch_labels = []
        for sent in input_sentence:
            # Resolve ids once per sentence; None marks an unknown item.
            sent_ids = [self.word2id.get(word) for word in sent]
            sent_len = len(sent_ids)
            for i, input_id in enumerate(sent_ids):
                # Compare against None explicitly: id 0 is a valid id and must
                # not be treated as "missing".
                if input_id is None:
                    continue
                start = max(0, i - self.win_len)
                end = min(sent_len, i + self.win_len + 1)
                for index in range(start, end):
                    label_id = sent_ids[index]
                    if index == i or label_id is None:
                        continue
                    batch_inputs.append(input_id)
                    batch_labels.append(label_id)
        if not batch_inputs:
            return
        batch_inputs = np.array(batch_inputs, dtype=np.int32)
        # Labels are fed as a column vector, matching the [batch, 1] placeholder.
        batch_labels = np.array(batch_labels, dtype=np.int32).reshape(-1, 1)

        feed_dict = {
            self.train_inputs: batch_inputs,
            self.train_labels: batch_labels
        }
        _, loss_val, summary_str = self.sess.run(
            [self.train_op, self.loss, self.merged_summary_op], feed_dict=feed_dict)

        # Running mean over the most recent losses (at most 10).
        self.train_loss_records.append(loss_val)
        self.train_loss_k10 = np.mean(self.train_loss_records)
        if self.train_sents_num % 1000 == 0:
            self.summary_writer.add_summary(summary_str, self.train_sents_num)
            print("{a} sentences dealed, loss: {b}"
                  .format(a=self.train_sents_num, b=self.train_loss_k10))

        # training counters
        self.train_words_num += len(batch_inputs)
        self.train_sents_num += len(input_sentence)
        self.train_times_num += 1

    def cal_similarity(self, test_word_id_list, top_k=10):
        """Return the ``top_k`` most similar vocabulary items for each query id.

        Args:
            test_word_id_list: list of vocabulary ids to query.
            top_k: number of neighbours to return per query.

        Returns:
            Tuple ``(test_words, near_words, sim_mean, sim_var)``: the query
            items, one neighbour list per query, and the mean and variance of
            the whole similarity matrix.
        """
        sim_matrix = self.sess.run(self.similarity, feed_dict={self.test_word_id: test_word_id_list})
        sim_mean = np.mean(sim_matrix)
        sim_var = np.mean(np.square(sim_matrix - sim_mean))
        test_words = []
        near_words = []
        for row, word_id in enumerate(test_word_id_list):
            test_words.append(self.vocab_list[word_id])
            # Rank by descending similarity and drop the top-ranked entry,
            # which is presumably the query word itself.
            nearst_id = (-sim_matrix[row, :]).argsort()[1:top_k+1]
            near_words.append([self.vocab_list[x] for x in nearst_id])
        print(near_words)
        return test_words, near_words, sim_mean, sim_var

    def save_model(self, save_path):
        """Write the model to the directory ``save_path``.

        Writes ``params.pkl`` (pickled Python-side state) and the TensorFlow
        checkpoint ``tf_vars`` into ``save_path``, and additionally exports
        the embedding table to ``temp/char_embedding.csv``.

        Raises:
            RuntimeError: if ``save_path`` is an existing regular file.
        """
        import pandas as pd

        if os.path.isfile(save_path):
            raise RuntimeError('the save path should be a dir')
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        # Fetch the embedding matrix once and reuse it below.
        embeddings = self.sess.run(self.embedding_dict)
        print(len(self.vocab_list))
        print(len(embeddings))
        print(type(self.vocab_list))
        print(len(list(embeddings)))

        # CSV export (fixed location); make sure its directory exists first.
        csv_path = 'temp/char_embedding.csv'
        csv_dir = os.path.dirname(csv_path)
        if not os.path.isdir(csv_dir):
            os.makedirs(csv_dir)
        my_save = pd.DataFrame({'chars': self.vocab_list, 'embedding': list(embeddings)})
        my_save.to_csv(csv_path, index=False, sep=',', encoding='utf-8')

        # Python-side parameters and training state.
        model = {name: getattr(self, name) for name in self._PARAM_NAMES}

        param_path = os.path.join(save_path, 'params.pkl')
        if os.path.exists(param_path):
            os.remove(param_path)
        with open(param_path, 'wb') as f:
            pkl.dump(model, f)

        # TensorFlow variables.
        tf_path = os.path.join(save_path, 'tf_vars')
        if os.path.exists(tf_path):
            os.remove(tf_path)
        self.saver.save(self.sess, tf_path)

    def load_model(self, model_path):
        """Restore the Python-side state saved by ``save_model``.

        Only ``params.pkl`` is read here; TensorFlow variables are restored
        by ``__init__`` once the graph has been built.

        Raises:
            RuntimeError: if ``model_path`` does not exist.
        """
        if not os.path.exists(model_path):
            raise RuntimeError('file not exists')
        param_path = os.path.join(model_path, 'params.pkl')
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only load model directories from a trusted source.
        with open(param_path, 'rb') as f:
            model = pkl.load(f)
        for name in self._PARAM_NAMES:
            setattr(self, name, model[name])

if __name__=='__main__':
    # Script entry point: build a character vocabulary, train character
    # embeddings on the training corpus, print nearest neighbours for a few
    # probe characters and save the model.

    # Step 1: read every character from the first line of the character file.
    # The context manager closes the file even if decoding fails.
    with open("chars_all.txt") as allchar_txt:
        line = allchar_txt.readline().decode('utf-8')
    # Drop separator characters, then make sure common punctuation is present.
    chars_all = set(line)-set(u',\{\} ')
    chars_all = chars_all | set(u'，。？！')
    char_list = list(chars_all)
    print('文本中总共有{n1}个中文字符,全部字符进入字典'.format(n1=len(char_list)))

    # Step 2: read the corpus; every sentence becomes a list of characters.
    data = pd.read_csv("new_chinese_train.csv", encoding='utf-8',header=0)
    sentence_list = [list(content) for content in data.content]
    print('一共有{n}个句子'.format(n=len(sentence_list)))

    # Step 3: create the model.
    w2v = word2vec(vocab_list=char_list,    # vocabulary
                   embedding_size=16,
                   win_len=4,
                   learning_rate=1,
                   num_sampled=100,         # number of negative samples
                   logdir='/tmp/280')       # TensorBoard log directory

    test_word = [u'头',u'手',u'机',u'长',u'尊',u'皱',u'吃',u'喝',u'中',u'英',u'胜',u'东',u'男',u'蓝',u'南',u'地',u'的',u'育',u'酒',u'救']
    test_id = [char_list.index(x) for x in test_word]

    # Step 4: train one sentence per step, cycling through the corpus.
    num_steps = 1000000
    for i in range(num_steps):
        sent = sentence_list[i%len(sentence_list)]
        w2v.train_by_sentence([sent])

    # Step 5: show the nearest neighbours of every probe character.
    near_list = w2v.cal_similarity(test_id)[1]
    for word, simlist in zip(test_word, near_list):
        print(word.encode('utf-8'))
        # Join into a single string before printing: printing a tuple shows
        # escaped byte reprs instead of the characters.  The first eight
        # neighbours are shown (the earlier code listed index 6 twice).
        print(u' '.join(simlist[:8]).encode('utf-8'))
    w2v.save_model('temp/')

文本中总共有6756个中文字符,全部字符进入字典
一共有490259个句子
0 sentences dealed, loss: 319.42376709
1000 sentences dealed, loss: 166.099380493
2000 sentences dealed, loss: 84.8965911865
3000 sentences dealed, loss: 32.1872406006
4000 sentences dealed, loss: 25.3989276886
5000 sentences dealed, loss: 19.6243610382
6000 sentences dealed, loss: 14.8132171631
7000 sentences dealed, loss: 11.492647171
8000 sentences dealed, loss: 9.64089870453
9000 sentences dealed, loss: 8.96515655518
10000 sentences dealed, loss: 8.74445152283
11000 sentences dealed, loss: 6.27566814423
12000 sentences dealed, loss: 7.05991363525
13000 sentences dealed, loss: 7.9525809288
14000 sentences dealed, loss: 5.72894859314
15000 sentences dealed, loss: 7.75637340546
16000 sentences dealed, loss: 4.61510181427
17000 sentences dealed, loss: 4.04460573196
18000 sentences dealed, loss: 5.63016414642
19000 sentences dealed, loss: 4.59554672241
20000 sentences dealed, loss: 4.15906858444
21000 sentences dealed, loss: 4.20365142822
22000 sent

185000 sentences dealed, loss: 2.89502763748
186000 sentences dealed, loss: 2.87102651596
187000 sentences dealed, loss: 3.02389168739
188000 sentences dealed, loss: 3.1589486599
189000 sentences dealed, loss: 2.88454389572
190000 sentences dealed, loss: 2.69530701637
191000 sentences dealed, loss: 2.90977215767
192000 sentences dealed, loss: 3.27319598198
193000 sentences dealed, loss: 3.19958162308
194000 sentences dealed, loss: 3.32260322571
195000 sentences dealed, loss: 3.16411924362
196000 sentences dealed, loss: 2.95315742493
197000 sentences dealed, loss: 2.96916389465
198000 sentences dealed, loss: 2.96893024445
199000 sentences dealed, loss: 2.91722989082
200000 sentences dealed, loss: 3.02858304977
201000 sentences dealed, loss: 3.37296867371
202000 sentences dealed, loss: 3.0125207901
203000 sentences dealed, loss: 2.65308618546
204000 sentences dealed, loss: 2.26431417465
205000 sentences dealed, loss: 3.32464146614
206000 sentences dealed, loss: 2.65362691879
207000 sente

368000 sentences dealed, loss: 1.75183558464
369000 sentences dealed, loss: 1.16776549816
370000 sentences dealed, loss: 1.59704661369
371000 sentences dealed, loss: 1.75757443905
372000 sentences dealed, loss: 1.68676304817
373000 sentences dealed, loss: 1.33520317078
374000 sentences dealed, loss: 1.65106928349
375000 sentences dealed, loss: 1.47059953213
376000 sentences dealed, loss: 1.27545464039
377000 sentences dealed, loss: 0.974314033985
378000 sentences dealed, loss: 1.40367519855
379000 sentences dealed, loss: 0.850209832191
380000 sentences dealed, loss: 1.47689330578
381000 sentences dealed, loss: 1.62247526646
382000 sentences dealed, loss: 1.12117123604
383000 sentences dealed, loss: 1.40801918507
384000 sentences dealed, loss: 1.79937744141
385000 sentences dealed, loss: 1.10926568508
386000 sentences dealed, loss: 1.648889184
387000 sentences dealed, loss: 1.05383527279
388000 sentences dealed, loss: 0.728933334351
389000 sentences dealed, loss: 1.86671710014
390000 se

550000 sentences dealed, loss: 2.57941842079
551000 sentences dealed, loss: 2.69429898262
552000 sentences dealed, loss: 2.67031979561
553000 sentences dealed, loss: 2.91149735451
554000 sentences dealed, loss: 2.9244787693
555000 sentences dealed, loss: 2.81034970284
556000 sentences dealed, loss: 2.71847605705
557000 sentences dealed, loss: 2.99836444855
558000 sentences dealed, loss: 2.60815668106
559000 sentences dealed, loss: 2.85789585114
560000 sentences dealed, loss: 3.01768183708
561000 sentences dealed, loss: 2.52918720245
562000 sentences dealed, loss: 2.70776891708
563000 sentences dealed, loss: 2.82660222054
564000 sentences dealed, loss: 2.91375470161
565000 sentences dealed, loss: 2.53144025803
566000 sentences dealed, loss: 2.86877584457
567000 sentences dealed, loss: 2.97989416122
568000 sentences dealed, loss: 2.53290343285
569000 sentences dealed, loss: 2.5851650238
570000 sentences dealed, loss: 2.4205892086
571000 sentences dealed, loss: 3.27186536789
572000 senten

733000 sentences dealed, loss: 2.3186173439
734000 sentences dealed, loss: 2.79251813889
735000 sentences dealed, loss: 2.31188726425
736000 sentences dealed, loss: 2.3840174675
737000 sentences dealed, loss: 2.46898078918
738000 sentences dealed, loss: 2.64057207108
739000 sentences dealed, loss: 2.09118795395
740000 sentences dealed, loss: 2.50765562057
741000 sentences dealed, loss: 2.14096498489
742000 sentences dealed, loss: 2.74138998985
743000 sentences dealed, loss: 2.45979809761
744000 sentences dealed, loss: 2.68458628654
745000 sentences dealed, loss: 1.8776652813
746000 sentences dealed, loss: 2.52517604828
747000 sentences dealed, loss: 2.19527935982
748000 sentences dealed, loss: 1.77589154243
749000 sentences dealed, loss: 2.22947788239
750000 sentences dealed, loss: 2.63180589676
751000 sentences dealed, loss: 2.32637214661
752000 sentences dealed, loss: 2.65154051781
753000 sentences dealed, loss: 2.21066522598
754000 sentences dealed, loss: 2.47215390205
755000 senten

915000 sentences dealed, loss: 1.57037270069
916000 sentences dealed, loss: 1.27569937706
917000 sentences dealed, loss: 0.601967275143
918000 sentences dealed, loss: 1.18414473534
919000 sentences dealed, loss: 1.31255269051
920000 sentences dealed, loss: 0.780288159847
921000 sentences dealed, loss: 0.830997645855
922000 sentences dealed, loss: 1.33247625828
923000 sentences dealed, loss: 1.53611671925
924000 sentences dealed, loss: 1.31761789322
925000 sentences dealed, loss: 1.40636563301
926000 sentences dealed, loss: 1.16915416718
927000 sentences dealed, loss: 1.04609584808
928000 sentences dealed, loss: 1.32991051674
929000 sentences dealed, loss: 0.711790978909
930000 sentences dealed, loss: 0.305884212255
931000 sentences dealed, loss: 1.16384208202
932000 sentences dealed, loss: 1.57277071476
933000 sentences dealed, loss: 0.924256026745
934000 sentences dealed, loss: 1.41814887524
935000 sentences dealed, loss: 1.41026699543
936000 sentences dealed, loss: 1.6938803196
93700