In [125]:
import numpy as np
import tensorflow as tf
import time
import random
import math

Step1: 对于文本的读取以及预处理

In [2]:
## 读取text文档
f = open('peotryFromTang.txt',encoding = 'gbk')
lines = f.readlines()
f.close()

In [3]:
## 预处理一：去除字符串中间的'\n',合并同一首诗的字符串
peotry = []
item = ''
for i in range(len(lines)):
    if (lines[i] != '\n'):
        item += lines[i].replace('\n','')
    else:
        peotry.append(item)
        item = ''

peotry.remove('')
    


In [84]:
## 预处理二：按照句号划分每条输入
peotry_new = []
for p in peotry:
    li = p.split('。')
    for s in li:
        if (s != ''):
            peotry_new.append(s)

In [85]:
## 预处理三：按照字符分割每首诗词
peotry_seg = []
for p in peotry_new:
    seg = []
    for ch in p:
        seg.append(ch)
    peotry_seg.append(seg)

In [86]:
## 预处理四：定义词汇表

# 将二维的tokens转成一维tokens
all_characters = []
for items in peotry_seg:
    for item in items:
        all_characters.append(item)

# 生成原始的词汇集合
unique_characters = []
for token in all_characters:
    if token not in unique_characters:
        unique_characters.append(token)
unique_characters.append('_PAD')
unique_characters.append('_UNK')
unique_characters.append('_STA')
unique_characters.append('_END')

# 统计每个词出现的频率
dic = {}
for token in unique_characters:
    dic[token] = all_characters.count(token)
        
        

In [87]:
## 预处理五：计算文本长度，统一添加padding

# 计算每个句子的长度并存储
lens = []
for words in peotry_seg:
    lens.append(len(words))
print(max(lens))

# 补全句子
peotry_pad = []
for words in peotry_seg:
    words.append('_END')
    words.insert(0,'_STA')
    for i in range(max(lens)-len(words)+3):    ## 对于最大长度的句子，也再多补充一个_PAD
        words.append('_PAD')
    peotry_pad.append(words)


lens = np.array(lens)+2

##定义weights——weights：这个比较复杂，对于非PADDING输入1，PADDING输入0
weights = []
for item in lens:
    weights.append(np.append(np.ones(item),np.zeros(max(lens)-item)))

39


In [88]:
## 预处理六：对每个词汇进行编码，并将每个数据进行表征
vocab = {}
for i in range(len(unique_characters)):
    vocab[unique_characters[i]] = i

representations = []
for words in peotry_seg:
    reps = []
    for word in words:
        if (word in unique_characters):
            reps.append(vocab[word])
        else:
            reps.append(vocab['_UNK'])
    representations.append(reps)

In [89]:
## 预处理七：定义x，y
x = []
y = []
for reps in representations:
    x.append(reps[:(len(reps)-1)])
    y.append(reps[1:len(reps)])
    


In [90]:
## 预处理八：划分数据
def data_division(x,y,lens):
    n_train = int(len(x)*0.8)
    n_test = int(len(x)*0.1)
    n_dev = int(len(x) - n_train -n_test)

    index_train = random.sample(range(len(x)),n_train)
    index_dev = random.sample(list(set(range(len(x)))^set(index_train)) ,n_dev)
    index_test = list(set(list(set(range(len(x)))^set(index_train)))^set(index_dev)) 

    x_train = []
    y_train = []
    weights_train = []
    for index in index_train:
        x_train.append(x[index])
        y_train.append(y[index])
        weights_train.append(weights[index])

    x_dev = []
    y_dev = []
    weights_dev = []
    for index in index_dev:
        x_dev.append(x[index])
        y_dev.append(y[index])
        weights_dev.append(weights[index])

    x_test = []
    y_test = []
    weights_test = []
    for index in index_test:
        x_test.append(x[index])
        y_test.append(y[index])
        weights_test.append(weights[index])

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_dev = np.array(x_dev)
    y_dev = np.array(y_dev)

    x_test = np.array(x_test)
    y_test = np.array(y_test)
    
    return x_train,y_train,weights_train,x_dev,y_dev,weights_dev,x_test,y_test,weights_test

In [91]:
## 数据类
class data_set(object):
    def __init__(self):
        data = data_division(x,y,lens)
        self.x_train = data[0]
        self.y_train = data[1]
        self.weights_train = data[2]
        self.x_dev = data[3]
        self.y_dev = data[4]
        self.weights_dev = data[5]
        self.x_test = data[6]
        self.y_test = data[7]
        self.weights_test = data[8]            

Step2：RNN语言模型           参考：https://blog.csdn.net/felaim/article/details/70184697    https://www.cnblogs.com/wuzhitj/p/6297992.html

In [180]:
## 首先定义设置类内容
class config(object):
    init_scale = 0.1                            # 相关参数的初始值为随机均匀分布，范围是[-init_scale,+init_scale]
    learning_rate = 1.0                         # 学习速率,在文本循环次数超过max_epoch以后会逐渐降低
    max_grad_norm = 2                           # 用于控制梯度膨胀，如果梯度向量的L2模超过max_grad_norm，则等比例缩小
    num_layers = 2                              # lstm层数
    num_steps = max(lens)                       # 单个数据中，序列的长度。
    hidden_size = 128                           # 隐藏层中单元数目
    max_epoch = 2                               # epoch<max_epoch时，lr_decay值=1,epoch>max_epoch时,lr_decay逐渐减小
    max_max_epoch = 5                           # 指的是整个文本循环次数。
    keep_prob = 0.9                             # 用于dropout.每批数据输入时神经网络中的每个单元会以1-keep_prob的概率不工作，可以防止过拟合
    lr_decay = 0.5                              # 学习速率衰减
    batch_size = 50                             # 每批数据的规模，每批有20个。
    vocab_size = len(unique_characters)         # 词典规模

class val_config(object):
    init_scale = 0.1                            # 相关参数的初始值为随机均匀分布，范围是[-init_scale,+init_scale]
    learning_rate = 1.0                         # 学习速率,在文本循环次数超过max_epoch以后会逐渐降低
    max_grad_norm = 2                           # 用于控制梯度膨胀，如果梯度向量的L2模超过max_grad_norm，则等比例缩小
    num_layers = 2                              # lstm层数
    num_steps = max(lens)                       # 单个数据中，序列的长度。
    hidden_size = 128                           # 隐藏层中单元数目
    max_epoch = 2                               # epoch<max_epoch时，lr_decay值=1,epoch>max_epoch时,lr_decay逐渐减小
    max_max_epoch = 5                           # 指的是整个文本循环次数。
    keep_prob = 0.9                             # 用于dropout.每批数据输入时神经网络中的每个单元会以1-keep_prob的概率不工作，可以防止过拟合
    lr_decay = 0.5                              # 学习速率衰减
    batch_size = 6                              # 每批数据的规模，每批有20个。
    vocab_size = len(unique_characters)         # 词典规模
    
class eval_config(object):
    init_scale = 0.1                            # 相关参数的初始值为随机均匀分布，范围是[-init_scale,+init_scale]
    learning_rate = 1.0                         # 学习速率,在文本循环次数超过max_epoch以后会逐渐降低
    max_grad_norm = 2                           # 用于控制梯度膨胀，如果梯度向量的L2模超过max_grad_norm，则等比例缩小
    num_layers = 2                              # lstm层数
    num_steps = max(lens)                       # 单个数据中，序列的长度。
    hidden_size = 128                           # 隐藏层中单元数目
    max_epoch = 2                               # epoch<max_epoch时，lr_decay值=1,epoch>max_epoch时,lr_decay逐渐减小
    max_max_epoch = 5                           # 指的是整个文本循环次数。
    keep_prob = 0.9                             # 用于dropout.每批数据输入时神经网络中的每个单元会以1-keep_prob的概率不工作，可以防止过拟合
    lr_decay = 0.5                              # 学习速率衰减
    batch_size = 1                              # 每批数据的规模，每批有20个。
    vocab_size = len(unique_characters)         # 词典规模

In [178]:
## 定义输入类内容
class DataInput(object):
    def __init__(self, config, data_set, division,name = None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        if (division == 'train'):
            self.epoch_size = len(data_set.x_train) // batch_size  ##若输入数据是PTB类型整段文本数据，需要再除以num_steps进行划分，但对于已划分好的数据就不用再除以num_steps了
            self.input_data = data_set.x_train
            self.target_data = data_set.y_train
            self.weights = data_set.weights_train
        elif (division == 'dev'):
            self.epoch_size = len(data_set.x_dev) // batch_size
            self.input_data = data_set.x_dev
            self.target_data = data_set.y_dev
            self.weights = data_set.weights_dev
        else:
            self.epoch_size = len(data_set.x_test) // batch_size
            self.input_data = data_set.x_test
            self.target_data = data_set.y_test
            self.weights = data_set.weights_test


案例中给出的epoch_size需要-1，这里不太明白为什么，本实验未-1

In [150]:
## LSTM模型
class LSTM_LM(object):
    def __init__(self, is_training,config,input_):    ## config中定义模型需要得超参数，与输入无关；input_中定义输入方式以及相关得超参。
        self._input = input_                          
        batch_size = config.batch_size                ## 批大小在input_中，决定通过mini-batch方法每次输入得数据
        num_steps = config.num_steps                  ## 步长对应得史RNN模型中每条输入得长度，这个也与输入有关
        size = config.hidden_size                     ## 隐藏层大小属于设置里面的，决定的是embedding的维度
        vocab_size = config.vocab_size                ## 词汇表大小决定的是embedding矩阵的大小
        

#定义input和target为placeholder
        self._input_x = tf.placeholder(tf.int32, shape = [batch_size,num_steps])
        self._target_y = tf.placeholder(tf.int32, shape = [batch_size,num_steps])
        self._real_weights = tf.placeholder(tf.float32, shape = [batch_size,num_steps])
        
#设置默认的LSTM单元
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(size, forget_bias = 0.0, state_is_tuple = True)  ## 定义BasicLSTMCell， state_is_tuple参数表明是否拆分输出和记忆为tuple，if False，输出和记忆会被连接
        attn_cell = lstm_cell  ## 这里并不是调用函数，而是将函数赋给一个变量，保证if没运行的时候attn_cell也有值

        if is_training and config.keep_prob < 1:  ## 只有训练时调用dropout，因此用index来指针，keep_prob表示保留的概率 
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob = config.keep_prob) ## 注意DropoutWrapper函数，output_keep_prob的含义表示保留率
        cell = tf.contrib.rnn.MultiRNNCell([attn_cell() for _ in range(config.num_layers)], state_is_tuple = True) ## for句法的使用，返回列表， 替代表达：[attn_cell()] * config.num_layers
        self._initial_state = cell.zero_state(batch_size, tf.float32) ## 状态初始化，使用的是全0初始化
        
#创建网络的词嵌入的部分
        with tf.device("/cpu:0"):  ## 指定模型运行的设备
            embedding = tf.get_variable("embedding", [vocab_size, size], dtype = tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self._input_x) ## lookup
        if is_training and config.keep_prob < 1:  ## 注意这里的结构，在embedding层也是用了dropout来进行正则化
            inputs = tf.nn.dropout(inputs, config.keep_prob)

#定义输出
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):  ## 上下文管理器，在RNN中，所有需要训练的variables都已经被封装到BasicLSTMCell中了
            for time_step in range(num_steps):  ## 该循环是在一层LSTM中对于每个cell进行迭代运算，获取当前位置的输出以及状态
                if time_step > 0:  
                    tf.get_variable_scope().reuse_variables()  ## 当time_step>0时，说明最初的所有变量已经被赋值了，再次赋值的时候需要调用reuse
                (cell_output, state) = cell(inputs[:, time_step, :], state)  ## 三维tensor与inputs相对应
                outputs.append(cell_output)
        
        
        output = tf.reshape(tf.concat(outputs, 1), [-1, size])  ## 这一步是把输出变为二维变量[batch_size*num_steps，size]的形状      
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size], dtype = tf.float32)
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype = tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b  ## Wx+b,计算线性部分，还未计算softmax
        
        weights = tf.reshape(self._real_weights,[-1])
        
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(self._target_y, [-1])],[weights])
        ## tf.contrib.legacy_seq2seq.sequence_loss_by_example()函数进行了如下的计算操作：计算每个sequence在每个位置上的log困惑度
        ## logits:直接输入每个词汇的logits；targets：直接输入每个词汇的targets
        
        
        ## 计算每个句子的困惑度
        log_perplexity = tf.reduce_mean(tf.reshape(loss,[int(np.shape(loss)[0])//num_steps,num_steps]),axis = 1)
        
        self._cost = cost = tf.reduce_sum(log_perplexity) / batch_size  ## 平均log困惑度作为cost
        self._final_state = state  ## 返回计算后的最终状态

        if not is_training:  ## 对于验证集和测试集来说，不需要进行后项过程，到这里就结束了
            return
        
#定义学习率，优化器等
        self._lr = tf.Variable(0.0, trainable = False)  ## 定义学习率为一个变量，但是是不需要训练的变量

        tvars = tf.trainable_variables()  ## tvars通过tf.trainable_variables()获得所有待训练的变量，返回的是列表
                
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm)  
        ## tf.gradients计算梯度，计算cost对于所有tvars中元素的梯度，返回梯度列表
        ## tf.clip_by_global_norm用于梯度修剪，防止梯度爆炸——若梯度超过限值max_grad_norm,则梯度等比缩小
        ## grads就是修建后的梯度值
        
        
        optimizer = tf.train.GradientDescentOptimizer(self._lr)  ## 梯度下降法
        
        
        self._train_op = optimizer.apply_gradients(zip(grads, tvars),global_step = tf.contrib.framework.get_or_create_global_step())
        ## 这一步是应用计算出来的梯度到优化器，这一步+计算梯度≈minimize
        ## global_step学习率变化时常用的参数，用于计数，表征全局步数
        ## tf.contrib.framework.get_or_create_global_step是用来返回或者创建globel_step变量的，初始从1开始；如果想从0开始，就直接定义一个变量tf.Variable(0, trainable=False) 就可以了

        
        ## 以下部分是学习率的更新
        self._new_lr = tf.placeholder(tf.float32, shape = [], name = "new_learning_rate")  ## 这个placeholder是用来存放新新进来的学习率
        self._lr_update = tf.assign(self._lr, self._new_lr)  
        ## 用新的学习率替换原来的学习率,注意，此时的self._lr已经改变，同时self._lr_update被赋予的是一个操作，而不是具体的返回值
        ## 因为tf.assign函数是会直接改变self._lr的操作，而不是返回值

    def assign_lr(self, session, lr_value):  ## 参数是session
        session.run(self._lr_update, feed_dict = {self._new_lr: lr_value})  ## 给新的学习率赋值

#利用@property装饰器可以将返回变量设为只读
    @property
    def input(self):
        return self._input
    
    @property
    def input_x(self):
        return self._input_x

    @property
    def target_y(self):
        return self._target_y
    
    @property
    def real_weights(self):
        return self._real_weights
    
    @property
    def initial_state(self):
        return self._initial_state

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def lr(self):
        return self._lr

    @property
    def train_op(self):
        return self._train_op


修改内容：
（1）tf.contrib.legacy_seq2seq.sequence_loss_by_example()函数进行了如下的计算操作：计算每个sequence在每个位置上的log困惑度
logits:直接输入每个词汇的logits；targets：直接输入每个词汇的targets
（2）增加了困惑度的计算perplexity

In [157]:
#定义训练一个epoch数据的函数
def run_epoch(session, model, config, eval_op = None, verbose = False):
    start_time = time.time() ## 记录运行时间
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)  ## 初始化模型

    fetches = {
            "cost": model.cost,
            "final_state": model.final_state,
            }                              ## 要运行的操作

    if eval_op is not None:
        fetches["eval_op"] = eval_op      ## 对于验证和测试不走反向过程
        

    for step in range(model.input.epoch_size):
        feed_dict = {}  ## 需要定义喂入的数据
        ## 每次输入一批数据
        feed_dict[model.input_x] = model.input.input_data[step*config.batch_size:(step+1)*config.batch_size]
        feed_dict[model.target_y] = model.input.target_data[step*config.batch_size:(step+1)*config.batch_size]
        feed_dict[model.real_weights] = model.input.weights[step*config.batch_size:(step+1)*config.batch_size]
        
        for i, (c, h) in enumerate(model.initial_state):  ## 这里将c，h喂入到神经网络中
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h
        ## 将c，h作为喂入的数据，只能理解成在BasicLSTMCell中，c和h都是placeholder，具体还是不太明白，先保留。

        vals = session.run(fetches, feed_dict)

        cost = vals["cost"]

        state = vals["final_state"]

        costs += cost
        iters += 1

        if verbose:
            print ("%.3f perplexity: %.3f speed : %.0f wps" 
                %(step * 1.0 / model.input.epoch_size, np.exp(costs / iters), 
                iters * model.input.batch_size / (time.time() - start_time)))
    print(np.exp(costs / iters))
    return np.exp(costs / iters)

对照源码，以下几个问题不太明白：
（1）关于c，h作为占位符喂入模型的问题；
（2）mini-batch方法中没有体现出分批喂入的代码,所以自行调整了代码显式加入了分批喂入；
（3）这样对于训练、测试、验证分开训练，能不能共享参数——理论上似乎可以。

In [181]:
## 初始化设置
config = config()
val_config = val_config()
eval_config = eval_config()

In [183]:
## 创建图
with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
    data_input = data_set()

    with tf.name_scope("Train"):
        train_input = DataInput(config,data_input,'train')
        with tf.variable_scope("Model", reuse = None, initializer = initializer):
            m = LSTM_LM(is_training = True, config = config, input_ = train_input)

    with tf.name_scope("Valid"):
        valid_input = DataInput(val_config,data_input,'dev')
        with tf.variable_scope("Model", reuse = True, initializer = initializer):
            mvalid = LSTM_LM(is_training = False, config = val_config, input_ = valid_input)

    with tf.name_scope("Test"):
        test_input = DataInput(eval_config,data_input,'test')
        with tf.variable_scope("Model", reuse = True, initializer = initializer):
            mtest = LSTM_LM(is_training = False, config = eval_config, input_ = test_input)
            
## 创建训练的管理器
        sv = tf.train.Supervisor()
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):    ## 需要循环训练的总次数
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)  ## 按照指数衰减，本实验梅西衰减为一半  
                m.assign_lr(session, config.learning_rate * lr_decay)  ## 更换学习率

                print("Epoch: %d Learning rate: %.3f" %(i + 1, session.run(m.lr)))  
                train_perplexity = run_epoch(session, m,config ,eval_op = m.train_op, verbose = True)  ## 训练
                print("Epoch: %d Train Perplexity: %.3f" %(i + 1, train_perplexity))  
                valid_perplexity = run_epoch(session, mvalid, val_config)  ## 验证
                print("Epoch: %d valid Perplexity: %.3f" %(i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, mtest, eval_config)      ## 测试
            print("Test Perplexity: %.3f" %test_perplexity)

INFO:tensorflow:Starting standard services.
INFO:tensorflow:Starting queue runners.
Epoch: 1 Learning rate: 1.000
0.000 perplexity: 13.822 speed : 11 wps
0.053 perplexity: 14.244 speed : 13 wps
0.105 perplexity: 14.141 speed : 14 wps
0.158 perplexity: 14.296 speed : 15 wps
0.211 perplexity: 14.273 speed : 16 wps
0.263 perplexity: 14.447 speed : 16 wps
0.316 perplexity: 14.198 speed : 17 wps
0.368 perplexity: 14.411 speed : 18 wps
0.421 perplexity: 14.311 speed : 19 wps
0.474 perplexity: 14.240 speed : 19 wps
0.526 perplexity: 14.155 speed : 19 wps
0.579 perplexity: 14.138 speed : 19 wps
0.632 perplexity: 14.086 speed : 19 wps
0.684 perplexity: 14.116 speed : 20 wps
0.737 perplexity: 14.123 speed : 20 wps
0.789 perplexity: 13.992 speed : 20 wps
0.842 perplexity: 13.979 speed : 20 wps
0.895 perplexity: 14.031 speed : 21 wps
0.947 perplexity: 13.987 speed : 21 wps
13.9867448618
Epoch: 1 Train Perplexity: 13.987
14.5804061544
Epoch: 1 valid Perplexity: 14.580
Epoch: 2 Learning rate: 1.000
