# RNN LSTM 示例

In [11]:
import os
import sys
import datetime
import collections

import numpy as np
import tensorflow as tf

下载 PTB 数据集压缩包并解压。请在 `../data/` 目录下执行以下命令，这样解压出的文件才会位于后面代码使用的 `../data/simple-examples/data/` 路径下：

```
wget https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/simple-examples.tgz
tar  xvf simple-examples.tgz
```

In [8]:
# Directory that holds the extracted dataset text files; load_data() joins it
# with 'ptb.train.txt', 'ptb.valid.txt' and 'ptb.test.txt'. The path is
# relative, so it is resolved against the notebook's working directory.
data_path = '../data/simple-examples/data/'

In [9]:
# Split a text file into tokens, marking every line break with <eos>.
def read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline character is turned into the end-of-sentence marker
    ``<eos>`` before splitting, so each line break in the file shows up as
    one ``<eos>`` token in the result.

    Args:
        filename: Path of the text file; it is opened with
            ``tf.gfile.GFile`` in text mode.

    Returns:
        list of str: The tokens in file order, including the ``<eos>``
        markers. An empty file yields an empty list.
    """
    with tf.gfile.GFile(filename, 'r') as f:
        # Pad the marker with spaces so it is always split off as its own
        # token. Without the padding, a file whose lines carry no
        # leading/trailing whitespace would turn "b\nc" into the single
        # bogus token "b<eos>c". Files that already have that whitespace
        # produce exactly the same tokens as before.
        return f.read().replace('\n', ' <eos> ').split()
    
    
# Build the mapping from each word to a unique integer id.
# Ids follow descending frequency: the more often a word occurs in the file,
# the smaller its id.
# Original author's note (about the PTB training file, not verifiable from
# this code alone): "the" is the most frequent word and gets id 0, and
# "<unk>" (unknown) is the second most frequent and gets id 1.
def build_vocab(filename):
    """Build a word -> integer id vocabulary from the words in ``filename``.

    Words are ranked by descending occurrence count, with ties broken by
    ascending string order, and numbered 0, 1, 2, ... in that ranking.

    Args:
        filename: Path of the text file, read through ``read_words``.

    Returns:
        dict: Maps every distinct token (str) to its id (int). A file with
        no tokens yields an empty dict.
    """
    # Count occurrences so that the words can be ranked by frequency.
    counter = collections.Counter(read_words(filename))

    # Sort key (-count, word): most frequent first, alphabetical among ties,
    # which makes the id assignment deterministic.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Number the ranked words directly. Unlike unpacking ``zip(*count_pairs)``,
    # this also works when the file contains no words at all.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}

# Convert the words of a file into their integer ids.
def file_to_word_ids(filename, word_to_id):
    """Translate the tokens of ``filename`` into ids using ``word_to_id``.

    Args:
        filename: Path of the text file, read through ``read_words``.
        word_to_id: Mapping from token to integer id.

    Returns:
        list: The ids of the file's tokens in file order. Tokens that are
        not keys of ``word_to_id`` are skipped, so the result can be shorter
        than the token sequence.
    """
    ids = []
    for token in read_words(filename):
        # Out-of-vocabulary tokens are dropped rather than raising KeyError.
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids




In [19]:
def load_data(data_dir=None):
    """Load the train/valid/test text files as lists of word ids.

    The vocabulary is built from the training file only. The same vocabulary
    is then used to convert all three files; ``file_to_word_ids`` skips any
    word that is not in it.

    The vocabulary size and the lengths of the training and validation data
    are printed as a quick sanity check.

    Args:
        data_dir: Directory containing ``ptb.train.txt``, ``ptb.valid.txt``
            and ``ptb.test.txt``. Defaults to the notebook-level
            ``data_path``.

    Returns:
        tuple: ``(train_data, valid_data, test_data, vocab_size, id_to_word)``
        where the three ``*_data`` items are lists of int ids, ``vocab_size``
        is the number of distinct words in the training file, and
        ``id_to_word`` is the inverse mapping (id -> word) of the vocabulary.
    """
    if data_dir is None:
        data_dir = data_path

    train_path = os.path.join(data_dir, 'ptb.train.txt')
    valid_path = os.path.join(data_dir, 'ptb.valid.txt')
    test_path = os.path.join(data_dir, 'ptb.test.txt')

    # Build the vocabulary: every distinct training word gets a unique id.
    word_to_id = build_vocab(train_path)

    # Training, validation and test data as id sequences.
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)

    # Number of distinct words.
    vocab_size = len(word_to_id)

    # Inverse vocabulary, used to turn ids back into words.
    id_to_word = {idx: word for word, idx in word_to_id.items()}

    print(vocab_size)
    print("train_data length: {}".format(len(train_data)))
    print("valid_data length: {}".format(len(valid_data)))

    return train_data, valid_data, test_data, vocab_size, id_to_word


# Bind the result to names instead of leaving a bare call as the cell's last
# expression: the notebook would otherwise echo the returned id lists.
train_data, valid_data, test_data, vocab_size, id_to_word = load_data()

10000
train_data length: 929589
valid_data length: 73760


In [1]:
class Model(object):
    def __init__(self, input_obj, is_training, hidden_size, 
                 vocab_size, num_layers, dropout=0.5, init_scale=0.05):
        self.is_training = is_training
        self.input_obj = input_obj
        self.batch_size = input_obj.batch_size
        self.num_steps = input_obj.num_steps
        self.hidden_size = hidden_size
        
        with tf.device('/cpu:0'):
            # 创建 词向量（Word Embedding），Embedding 表示 Dense Vector（密集向量）
            # 词向量本质上是一种单词聚类（Clustering）的方法
            embedding = tf.Variable(tf.random_uniform([vocab_size, self.hidden_size], -init_scale, init_scale))
            # embedding_lookup 返回词向量
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)
            
            if is_training and dropout < 1:
                inputs = tf.nn.dropout(inputs, dropout)
                