In [13]:
def read_data():
    with open('./the_time_machine.txt', 'r') as txt:
        lines = txt.readlines()
    import re
    return [
        l for l in
        [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
        if l.strip() != ''
    ]


In [14]:
lines = read_data()

In [15]:
len(lines)

3093

In [16]:
def tokenize(lines, token='word'):
    return [list(line) if token == 'char' else line.split() for line in lines]


In [17]:
class Vocab:

    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        counter = Vocab.count_corpus(tokens)
        # 对词频率排序
        self.__token_freqs = sorted(counter.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        self.index_to_token = ['<unk>'] + reserved_tokens
        self.token_to_index = {
            token: idx
            for idx, token in enumerate(self.index_to_token)
        }
        for token, freq in self.__token_freqs:
            if freq >= min_freq and token not in self.token_to_index:
                self.index_to_token.append(token)
                self.token_to_index[token] = len(self.index_to_token) - 1

    def __len__(self):
        return len(self.index_to_token)

    def get_tokens(self, indicates):
        if not isinstance(indicates, (list, tuple)):
            return self.index_to_token[indicates]
        return ''.join([self.get_tokens(index) for index in indicates])
    

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self.__token_freqs

    @staticmethod
    def count_corpus(tokens):
        if isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        from collections import Counter
        return Counter(tokens)

In [18]:
tokens = tokenize(lines)
vocab = Vocab(tokens)

In [19]:
vocab.token_freqs

[('the', 2477),
 ('and', 1312),
 ('of', 1286),
 ('i', 1268),
 ('a', 877),
 ('to', 766),
 ('in', 606),
 ('was', 554),
 ('that', 458),
 ('it', 452),
 ('my', 441),
 ('had', 354),
 ('as', 281),
 ('me', 281),
 ('with', 264),
 ('at', 257),
 ('for', 247),
 ('you', 212),
 ('time', 211),
 ('but', 209),
 ('this', 199),
 ('or', 162),
 ('were', 158),
 ('on', 148),
 ('not', 142),
 ('from', 137),
 ('all', 136),
 ('then', 134),
 ('is', 129),
 ('have', 129),
 ('his', 129),
 ('there', 128),
 ('by', 126),
 ('he', 126),
 ('they', 124),
 ('one', 120),
 ('upon', 115),
 ('so', 114),
 ('into', 114),
 ('little', 114),
 ('be', 112),
 ('came', 107),
 ('no', 102),
 ('gutenberg', 98),
 ('some', 95),
 ('machine', 93),
 ('could', 93),
 ('an', 92),
 ('which', 92),
 ('we', 91),
 ('their', 91),
 ('said', 89),
 ('project', 88),
 ('saw', 88),
 ('down', 87),
 ('s', 86),
 ('very', 86),
 ('them', 86),
 ('now', 79),
 ('what', 78),
 ('these', 77),
 ('about', 77),
 ('any', 75),
 ('been', 75),
 ('her', 75),
 ('up', 74),
 ('out

In [20]:
tokens[666], vocab[tokens[666]]

(['that',
  'i',
  'noticed',
  'for',
  'the',
  'first',
  'time',
  'how',
  'warm',
  'the',
  'air',
  'was'],
 [9, 4, 518, 17, 1, 98, 19, 104, 698, 1, 199, 8])

In [21]:
import tensorflow as tf


def load_corpus():
    lines = read_data()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line]
    return corpus, vocab


def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """使用随机抽样生成一个小批量子序列"""
    # 从随机偏移量开始对序列进行分区，随机范围包括num_steps-1
    import random
    corpus = corpus[random.randint(0, num_steps - 1):]
    # 减去1，是因为我们需要考虑标签
    num_subseqs = (len(corpus) - 1) // num_steps
    # 长度为num_steps的子序列的起始索引
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # 在随机抽样的迭代过程中，
    # 来自两个相邻的、随机的、小批量中的子序列不一定在原始序列上相邻
    random.shuffle(initial_indices)

    def data(pos):
        # 返回从pos位置开始的长度为num_steps的序列
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # 在这里，initial_indices包含子序列的随机起始索引
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield tf.constant(X), tf.constant(Y)


In [22]:
corpus, vocab = load_corpus()
BATCH_SIZE = 100
NUM_STEPS = 60
train_iter = seq_data_iter_random(corpus=corpus,
                                  batch_size=BATCH_SIZE,
                                  num_steps=NUM_STEPS)


In [24]:
class RNN:

    def __init__(self, vocab_size, hiddens, batch_size):
        # rnn模型输入和输出单元是一样的，都是词表的大小
        self.vocab_size = vocab_size
        self.hiddens = hiddens
        self.batch_size = batch_size
        inputs = outputs = vocab_size

        def normal(shape):
            return tf.random.normal(shape=shape,
                                    stddev=.01,
                                    mean=0,
                                    dtype='float32')

        # 隐藏层参数
        self.w_xh = tf.Variable(normal((inputs, hiddens)), dtype='float32')
        self.w_hh = tf.Variable(normal((hiddens, hiddens)), dtype='float32')
        self.b_h = tf.Variable(normal((hiddens, )), dtype='float32')
        # 输出层参数
        self.w_hq = tf.Variable(normal((hiddens, outputs)), dtype='float32')
        self.b_q = tf.Variable(normal((outputs, )), dtype='float32')
        self.params = [self.w_xh, self.w_hh, self.b_h, self.w_hq, self.b_q]

    def init_state(self, batch_size):
        return (tf.zeros(shape=(batch_size, self.hiddens), dtype='float32'), )

    def __call__(self, x, state):
        x = tf.one_hot(tf.transpose(x), self.vocab_size)
        x = tf.cast(x, dtype='float32')
        y = []
        state, = state
        for x_ in x:
            x_ = tf.reshape(x_, [-1, self.w_xh.shape[0]])
            # state = tf.nn.relu(tf.matmul(x_, self.w_xh) + tf.matmul(state, self.w_hh) + self.b_h)
            state = tf.tanh(
                tf.matmul(x_, self.w_xh) + tf.matmul(state, self.w_hh) +
                self.b_h)
            y_ = tf.matmul(state, self.w_hq) + self.b_q
            y.append(y_)
        return tf.concat(y, axis=0), (state, )

    def predict(self, x, n_pred, vocab: Vocab):
        pred_state = self.init_state(batch_size=1)
        y = [vocab[x[0]]]
        # 先预热state，其实就是记录x的隐藏状态
        for y_ in x[1:]:
            _, pred_state = self.__call__(
                tf.reshape(tf.constant(y[-1]), (1, 1)).numpy(), pred_state)
            y.append(vocab[y_])
        # 开始利用x的隐藏状态进行预测
        for _ in range(n_pred):
            y_, pred_state = self.__call__(
                tf.reshape(tf.constant(y[-1]), (1, 1)).numpy(), pred_state)
            y.append(y_.numpy().argmax(axis=1)[0])
        print(y)
        return ''.join([vocab.index_to_token[c] for c in y])

    def gradient_clip(self, grads, theta):
        """
            梯度裁剪，防止梯度爆炸问题
        """
        theta = tf.constant(theta, dtype="float32")
        new_grads = []
        for grad in grads:
            new_grads.append(
                tf.convert_to_tensor(grad) if isinstance(
                    grad, tf.IndexedSlices) else grad)
        # L2范数
        norm = tf.math.sqrt(
            sum((tf.reduce_sum(grad**2).numpy() for grad in new_grads)))
        norm = tf.cast(norm, "float32")
        if tf.greater(norm, theta):
            for i, grad in enumerate(new_grads):
                new_grads[i] = grad * theta / norm
        return new_grads
    
    def fit(self, train_iter, strategy, epochs=10, lr=1e-3):
        with strategy.scope():
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
        for epoch in range(epochs):
            for x, y in train_iter():
                state = self.init_state(x.shape[0])
                with tf.GradientTape(persistent=True) as gt:
                    # 向前计算
                    y_hat, state = self.__call__(x, state)
                    # y_hat = tf.nn.softmax(y_hat)
                    y = tf.reshape(tf.transpose(y), (-1))
                    # 损失计算
                    l = loss_fn(y, y_hat)
                grads = gt.gradient(l, self.params)
                grads = self.gradient_clip(grads, 1)
                # print(tf.reduce_sum(grads[0]))
                optimizer.apply_gradients(zip(grads, self.params))
            print("epoch[%d\%d loss: %f" % (epoch + 1, epochs, tf.reduce_mean(l)))


strategy = tf.distribute.OneDeviceStrategy('/gpu:0')
with strategy.scope():
    net = RNN(len(vocab), 512, BATCH_SIZE)
def get_train_iter():
    return seq_data_iter_random(corpus=corpus,
                                  batch_size=BATCH_SIZE,
                                  num_steps=NUM_STEPS)
net.fit(train_iter=get_train_iter, strategy=strategy, epochs=200, lr=1e-3)
net.predict('the time machine', 50, vocab)

epoch[1\200 loss: 2.796890
epoch[2\200 loss: 2.591876
epoch[3\200 loss: 2.472374
epoch[4\200 loss: 2.336780
epoch[5\200 loss: 2.297394
epoch[6\200 loss: 2.267097
epoch[7\200 loss: 2.213108
epoch[8\200 loss: 2.151151
epoch[9\200 loss: 2.151937
epoch[10\200 loss: 2.182924
epoch[11\200 loss: 2.168819
epoch[12\200 loss: 2.159393
epoch[13\200 loss: 2.135829
epoch[14\200 loss: 2.116175
epoch[15\200 loss: 2.106505
epoch[16\200 loss: 2.102724
epoch[17\200 loss: 2.091781
epoch[18\200 loss: 2.043946
epoch[19\200 loss: 2.084870
epoch[20\200 loss: 2.016462
epoch[21\200 loss: 2.015875
epoch[22\200 loss: 2.030103
epoch[23\200 loss: 1.992298
epoch[24\200 loss: 1.979326
epoch[25\200 loss: 1.975075
epoch[26\200 loss: 1.916403
epoch[27\200 loss: 1.922043
epoch[28\200 loss: 1.964500
epoch[29\200 loss: 1.904190
epoch[30\200 loss: 1.882347
epoch[31\200 loss: 1.844725
epoch[32\200 loss: 1.852497
epoch[33\200 loss: 1.840168
epoch[34\200 loss: 1.827344
epoch[35\200 loss: 1.809529
epoch[36\200 loss: 1.792881
e

'the time machine and such others lives and have manqueemmittelt th'

In [26]:
net.predict("fyang", 1024, vocab)

[16, 19, 4, 7, 18, 12, 2, 4, 3, 5, 13, 2, 1, 3, 5, 13, 2, 1, 5, 7, 1, 3, 10, 2, 1, 4, 7, 11, 1, 3, 6, 1, 12, 2, 4, 9, 7, 1, 6, 22, 2, 9, 1, 12, 4, 7, 18, 2, 9, 1, 17, 4, 8, 1, 5, 12, 12, 1, 15, 6, 7, 3, 2, 5, 7, 3, 12, 19, 1, 3, 10, 2, 1, 2, 4, 9, 3, 10, 1, 8, 2, 2, 1, 5, 7, 1, 4, 1, 20, 4, 5, 7, 1, 6, 7, 1, 3, 10, 2, 1, 2, 22, 2, 7, 1, 7, 6, 17, 17, 10, 4, 3, 1, 3, 10, 2, 1, 3, 10, 5, 7, 18, 1, 5, 1, 10, 4, 22, 2, 1, 8, 4, 5, 11, 1, 10, 4, 22, 2, 1, 3, 10, 2, 13, 1, 5, 7, 1, 5, 3, 8, 3, 9, 4, 7, 18, 2, 1, 21, 2, 2, 7, 4, 7, 1, 12, 5, 3, 3, 12, 2, 1, 3, 5, 13, 2, 1, 5, 1, 8, 4, 17, 1, 3, 10, 2, 13, 2, 11, 11, 5, 7, 18, 1, 4, 7, 11, 1, 3, 10, 2, 1, 3, 10, 5, 7, 18, 1, 17, 4, 8, 1, 7, 6, 3, 1, 14, 7, 12, 5, 23, 2, 1, 4, 1, 27, 14, 2, 8, 3, 5, 6, 7, 1, 4, 11, 6, 7, 1, 3, 10, 2, 1, 6, 3, 10, 2, 9, 1, 3, 10, 2, 1, 17, 6, 9, 23, 1, 5, 7, 1, 4, 7, 19, 1, 5, 13, 20, 9, 2, 8, 8, 5, 6, 7, 1, 6, 16, 1, 13, 4, 7, 19, 2, 12, 11, 6, 8, 3, 1, 20, 6, 9, 14, 8, 3, 1, 17, 4, 8, 1, 16, 12, 4, 13, 2, 1, 5

'fyangleatime time in the and to learn over langer was ill conteintly the earth see in a pain on the even nowwhat the thing i have said have them in itstrange beenan little time i saw themedding and the thing was not unlike a question adon the other the work in any impression of manyeldost porust was flame in the doorway as if he had beendazzled by the light then he coming sapertare was the only teedfut and the same pishent to pretaydinaperfection absuld this perfectly sand there were no sack to white marking as the morlocks pooked against the hail in a turned onl the gallery was in fact an the three place had beenassatised medical man are you pain and the thing as to understand the gallery was the new sons gond the full project gutenberg tm work and creature perhaps a little stopping was bofely atrictly of the distance freedod sounds to she well of an anemotion and the firing projectgutenberg tm electronic works in yourpossession of me and so i noticed that the projectgutenberg litera