In [1]:
def read_data(path='./manghuangji.txt', start=10000, end=30000, encoding='utf-8'):
    """Load a slice of a text file and return it as a flat list of characters.

    Args:
        path: Text file to read. Defaults to the novel used by this notebook.
        start: Index of the first line to keep (normal slice semantics).
        end: Index one past the last line to keep; ``None`` keeps everything
            up to the end of the file.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        A list with one single-character string per character of the selected
        lines, line terminators included. The list is empty when the file has
        fewer than ``start`` lines.
    """
    with open(path, 'r', encoding=encoding) as txt:
        lines = txt.readlines()
    # Flatten the selected lines into one character stream.
    return [ch for line in lines[start:end] for ch in line]


In [2]:
# Load the text excerpt; read_data returns a flat list of single characters.
lines = read_data()

In [3]:
# Number of characters loaded: read_data returns one list entry per character.
len(lines)

434516

In [4]:
def tokenize(lines, token='word'):
    """Split every string in ``lines`` into a list of tokens.

    Args:
        lines: Iterable of strings.
        token: ``'char'`` splits each string into its individual characters;
            any other value splits on whitespace via ``str.split()``.

    Returns:
        A list holding one token list per input string.
    """
    if token == 'char':
        return [list(text) for text in lines]
    return [text.split() for text in lines]


In [5]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always the unknown token ``'<unk>'``. Reserved tokens follow,
    then corpus tokens in order of decreasing frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists. ``None`` or an
                empty list yields a vocabulary holding only ``'<unk>'`` and
                the reserved tokens.
            min_freq: Tokens occurring fewer than ``min_freq`` times are left
                out and therefore map to ``'<unk>'``.
            reserved_tokens: Tokens placed right after ``'<unk>'`` regardless
                of their frequency.
        """
        # None defaults avoid sharing one mutable default list between calls.
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else list(reserved_tokens)
        counter = Vocab.count_corpus(tokens)
        # Sort by frequency, most frequent first (ties keep first-seen order).
        self.__token_freqs = sorted(counter.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        self.index_to_token = ['<unk>'] + reserved_tokens
        self.token_to_index = {
            token: idx
            for idx, token in enumerate(self.index_to_token)
        }
        for token, freq in self.__token_freqs:
            if freq >= min_freq and token not in self.token_to_index:
                self.index_to_token.append(token)
                self.token_to_index[token] = len(self.index_to_token) - 1

    def __len__(self):
        """Return the number of entries, ``'<unk>'`` and reserved tokens included."""
        return len(self.index_to_token)

    def get_tokens(self, indicates):
        """Map an index to its token, or a list/tuple of indices to the joined string."""
        if not isinstance(indicates, (list, tuple)):
            return self.index_to_token[indicates]
        return ''.join([self.get_tokens(index) for index in indicates])

    def __getitem__(self, tokens):
        """Map a token to its index, or a list/tuple of tokens to a list of indices.

        Tokens that are not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, count)`` pairs sorted by decreasing count."""
        return self.__token_freqs

    @staticmethod
    def count_corpus(tokens):
        """Count token occurrences in a flat list or a list of token lists.

        An empty input returns an empty counter instead of raising on
        ``tokens[0]``.
        """
        from collections import Counter
        # Only the first element is inspected to decide whether to flatten.
        if len(tokens) > 0 and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        return Counter(tokens)

In [6]:
# `lines` holds single characters, so the default 'word' mode calls str.split()
# on each one: whitespace characters become empty token lists and therefore
# never enter this vocabulary.
tokens = tokenize(lines)
vocab = Vocab(tokens)

In [7]:
# (token, count) pairs, most frequent first.
vocab.token_freqs

[('，', 17087),
 ('。', 12933),
 ('的', 9091),
 ('一', 7390),
 ('“', 6262),
 ('”', 6255),
 ('纪', 5709),
 ('是', 5676),
 ('了', 5089),
 ('宁', 4096),
 ('道', 3462),
 ('这', 3192),
 ('人', 3142),
 ('不', 2852),
 ('…', 2829),
 ('大', 2631),
 ('在', 2628),
 ('我', 2594),
 ('有', 2522),
 ('着', 2510),
 ('子', 2345),
 ('！', 2262),
 ('也', 2259),
 ('他', 2228),
 ('个', 2181),
 ('都', 2001),
 ('剑', 1937),
 ('就', 1930),
 ('可', 1897),
 ('那', 1867),
 ('来', 1826),
 ('中', 1811),
 ('到', 1719),
 ('你', 1680),
 ('看', 1614),
 ('们', 1585),
 ('山', 1507),
 ('出', 1505),
 ('上', 1495),
 ('？', 1472),
 ('法', 1446),
 ('之', 1395),
 ('能', 1382),
 ('黑', 1375),
 ('天', 1361),
 ('神', 1264),
 ('然', 1259),
 ('阵', 1237),
 ('府', 1228),
 ('白', 1215),
 ('下', 1191),
 ('要', 1145),
 ('真', 1143),
 ('去', 1139),
 ('自', 1133),
 ('头', 1117),
 ('时', 1089),
 ('师', 1076),
 ('力', 1050),
 ('龙', 1037),
 ('些', 1021),
 ('过', 1019),
 ('地', 962),
 ('修', 961),
 ('百', 957),
 ('前', 944),
 ('仙', 939),
 ('、', 934),
 ('便', 927),
 ('而', 917),
 ('如', 917),
 ('还', 896),


In [8]:
import tensorflow as tf


def load_corpus():
    """Read the text, build a character vocabulary and encode the text as indices.

    Returns:
        A ``(corpus, vocab)`` pair: ``corpus`` is a flat list of vocabulary
        indices, one per character token, and ``vocab`` is the ``Vocab`` built
        from those tokens.
    """
    char_tokens = tokenize(read_data(), 'char')
    vocab = Vocab(char_tokens)
    corpus = []
    for line in char_tokens:
        # Indexing the vocabulary with a list returns the list of indices.
        corpus.extend(vocab[line])
    return corpus, vocab


def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """Yield minibatches of subsequences drawn by random sampling.

    Args:
        corpus: Sequence of token indices.
        batch_size: Number of subsequences per minibatch.
        num_steps: Length of each subsequence.

    Yields:
        ``(X, Y)`` pairs of ``tf.constant`` values, each built from
        ``batch_size`` subsequences of length ``num_steps``. ``Y`` holds the
        same windows as ``X`` shifted one position to the right.
    """
    import random

    # Drop a random prefix of 0..num_steps-1 items so that the partition into
    # subsequences differs from call to call.
    offset = random.randint(0, num_steps - 1)
    corpus = corpus[offset:]
    # One item is held back because every input needs a label one step ahead.
    num_subseqs = (len(corpus) - 1) // num_steps
    # Start index of each length-num_steps subsequence.
    starts = [k * num_steps for k in range(num_subseqs)]
    # After shuffling, subsequences that share a minibatch are not necessarily
    # adjacent in the original sequence.
    random.shuffle(starts)

    def window(pos):
        # Subsequence of length num_steps beginning at pos.
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    for batch_no in range(num_batches):
        # Random start indices that make up this minibatch.
        batch_starts = starts[batch_no * batch_size:(batch_no + 1) * batch_size]
        X = [window(begin) for begin in batch_starts]
        Y = [window(begin + 1) for begin in batch_starts]
        yield tf.constant(X), tf.constant(Y)


In [9]:
# Encode the excerpt as token indices and build the matching vocabulary.
corpus, vocab = load_corpus()
# Number of subsequences per minibatch.
BATCH_SIZE = 200
# Number of tokens per subsequence.
NUM_STEPS = 256
# NOTE(review): seq_data_iter_random is a generator function, so this object
# can be iterated only once. Training passes a factory function to fit()
# rather than this object — confirm whether `train_iter` is still needed.
train_iter = seq_data_iter_random(corpus=corpus,
                                  batch_size=BATCH_SIZE,
                                  num_steps=NUM_STEPS)


In [10]:
class RNN:
    """Single-layer tanh RNN written directly with TensorFlow ops.

    Inputs are token indices that get one-hot encoded to ``vocab_size``
    features; outputs are unnormalised logits over the same vocabulary.
    """

    def __init__(self, vocab_size, hiddens, batch_size):
        """Create the trainable parameters.

        Args:
            vocab_size: Vocabulary size; used as both input and output width.
            hiddens: Number of hidden units.
            batch_size: Stored on the instance; the methods of this class take
                the batch size from their inputs instead.
        """
        # The model's input and output widths are both the vocabulary size.
        self.vocab_size = vocab_size
        self.hiddens = hiddens
        self.batch_size = batch_size
        inputs = outputs = vocab_size

        def normal(shape):
            return tf.random.normal(shape=shape,
                                    stddev=.01,
                                    mean=0,
                                    dtype='float32')

        # Hidden-layer parameters.
        self.w_xh = tf.Variable(normal((inputs, hiddens)), dtype='float32')
        self.w_hh = tf.Variable(normal((hiddens, hiddens)), dtype='float32')
        self.b_h = tf.Variable(normal((hiddens, )), dtype='float32')
        # Output-layer parameters.
        self.w_hq = tf.Variable(normal((hiddens, outputs)), dtype='float32')
        self.b_q = tf.Variable(normal((outputs, )), dtype='float32')
        self.params = [self.w_xh, self.w_hh, self.b_h, self.w_hq, self.b_q]

    def init_state(self, batch_size):
        """Return the initial state: a 1-tuple holding a zero ``(batch_size, hiddens)`` tensor."""
        return (tf.zeros(shape=(batch_size, self.hiddens), dtype='float32'), )

    def __call__(self, x, state):
        """Run the RNN over a batch of index sequences.

        Args:
            x: Token indices laid out as ``(batch, steps)``; the input is
                transposed so that the loop below walks over time steps.
            state: 1-tuple holding the hidden state, as returned by
                ``init_state``.

        Returns:
            ``(logits, state)``. ``logits`` is the per-step output concatenated
            along axis 0 (step-major order), and ``state`` is a 1-tuple with
            the hidden state after the last step.
        """
        x = tf.one_hot(tf.transpose(x), self.vocab_size)
        x = tf.cast(x, dtype='float32')
        outputs = []
        state, = state
        for x_ in x:
            x_ = tf.reshape(x_, [-1, self.w_xh.shape[0]])
            state = tf.tanh(
                tf.matmul(x_, self.w_xh) + tf.matmul(state, self.w_hh) +
                self.b_h)
            outputs.append(tf.matmul(state, self.w_hq) + self.b_q)
        return tf.concat(outputs, axis=0), (state, )

    def predict(self, x, n_pred, vocab: Vocab):
        """Greedily generate ``n_pred`` tokens that continue the prefix ``x``.

        Args:
            x: Non-empty sequence of prefix tokens (for example a string).
            n_pred: Number of tokens to generate after the prefix.
            vocab: Vocabulary used to encode the prefix and decode the result.

        Returns:
            The prefix followed by the generated tokens, joined into one string.
        """

        def as_batch(index):
            # A single token index shaped as a (batch=1, steps=1) input.
            return tf.reshape(tf.constant(index), (1, 1)).numpy()

        pred_state = self.init_state(batch_size=1)
        y = [vocab[x[0]]]
        # Warm-up: feed the prefix through the network to build up the hidden
        # state; the outputs are discarded.
        for token in x[1:]:
            _, pred_state = self(as_batch(y[-1]), pred_state)
            y.append(vocab[token])
        # Generation: feed the last token back in and keep the arg-max token.
        for _ in range(n_pred):
            logits, pred_state = self(as_batch(y[-1]), pred_state)
            y.append(int(logits.numpy().argmax(axis=1)[0]))
        return ''.join([vocab.index_to_token[c] for c in y])

    def gradient_clip(self, grads, theta):
        """Clip gradients by global L2 norm to guard against exploding gradients.

        Args:
            grads: Gradients, one per parameter; ``tf.IndexedSlices`` entries
                are converted to dense tensors.
            theta: Maximum allowed global norm.

        Returns:
            A list of gradients, rescaled by ``theta / norm`` when the global
            norm exceeds ``theta`` and otherwise unchanged.
        """
        theta = tf.constant(theta, dtype="float32")
        new_grads = [
            tf.convert_to_tensor(grad)
            if isinstance(grad, tf.IndexedSlices) else grad for grad in grads
        ]
        # Global L2 norm across all gradients, accumulated with TensorFlow ops
        # instead of a NumPy round-trip per gradient.
        norm = tf.math.sqrt(
            tf.add_n([tf.reduce_sum(grad**2) for grad in new_grads]))
        norm = tf.cast(norm, "float32")
        if tf.greater(norm, theta):
            new_grads = [grad * theta / norm for grad in new_grads]
        return new_grads

    def fit(self, train_iter, strategy, epochs=10, lr=1e-3):
        """Train with RMSprop and sparse categorical cross-entropy on logits.

        Args:
            train_iter: Zero-argument callable returning an iterable of
                ``(x, y)`` batches; it is called once per epoch.
            strategy: Distribution strategy whose scope is used to create the
                loss and the optimizer.
            epochs: Number of passes over the data.
            lr: Learning rate.
        """
        with strategy.scope():
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
        for epoch in range(epochs):
            loss = None
            for x, y in train_iter():
                # The hidden state is reset for every batch.
                state = self.init_state(x.shape[0])
                # One gradient call per step, so a non-persistent tape is
                # enough and its resources are released right after use.
                with tf.GradientTape() as tape:
                    # Forward pass.
                    y_hat, state = self(x, state)
                    # Flatten the labels step-major to match the logits.
                    labels = tf.reshape(tf.transpose(y), [-1])
                    # Loss.
                    loss = loss_fn(labels, y_hat)
                grads = tape.gradient(loss, self.params)
                grads = self.gradient_clip(grads, 1)
                optimizer.apply_gradients(zip(grads, self.params))
            if loss is None:
                # The iterator produced nothing, so there is no loss to report.
                print("epoch[%d/%d] no batches produced" % (epoch + 1, epochs))
                continue
            # Reports the loss of the last batch of the epoch.
            print("epoch[%d/%d] loss: %f" %
                  (epoch + 1, epochs, float(tf.reduce_mean(loss))))


# Place the model's variables on the first GPU.
# NOTE(review): '/gpu:0' is hard-coded — confirm a GPU is available before running.
strategy = tf.distribute.OneDeviceStrategy('/gpu:0')
with strategy.scope():
    net = RNN(len(vocab), 512, BATCH_SIZE)
def get_train_iter():
    """Return a fresh random-sampling batch generator over the global corpus."""
    return seq_data_iter_random(corpus=corpus,
                                  batch_size=BATCH_SIZE,
                                  num_steps=NUM_STEPS)
# fit() receives the factory rather than a generator because it calls
# train_iter() at the start of every epoch.
net.fit(train_iter=get_train_iter, strategy=strategy, epochs=500, lr=1e-3)
# Generate 50 tokens after an English prompt. Any prompt character missing
# from the vocabulary is encoded as index 0 ('<unk>').
net.predict('the time machine', 50, vocab)

2022-04-05 11:25:02.007187: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-05 11:25:02.027598: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-05 11:25:02.027702: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-05 11:25:02.028188: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

epoch[1\500 loss: 5.823314
epoch[2\500 loss: 5.786952
epoch[3\500 loss: 5.774580
epoch[4\500 loss: 5.774802
epoch[5\500 loss: 5.764059
epoch[6\500 loss: 5.711180
epoch[7\500 loss: 5.724894
epoch[8\500 loss: 5.665903
epoch[9\500 loss: 5.611476
epoch[10\500 loss: 5.532140
epoch[11\500 loss: 5.480793
epoch[12\500 loss: 5.501227
epoch[13\500 loss: 5.350513
epoch[14\500 loss: 5.344835
epoch[15\500 loss: 5.200646
epoch[16\500 loss: 5.005244
epoch[17\500 loss: 4.883674
epoch[18\500 loss: 4.754484
epoch[19\500 loss: 4.597704
epoch[20\500 loss: 4.559920
epoch[21\500 loss: 4.460526
epoch[22\500 loss: 4.348105
epoch[23\500 loss: 4.217956
epoch[24\500 loss: 4.143278
epoch[25\500 loss: 4.091426
epoch[26\500 loss: 3.988057
epoch[27\500 loss: 3.999445
epoch[28\500 loss: 3.841330
epoch[29\500 loss: 3.830931
epoch[30\500 loss: 3.754501
epoch[31\500 loss: 3.722875
epoch[32\500 loss: 3.725136
epoch[33\500 loss: 3.659016
epoch[34\500 loss: 3.565901
epoch[35\500 loss: 3.519698
epoch[36\500 loss: 3.538991
e

'the time machine于惊人。\n\n    “公子的。”\n\n    “法宝来吧？”\n\n    “有所有法门来。”黑发中年人言'

In [12]:
# Generate 200 tokens continuing a Chinese prompt.
net.predict("纪宁是谁", 200, vocab)

'纪宁是谁在我父亲驱使法宝？\n\n    “好。”洞子启连道身，凌驾驭音速起……\n\n    “那古老水府的神魔之力在最强战神魔的才能施展。\n\n    “真人。”纪宁感觉到一旁的黑色老牛，对道：“真是很难啊。”\n\n    纪宁点头，“我也不知晓你，我这第一关路即便是我认识的却也算不得了。\n\n    “啊，你才收弟子明显，擅长岁月来也能让大己早的命力才会好。”\n\n    声音直接传说。\n\n    “族长，我也有你'