In [1]:
import numpy as np
import tensorflow as tf

Step1: 对于文本的读取以及预处理

In [2]:
## 读取text文档
f = open('peotryFromTang.txt',encoding = 'gbk')
lines = f.readlines()
f.close()

In [3]:
## 预处理一：去除字符串中间的'\n',合并同一首诗的字符串
peotry = []
item = ''
for i in range(len(lines)):
    if (lines[i] != '\n'):
        item += lines[i].replace('\n','')
    else:
        peotry.append(item)
        item = ''

peotry.remove('')

In [9]:
class Poetry:

    def __init__(self):
        self.poetry_file = 'peotryFromTang.txt'
        self.poetry_list = self._get_poetry()
        self.poetry_vectors, self.word_to_int, self.int_to_word = self._gen_poetry_vectors()
        self.batch_size = 64
        self.chunk_size = len(self.poetry_vectors) // self.batch_size

    def _get_poetry(self):
        with open(self.poetry_file, "r", encoding='gbk') as f:
            poetry_list = [line for line in f]
        return poetry_list

    def _gen_poetry_vectors(self):
        words = sorted(set(''.join(self.poetry_list)+' '))  ## 将词提取出来
        # 每一个字符分配一个索引 为后续诗词向量化做准备
        int_to_word = {i: word for i, word in enumerate(words)}  ## 正向词典
        word_to_int = {v: k for k, v in int_to_word.items()}     ## 反向词典
        to_int = lambda word: word_to_int.get(word)
        poetry_vectors = [list(map(to_int, poetry)) for poetry in self.poetry_list]
        return poetry_vectors, word_to_int, int_to_word

    def batch(self):
        # 生成器
        start = 0
        end = self.batch_size
        for _ in range(self.chunk_size):
            batches = self.poetry_vectors[start:end]
            # 输入数据 按每块数据中诗句最大长度初始化数组，缺失数据补全
            x_batch = np.full((self.batch_size, max(map(len, batches))), self.word_to_int[' '], np.int32)
            for row in range(self.batch_size): x_batch[row, :len(batches[row])] = batches[row]
            # 标签数据 根据上一个字符预测下一个字符 所以这里y_batch数据应为x_batch数据向后移一位
            y_batch = np.copy(x_batch)
            y_batch[:, :-1], y_batch[:, -1] = x_batch[:, 1:], x_batch[:, 0]
            yield x_batch, y_batch
            start += self.batch_size
            end += self.batch_size

In [17]:
if __name__ == '__main__':
    data = Poetry().batch()
    for x, y in data:
        print(x)

[[   0    1    1 ...    1    1    1]
 [ 689  639   20 ... 2513    0    1]
 [ 495  310 2440 ...   13    0    1]
 ...
 [1293 1198  206 ...   13    0    1]
 [   0    1    1 ...    1    1    1]
 [1689 1909  843 ...   13    0    1]]
[[ 329  714 2440 ...    1    1    1]
 [ 374  863  100 ...   13    0    1]
 [ 776 1436  336 ...    0    1    1]
 ...
 [   0    1    1 ...    1    1    1]
 [1049  992  960 ...   13    0    1]
 [2192 1131  996 ...   13    0    1]]
[[2240 1431  365 ...   13    0    1]
 [   0    1    1 ...    1    1    1]
 [1194  610  877 ...    1    1    1]
 ...
 [2248  639 1546 ...   13    0    1]
 [ 641 1071  143 ...    1    1    1]
 [1543  570 1982 ...    1    1    1]]
[[2138 1187 1844 ...   13    0    1]
 [ 569 1813  895 ...    1    1    1]
 [   0    1    1 ...    1    1    1]
 ...
 [1538 1250 1249 ...   13    0    1]
 [   0    1    1 ...    1    1    1]
 [ 483 1043   22 ...   13    0    1]]
[[1145  245 1197 ...   13    1    0]
 [   0    1    1 ...    1    1    1]
 [1426 1636 22

In [19]:
with open('peotryFromTang.txt', "r", encoding='gbk') as f:
    poetry_list = [line for line in f]

In [32]:
words = sorted(set(''.join(poetry_list)+' '))

In [34]:
int_to_word = {i: word for i, word in enumerate(words)}

In [36]:
word_to_int = {v: k for k, v in int_to_word.items()}

In [40]:
to_int = lambda word: word_to_int.get(word)

In [41]:
to_int

<function __main__.<lambda>(word)>

In [43]:
poetry_vectors = [list(map(to_int, poetry)) for poetry in poetry_list]

In [47]:
word_to_int.get('O')

4