In [1]:
import tensorflow as tf
import numpy as np
# Show which TensorFlow version this notebook is running against.
print(tf.version.VERSION)

In [2]:
# Detect the TPU, connect this runtime to it and initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Instantiate the distribution strategy used to build and train the model.
# Prefer the stable `tf.distribute.TPUStrategy`; fall back to the deprecated
# `tf.distribute.experimental.TPUStrategy` on TensorFlow releases that do not
# ship the stable class yet.
_tpu_strategy_cls = (
    getattr(tf.distribute, "TPUStrategy", None)
    or tf.distribute.experimental.TPUStrategy
)
tpu_strategy = _tpu_strategy_cls(tpu)

In [3]:
def _read_utf8(path):
    """Return the full contents of the UTF-8 text file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the read finishes, instead of staying open until garbage collection.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Load the eight source texts; each one stays available under its own name.
text1 = _read_utf8('../input/mydata/Crimson Sabre.txt')
text2 = _read_utf8('../input/mydata/Island of No return-Xia Ko Shin-Hap Kak Hang.txt')
text3 = _read_utf8('../input/mydata/SheDioYiXiongChuan.txt')
text4 = _read_utf8('../input/mydata/TeinLongBaBu.txt')
text5 = _read_utf8('../input/mydata/The Flying Fox of Snowy Mountain.txt')
text6 = _read_utf8('../input/mydata/White Horse Neighing in the West Wind.txt')
text7 = _read_utf8('../input/mydata/XIaoaoginhu.txt')
text8 = _read_utf8('../input/mydata/yiteintulonggy.txt')

# Concatenate the texts, in file order, into the single training corpus.
text = text1 + text2 + text3 + text4 + text5 + text6 + text7 + text8

In [4]:
# Character-level tokenizer fitted on the whole corpus.
# No `num_words` cap is passed: Keras numbers tokens from 1 (0 is reserved) and
# `texts_to_sequences` only keeps ids strictly below `num_words`, so capping at
# the number of distinct characters would silently drop the rarest character.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, filters='')
# Fit on a one-element list so the corpus is processed as one document, the
# same way it is later encoded with `texts_to_sequences([text])`.
tokenizer.fit_on_texts([text])

# Vocabulary size used for the embedding input and the output layer:
# every learned token id (1..len(word_index)) plus the reserved id 0.
w = len(tokenizer.word_index) + 1


In [5]:
# Kept small for ease of explanation; in practice a larger value would let the
# model predict the next Chinese character from a longer sequence.
SEQ_LENGTH = 10   # length of each numeric input sequence
BATCH_SIZE = 128  # number of input/output pairs per batch

# Encode the corpus as token ids; the single-document input gives back a
# one-element list, which is unpacked here.
(text_as_int,) = tokenizer.texts_to_sequences([text])

# Wrap the ids in a tf.data.Dataset via from_tensor_slices.
characters = tf.data.Dataset.from_tensor_slices(text_as_int)
print(type(characters))

# Cut the id stream into chunks of SEQ_LENGTH + 1 ids each and discard the
# trailing chunk that is too short.
chunk_size = SEQ_LENGTH + 1
sequences = characters.batch(batch_size=chunk_size, drop_remainder=True)
print(sequences)

In [6]:
# Number of (input, target) sequence pairs the corpus yields.
# The corpus is cut into chunks of SEQ_LENGTH + 1 tokens (SEQ_LENGTH inputs
# plus the one-step-shifted targets), so the divisor must be SEQ_LENGTH + 1,
# not SEQ_LENGTH. Example: 4,146,323 tokens with SEQ_LENGTH = 10 give
# 4146323 // 11 = 376,938 pairs.
# Despite its name, this value counts sequences rather than training steps;
# it is used as the shuffle buffer size when the dataset is built.
steps_per_epoch = len(text_as_int) // (SEQ_LENGTH + 1)
print("成對輸入輸出數量:", steps_per_epoch)

# Helper that splits one sequence into an input sequence and a target sequence.
def build_seq_pairs(chunk):
    """Split ``chunk`` into an (input, target) pair for next-element prediction.

    Returns a 2-tuple: ``chunk`` without its last element, and ``chunk``
    without its first element, so element ``i`` of the target is the element
    that follows element ``i`` of the input.
    """
    return chunk[:-1], chunk[1:]
# Apply build_seq_pairs to every chunk cut from the text, turning each chunk
# into an (input, target) pair of id sequences.
pairs = sequences.map(build_seq_pairs)

# Show the mapped dataset and its first element. Both are read from `pairs`
# (before shuffling) so that peeking at one element does not require filling
# the shuffle buffer first.
print("ds.map:", pairs)
for first_input, first_target in pairs.take(1):
    print("ds.map 取第一個值：", first_input.numpy(), first_target.numpy())

# ds is the data consumed by training: pairs are shuffled, grouped into
# batches of BATCH_SIZE (incomplete final batch dropped so every batch has the
# same shape), and prefetched so input preparation overlaps with training.
ds = (
    pairs
    .shuffle(steps_per_epoch)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [7]:
# Hyperparameters
EMBEDDING_DIM = 512  # output_dim of the Embedding layer
RNN_UNITS = 1024  # units of the LSTM layer
LEARNING_RATE = 0.001  # learning_rate of the Adam optimizer

In [8]:
# Instantiating the model inside the strategy scope creates the model on the TPU.
with tpu_strategy.scope():
    model = tf.keras.Sequential()

    # Embedding layer: turns each token id into an EMBEDDING_DIM-sized vector.
    model.add(tf.keras.layers.Embedding(input_dim=w, output_dim=EMBEDDING_DIM, batch_input_shape=[BATCH_SIZE, None]))

    # LSTM layer: reads the sequence in order and emits an output per step
    # (return_sequences=True). The GPU version of this notebook used
    # stateful=True; for the TPU run it is disabled. Pass an explicit boolean
    # rather than an empty string, which only worked because it is falsy.
    model.add(tf.keras.layers.LSTM(units=RNN_UNITS, return_sequences=True, stateful=False, recurrent_initializer='glorot_uniform'))

    # Output layer: one logit per vocabulary entry for every time step.
    model.add(tf.keras.layers.Dense(w))

    def loss(y_true, y_pred):
        """Sparse categorical cross-entropy between integer targets and logits.

        ``y_pred`` is treated as raw logits (``from_logits=True``) because the
        final Dense layer has no activation.
        """
        return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss=loss)

In [9]:
# Number of passes over the training dataset.
EPOCHS = 200

# Train on the tf.data dataset built earlier and keep the returned history.
history = model.fit(ds, epochs=EPOCHS)

In [11]:
# Write the trained model to disk under a file name ending in .h5.
MODEL_PATH = "myModel_8books_10Words_200Epochs.h5"
model.save(MODEL_PATH)