初始化參數

In [1]:
import numpy as np
import random

from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)


2024-05-26 16:02:21.766136: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:
# Hyperparameters and vocabulary layout shared by the cells of this notebook.
EPOCHS = 3  # iterations of the outer train-then-sample loop (the original inline note reads "20")
BATCH_SIZE = 128  # mini-batch size for training
MAX_WORDS = 10000  # vocabulary size: Embedding input_dim and width of the softmax output layer
READ_LINES = 60000  # read_file_combined stops once this many lines have been consumed
LAYER_SIZE = 256  # units in every LSTM layer (encoder and decoder)
EMBEDDING_WIDTH = 128  # output_dim of both Embedding layers
TEST_PERCENT = 0.2  # used as a fraction (rows * TEST_PERCENT), not a percentage
SAMPLE_SIZE = 20  # number of test rows decoded and printed after each epoch
OOV_WORD = 'UNK'  # passed to the Tokenizer as oov_token
PAD_INDEX = 0  # rendered as 'PAD' by tokens_to_words; the Embedding layers use mask_zero=True
OOV_INDEX = 1  # rendered as OOV_WORD; presumably the id Keras assigns to oov_token - verify
# tokenize() caps the Tokenizer at MAX_WORDS-2, leaving the two highest ids for these markers.
START_INDEX = MAX_WORDS - 2
STOP_INDEX = MAX_WORDS - 1
MAX_LENGTH = 60  # words kept per sentence when reading, and max decoding steps per sample
SRC_DEST_FILE_NAME = './data/fra.txt'  # tab-separated sentence-pair file

建立訓練資料處理函數

In [3]:
def read_file_combined(file_name, max_len):
    """Read tab-separated sentence pairs and split them into word sequences.

    Each line is expected to contain at least two tab-separated fields:
    field 0 becomes the destination sentence and field 1 the source sentence.
    Reading stops once READ_LINES lines have been consumed.

    Args:
        file_name: Path of the UTF-8 text file to read.
        max_len: Maximum number of words kept per sentence; extra words are
            dropped from the end.

    Returns:
        A tuple (src_word_sequences, dest_word_sequences) of equally long
        lists, each element being a list of words.
    """
    src_word_sequences = []
    dest_word_sequences = []
    # The context manager closes the file even if parsing raises midway.
    with open(file_name, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i == READ_LINES:
                break
            pair = line.split('\t')
            if len(pair) < 2:
                # Blank or malformed line: there is no sentence pair to
                # extract, so skip it instead of failing on pair[1].
                continue
            src_word_sequences.append(
                text_to_word_sequence(pair[1])[:max_len])
            dest_word_sequences.append(
                text_to_word_sequence(pair[0])[:max_len])
    return src_word_sequences, dest_word_sequences
    

In [4]:
def tokenize(sequences):
    """Fit a new Tokenizer on `sequences` and encode them with it.

    The tokenizer is limited to MAX_WORDS-2 ids and uses OOV_WORD as its
    out-of-vocabulary token.

    Returns:
        A tuple (tokenizer, token_sequences): the fitted Tokenizer and the
        integer-encoded version of `sequences`.
    """
    fitted = Tokenizer(oov_token=OOV_WORD, num_words=MAX_WORDS - 2)
    fitted.fit_on_texts(sequences)
    return fitted, fitted.texts_to_sequences(sequences)

def tokens_to_words(tokenizer, seq):
    """Print `seq` (an iterable of token ids) as a list of words.

    The reserved ids are shown symbolically ('PAD', OOV_WORD, 'START',
    'STOP'); every other id is looked up through `tokenizer`.
    Nothing is returned; the list is only printed.
    """
    def render(token):
        # Reserved ids are tested in a fixed order before the vocabulary lookup.
        if token == PAD_INDEX:
            return 'PAD'
        if token == OOV_INDEX:
            return OOV_WORD
        if token == START_INDEX:
            return 'START'
        if token == STOP_INDEX:
            return 'STOP'
        return tokenizer.sequences_to_texts([[token]])[0]

    print([render(token) for token in seq])

載入訓練資料

In [5]:
# Read the sentence pairs as word lists, truncated to MAX_LENGTH words each.
src_seq, dest_seq = read_file_combined(SRC_DEST_FILE_NAME, MAX_LENGTH)
# Source and destination get independent tokenizers (separate vocabularies).
src_tokenizer, src_token_seq = tokenize(src_seq)
dest_tokenizer, dest_token_seq = tokenize(dest_seq)


In [6]:
# Decoder target: the destination sentence followed by STOP.
dest_target_token_seq = [x + [STOP_INDEX] for x in dest_token_seq]
# Decoder input: the target (STOP included) with START prepended,
# so it is one token longer than the target.
dest_input_token_seq = [[START_INDEX] + x for x in dest_target_token_seq]

# Source uses pad_sequences' default padding side; destination is padded at the end.
src_input_data = pad_sequences(src_token_seq)
dest_input_data = pad_sequences(dest_input_token_seq, padding='post')

# Pad the target to the same width as the decoder input.
dest_target_data = pad_sequences(dest_target_token_seq, padding='post', maxlen=len(dest_input_data[0]))




In [7]:
# Spot-check one row (index 9999): source words, then destination words.
print(src_seq[9999])
print(dest_seq[9999])

['quel', 'fiasco']
['what', 'a', 'fiasco']


In [8]:
# Same row after tokenizing and padding: decoder target, decoder input, encoder input.
print(dest_target_data[9999])
print(dest_input_data[9999])
print(src_input_data[9999])

[  35    5 3807 9999    0    0    0    0    0]
[9998   35    5 3807 9999    0    0    0    0]
[   0    0    0    0    0    0    0    0    0    0    0    0  136 6226]


切割訓練測試資料集

In [23]:
# Randomly hold out TEST_PERCENT of the rows for testing; the remaining
# rows form the training set.
rows = len(src_input_data[:,0])
all_indices = list(range(rows))
test_rows = int(rows * TEST_PERCENT)
test_indices = random.sample(all_indices, test_rows)
# Membership tests against a set are O(1); scanning the list for every row
# made this step quadratic in the number of rows.
held_out_indices = set(test_indices)
train_indices = [x for x in all_indices if x not in held_out_indices]

train_src_input_data = src_input_data[train_indices]
train_dest_input_data = dest_input_data[train_indices]
train_dest_target_data = dest_target_data[train_indices]

test_src_input_data = src_input_data[test_indices]
test_dest_input_data = dest_input_data[test_indices]
test_dest_target_data = dest_target_data[test_indices]

# From here on test_indices holds positions inside the test arrays
# (0..test_rows-1), not row numbers of the full data set.
test_indices = list(range(test_rows))
# Pick SAMPLE_SIZE test rows to decode and print after each epoch.
sample_indices = random.sample(test_indices, SAMPLE_SIZE)
sample_input_data = test_src_input_data[sample_indices]
sample_target_data = test_dest_target_data[sample_indices]

建構模型

In [18]:
# Encoder model
# Takes a batch of variable-length token-id sequences.
enc_embedding_input = Input(shape=(None,))

# mask_zero=True makes the embedding treat id 0 as padding.
enc_embedding_layer = Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True)
# The first LSTM returns the full sequence so it can feed the second one;
# the second LSTM only needs to return its final output and states.
enc_layer1 = LSTM(LAYER_SIZE, return_state=True, return_sequences=True)
enc_layer2 = LSTM(LAYER_SIZE, return_state=True)

enc_embedding_layer_outputs = enc_embedding_layer(enc_embedding_input)
enc_layer1_outputs, enc_layer1_state_h, enc_layer1_state_c = enc_layer1(enc_embedding_layer_outputs) 
_, enc_layer2_state_h, enc_layer2_state_c = enc_layer2(enc_layer1_outputs)

# The encoder outputs only the h and c states of both LSTM layers.
enc_model = Model(enc_embedding_input, [enc_layer1_state_h, enc_layer1_state_c,
                                        enc_layer2_state_h, enc_layer2_state_c])

enc_model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         1280000   
                                                                 
 lstm_2 (LSTM)               [(None, None, 256),       394240    
                              (None, 256),                       
                              (None, 256)]                       
                                                                 
 lstm_3 (LSTM)               [(None, 256),             525312    
                              (None, 256),                       
                              (None, 256)]                       
                                                                 
Total params: 2,199,552
Trainable params: 2,199,552
Non-train

In [19]:
# Decoder model
# The initial h/c states of both LSTM layers are explicit inputs so that the
# decoder can be started from the encoder states and, during step-by-step
# decoding, resumed from its own previous states.
dec_layer1_state_input_h = Input(shape=(LAYER_SIZE,))
dec_layer1_state_input_c = Input(shape=(LAYER_SIZE,))
dec_layer2_state_input_h = Input(shape=(LAYER_SIZE,))
dec_layer2_state_input_c = Input(shape=(LAYER_SIZE,))
dec_embedding_input = Input(shape=(None,))

dec_embedding_layer = Embedding(output_dim=EMBEDDING_WIDTH, input_dim=MAX_WORDS, mask_zero=True)
dec_layer1 = LSTM(LAYER_SIZE, return_state=True, return_sequences=True)
dec_layer2 = LSTM(LAYER_SIZE, return_state=True, return_sequences=True)
# One softmax over the whole vocabulary per time step.
dec_layer3 = Dense(MAX_WORDS, activation='softmax')

dec_embedding_layer_output = dec_embedding_layer(dec_embedding_input)
dec_layer1_outputs, dec_layer1_state_h, dec_layer1_state_c = dec_layer1(
    dec_embedding_layer_output, initial_state=[
        dec_layer1_state_input_h, dec_layer1_state_input_c])
# Apply the second LSTM exactly once. Wrapping this call in another
# dec_layer2(...) call would run the layer a second time over its own
# output, and the returned states would no longer be those of a single step.
dec_layer2_outputs, dec_layer2_state_h, dec_layer2_state_c = dec_layer2(
    dec_layer1_outputs, initial_state=[
        dec_layer2_state_input_h, dec_layer2_state_input_c])
dec_layer3_output = dec_layer3(dec_layer2_outputs)

# Outputs: word probabilities followed by the updated states of both layers,
# in the same order as the state inputs.
dec_model = Model([dec_embedding_input, 
                   dec_layer1_state_input_h, 
                   dec_layer1_state_input_c,
                   dec_layer2_state_input_h, 
                   dec_layer2_state_input_c],
                  [dec_layer3_output, 
                   dec_layer1_state_h, dec_layer1_state_c,
                   dec_layer2_state_h, dec_layer2_state_c])
dec_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 128)    1280000     ['input_7[0][0]']                
                                                                                                  
 input_3 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 256)]        0           []                               
                                                                                            

In [21]:
# Build and compile the full training model
# The encoder's four state outputs become the decoder's initial states, so
# one model maps (source tokens, decoder input tokens) to word probabilities.
train_enc_embedding_input = Input(shape=(None, ))
train_dec_embedding_input = Input(shape=(None, ))
intermediate_state = enc_model(train_enc_embedding_input)
# Only the probabilities are needed for training; the decoder's returned
# states are discarded here.
train_dec_output, _, _, _, _ = dec_model(
    [train_dec_embedding_input] +
    intermediate_state)
training_model = Model([train_enc_embedding_input,
                        train_dec_embedding_input],
                        train_dec_output)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
# Targets are integer word ids, hence the sparse variant of the loss.
training_model.compile(loss='sparse_categorical_crossentropy',
                       optimizer=optimizer, metrics =['accuracy'])
training_model.summary()




Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_10 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_11 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 model (Functional)             [(None, 256),        2199552     ['input_10[0][0]']               
                                 (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256)]                                               

In [26]:
# Train and test the model
for i in range(EPOCHS):
    print('Epoch: ' , i)
    # Train for one epoch. BATCH_SIZE must be passed explicitly, otherwise
    # Keras falls back to its default batch size.
    history = training_model.fit(
        [train_src_input_data, train_dest_input_data], train_dest_target_data, 
        validation_data=([test_src_input_data, test_dest_input_data], test_dest_target_data), 
        batch_size=BATCH_SIZE, epochs=1)
    # Feed the test samples to the model and decode them one word at a time
    for (test_input, test_target) in zip(sample_input_data, sample_target_data):
        # Encode the source sentence once (batch of one) to get the initial states.
        x = np.reshape(test_input, (1,-1))
        last_states = enc_model.predict(x, verbose=0)
        prev_word_index = START_INDEX
        pred_seq = []
        for j in range(MAX_LENGTH):
            x = np.reshape(np.array(prev_word_index), (1,-1))
            # Star-unpacking keeps the four updated states in a local list
            # instead of overwriting the symbolic dec_layer*_state_* tensors
            # that were used to build dec_model.
            preds, *last_states = dec_model.predict([x] + last_states, verbose=0)
            # Pick the most probable word
            prev_word_index = np.asarray(preds[0][0]).argmax()
            pred_seq.append(prev_word_index)
            if prev_word_index == STOP_INDEX:
                break
        # Print the source, the reference translation and the prediction.
        tokens_to_words(src_tokenizer, test_input)
        tokens_to_words(dest_tokenizer, test_target)
        tokens_to_words(dest_tokenizer, pred_seq)
        print('\n\n')
    

Epoch:  0
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'personne', 'ne', "l'a", 'su']
['no', 'one', 'knew', 'it', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD']
['STOP']



['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'attends', 'une', 'minute']
['wait', 'a', 'minute', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
['STOP']



['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'dis', 'toujours', 'la', 'vérité']
['always', 'tell', 'the', 'truth', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD']
['STOP']



['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', "quelqu'un", 'a', 'pris', 'ma', 'place']
['someone', 'took', 'my', 'place', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD']
['STOP']



['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'tom', 'était', 'lui', 'aussi', 'excité']
['tom', 'was', 'excited', 'too', 'STOP', 'PAD', 'PAD', 'PAD', 'PAD']
['STOP']



['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',