## 1. Tải dữ liệu

In [330]:
import string
import requests
import os

class Dataset():
    def __init__(self):
        self.dataPath = None
        self.data_list = None
        self.input_sentences = []
        self.target_sentences = []

    def download(self, url):
        """
        Download dataset from url and save to file
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
            
            # Ensure the dataset directory exists
            os.makedirs('dataset', exist_ok=True)
            
            self.dataPath = 'dataset/truyenkieu.txt'
        
            with open(self.dataPath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            
            print('Downloaded dataset to ', self.dataPath)
        
        except requests.exceptions.RequestException as e:
            print('Error downloading dataset:', e)
            return
    
    def load_data(self):
        """
        Read data from file and return a list of sentences
        """
        try:
            with open(self.dataPath, 'r', encoding='utf-8') as f:
                data = f.read()
            
            # separate data sentence by sentence and remove blank sentences
            self.data_list = [line for line in data.split('\n') if line != '']
            # display 10 first sentences
            # print(data_list[:10])
            return self.data_list
        
        except FileNotFoundError:
            print('File not found. Please download the dataset first.')
            return None
    
    def clean_data(self):
        """
        Clean data
        """
        # Convert to lowercase
        self.data_list = [x.lower() for x in self.data_list]
        # print(self.data_list[:10])
        
        # Remove punctuation
        remove_punc = str.maketrans('', '', string.punctuation)
        removed_punc_text = []
        
        for sent in self.data_list:
            sentence = [w.translate(remove_punc) for w in sent.split(' ')]
            removed_punc_text.append(' '.join(sentence))
        self.data_list = removed_punc_text
        # print(self.data_list[:10])
        
        # Remove digits
        remove_digits = str.maketrans('', '', string.digits)
        removed_digits_text = []
        
        for sent in self.data_list:
            sentence = [w.translate(remove_digits) for w in sent.split(' ')]
            removed_digits_text.append(' '.join(sentence))
        self.data_list = removed_digits_text
        # print(self.data_list[:10])
        
        # Remove starting and ending whitespaces
        self.data_list = [st.strip() for st in self.data_list]
        # print(self.data_list[:10])

        # Remove … and – characters
        self.data_list = [st.replace('...', '') for st in self.data_list]
        self.data_list = [st.replace('-', '') for st in self.data_list]

        # Check to see if 2 sentences are on the same line
        for ins in self.data_list:
            if len(ins.split()) > 8:
                print(ins)
        
        return self.data_list
    
    def split_data(self):
        """
        Split data into input and output sequences
        """
        for index, seq_txt in enumerate(self.data_list):
            if index % 2 == 0:
                self.input_sentences.append(seq_txt)
            else:
                self.target_sentences.append(seq_txt)
        
        self.target_sentences = ['start ' + ts + ' end' for ts in self.target_sentences]

        return self.input_sentences, self.target_sentences

#################

dataset = Dataset()
dataset.download(url=f'https://raw.githubusercontent.com/tiensu/Natural_Language_Processing/master/Text-Generation/dataset/truyenkieu.txt')
data_list = dataset.load_data()
# print(data_list[:20])
cleaned_data = dataset.clean_data()
# print(cleaned_data[:20])
input_sentences, target_sentences = dataset.split_data()
# print(input_sentences[:20])
# print(target_sentences[:20])


Downloaded dataset to  dataset/truyenkieu.txt


In [430]:
input_sentences

['trăm năm trong cõi người ta',
 'trải qua một cuộc bể dâu',
 'lạ gì bỉ sắc tư phong',
 'cảo thơm lần giở trước đèn',
 'rằng năm gia tĩnh triều minh',
 'có nhà viên ngoại họ vương',
 'một trai con thứ rốt lòng',
 'đầu lòng hai ả tố nga',
 'mai cốt cách tuyết tinh thần',
 'vân xem trang trọng khác vời',
 'hoa cười ngọc thốt đoan trang',
 'kiều càng sắc sảo mặn mà',
 'làn thu thủy nét xuân sơn',
 'một hai nghiêng nước nghiêng thành',
 'thông minh vốn sẵn tư trời',
 'cung thương làu bậc ngũ âm',
 'khúc nhà tay lựa nên chương',
 'phong lưu rất mực hồng quần',
 'êm đềm trướng rủ màn che',
 'ngày xuân con én đưa thoi',
 'cỏ non xanh tận chân trời',
 'thanh minh trong tiết tháng ba',
 'gần xa nô nức yến anh',
 'dập dìu tài tử giai nhân',
 'ngổn ngang gò đống kéo lên',
 'tà tà bóng ngả về tây',
 'bước dần theo ngọn tiểu khê',
 'nao nao dòng nước uốn quanh',
 'sè sè nấm đất bên đàng',
 'rằng sao trong tiết thanh minh',
 'vương quan mới dẫn gần xa',
 'nổi danh tài sắc một thì',
 'kiếp hồng nhan 

In [385]:
print(input_sentences[0])

trăm năm trong cõi người ta


In [389]:
print(target_sentences[0])

start chữ tài chữ mệnh khéo là ghét nhau end


## 2. Thêm các thư viện cần thiết

In [333]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
import numpy as np

## 3. Xây dựng tokenizer

In [334]:
tokenizer = Tokenizer(filters='', oov_token='<UNK>')
tokenizer.fit_on_texts(input_sentences + target_sentences)

special_tokens = ['<start>', '<end>']
for token in special_tokens:
    if token not in tokenizer.word_index:
        index = len(tokenizer.word_index) + 1
        tokenizer.word_index[token] = index
        tokenizer.index_word[index] = token

In [335]:
input_sequences = tokenizer.texts_to_sequences(input_sentences)
target_sequences = tokenizer.texts_to_sequences(target_sentences)

In [336]:
# Thêm ký tự bắt đầu và kết thúc câu vào các target sequences
start_token = tokenizer.word_index['<start>'] if '<start>' in tokenizer.word_index else len(tokenizer.word_index) + 1
end_token = tokenizer.word_index['<end>'] if '<end>' in tokenizer.word_index else len(tokenizer.word_index) + 2

if '<start>' not in tokenizer.word_index:
    tokenizer.word_index['<start>'] = start_token
    tokenizer.index_word[start_token] = '<start>'
    
if '<end>' not in tokenizer.word_index:
    tokenizer.word_index['<end>'] = end_token
    tokenizer.index_word[end_token] = '<end>'

target_sequences = [[start_token] + seq + [end_token] for seq in target_sequences]

In [337]:
max_seq_length = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')


In [338]:
# Chuyển đổi target sequences thành array
target_sequences = np.array(target_sequences)

In [339]:
target_sequences

array([[2415,    2,  132, ...,   86,    3, 2416],
       [2415,    2,   56, ...,    8,    3, 2416],
       [2415,    2,   35, ...,  597,    3, 2416],
       ...,
       [2415,    2,   11, ...,   45,    3, 2416],
       [2415,    2,  132, ...,  130,    3, 2416],
       [2415,    2,  798, ...,  317,    3, 2416]])

In [340]:
# Xây dựng từ điển từ và số
vocab_size = len(tokenizer.word_index) + 1

In [341]:
vocab_size

2417

In [342]:
tokenizer.word_index

{'<UNK>': 1,
 'start': 2,
 'end': 3,
 'một': 4,
 'đã': 5,
 'người': 6,
 'nàng': 7,
 'lòng': 8,
 'lời': 9,
 'cho': 10,
 'cũng': 11,
 'là': 12,
 'có': 13,
 'rằng': 14,
 'ra': 15,
 'lại': 16,
 'hoa': 17,
 'tình': 18,
 'mới': 19,
 'còn': 20,
 'đâu': 21,
 'ai': 22,
 'chẳng': 23,
 'mà': 24,
 'thì': 25,
 'mình': 26,
 'biết': 27,
 'này': 28,
 'trong': 29,
 'đến': 30,
 'đường': 31,
 'nhà': 32,
 'càng': 33,
 'nào': 34,
 'trời': 35,
 'ngày': 36,
 'thân': 37,
 'như': 38,
 'khi': 39,
 'vào': 40,
 'mặt': 41,
 'sao': 42,
 'vàng': 43,
 'duyên': 44,
 'xa': 45,
 'về': 46,
 'tay': 47,
 'sinh': 48,
 'làm': 49,
 'chàng': 50,
 'thôi': 51,
 'trước': 52,
 'chi': 53,
 'thấy': 54,
 'nghe': 55,
 'những': 56,
 'sau': 57,
 'hai': 58,
 'nỗi': 59,
 'từ': 60,
 'nước': 61,
 'hương': 62,
 'nói': 63,
 'xuân': 64,
 'trông': 65,
 'hồng': 66,
 'phải': 67,
 'ta': 68,
 'con': 69,
 'thương': 70,
 'gió': 71,
 'đây': 72,
 'thế': 73,
 'tiếng': 74,
 'chưa': 75,
 'mấy': 76,
 'tơ': 77,
 'ở': 78,
 'năm': 79,
 'với': 80,
 'chút': 81,

## 4. Xây Dựng Model

In [343]:
x = input_sequences
y = np.expand_dims(target_sequences, -1)

In [344]:
max_seq_length

12

In [345]:
x.shape

(1629, 12)

In [346]:
y.shape

(1629, 12, 1)

In [347]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from keras import regularizers

In [348]:
print("Vocabulary size:", vocab_size)
print("Token indices range in input sequences:", np.min(input_sequences), np.max(input_sequences))
print("Token indices range in target sequences:", np.min(target_sequences), np.max(target_sequences))

Vocabulary size: 2417
Token indices range in input sequences: 0 2056
Token indices range in target sequences: 2 2416


In [349]:
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim = 100, input_shape = (max_sequence_len, )))

model.add(Bidirectional(LSTM(150, return_sequences = True)))

model.add(Dropout(0.2))

model.add(LSTM(100, return_sequences = True))

model.add(Dense(vocab_size, activation='softmax'))

# Compile mô hình
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

print(model.summary())


None


In [409]:
history = model.fit(x, y, epochs=50, batch_size=64)

Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1339
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1353
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1280
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1266
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 0.1292
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - loss: 0.1206
Epoch 7/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1179
Epoch 8/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1211
Epoch 9/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1188
Epoch 10/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step - loss: 0.1186

Tiến hành training

### 5. Dự đoán 10 từ tiếp theo

In [351]:
def generate_poem(input_text):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')
    
    output_sequence = [start_token]
    for _ in range(max_seq_length - 1):
        padded_output_sequence = pad_sequences([output_sequence], maxlen=max_seq_length, padding='post')
        
        predictions = model.predict([input_sequence, padded_output_sequence])
        predicted_id = np.argmax(predictions[0, len(output_sequence) - 1])
        
        if predicted_id == end_token:
            break
        
        output_sequence.append(predicted_id)
    
    # Ensure start and end tokens are removed from the output text
    output_sequence = [id for id in output_sequence if id not in [start_token, end_token]]
    
    output_text = ' '.join([tokenizer.index_word.get(id, '') for id in output_sequence])
    return output_text.strip()

Câu mồi

In [459]:
input_text = "hạ về xanh biếc trên sông"

In [460]:
output_text = generate_poem(input_text)
print(output_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
start đầu quân trâu bóng thoả năng bước ra end
