# **Import modules and data file**

In [5]:
from keras.callbacks import Callback, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Dropout, Embedding, LSTM, TimeDistributed, Flatten
from keras.models import load_model, Sequential
from keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pickle import dump, load
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [4]:
# import file
def load_file(filename):
    """
    - load file from filename and return a really long string
    - file is expected to be cleaned beforehand
    """
    file = open(filename, encoding = "utf8")
    raw_text = file.read()[:20000]
    file.close()
    return raw_text


def create_dict(raw_text):
    """
    create character dictionary and mapping to indices from data file
    """
    char = sorted(set(raw_text))
    mapping = {c : i for i, c in enumerate(char)}
    vocab_size = len(mapping)
    return mapping, vocab_size

def fixed_len_seq(raw_text, seq_len):
    """
    split data into sequences with fixed lengths
    """
    sequences = list()
    for i in range(seq_len, len(raw_text)):
        seq = raw_text[i-seq_len:i+1]
        sequences.append(seq)
    return sequences

def encode(sequences, encoder):
    """
    encode sequences text into integers
    """
    result = []
    for seq in sequences:
        encoded_seq = [encoder[char] for char in seq]
        result.append(encoded_seq)
    return result
        

def create_xy(sequences, vocab_size):
    # change here to adapt expected input shape for RNN layers
    """
    convert encoded sequences into predictor & label
    """
    sequences = np.array(sequences)
    X, y = sequences[:,:-1], sequences[:,-1]
    sequences = [to_categorical(x, num_classes = vocab_size) for x in X]
    X = np.array(sequences)
    y = to_categorical(y, num_classes = vocab_size)
    return X,y

        
def preprocess(filename, seq_len):
    """
    preprocess file intro appropriate format for Keras
    """
    raw_text = load_file(filename)
    sequences = fixed_len_seq(raw_text, seq_len)
    mapping, vocab_size = create_dict(raw_text)
    vocab_size = 71
    encoded_seq = encode(sequences, mapping)
    x, y = create_xy(encoded_seq, vocab_size)
    return x, y, mapping, vocab_size

# **Data preprocessing**

In [7]:
X, y, mapping, vocab_size = preprocess('Dataset/TextAfterNormalize.txt', 64)

In [8]:
print(X.shape)
print(y.shape)

(19936, 64, 71)
(19936, 71)


In [11]:
# split data X, y into train val test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)

In [14]:
print(X_train.shape)
print(y_train.shape)
print('=====================')
print(X_val.shape)
print(y_val.shape)
print('=====================')
print(X_test.shape)
print(y_test.shape)

(14353, 64, 71)
(14353, 71)
(3589, 64, 71)
(3589, 71)
(1994, 64, 71)
(1994, 71)


# **Model training**

In [15]:
# model 1
# define model
batch_size = 64
drop_rate = 0.5
seq_len = 64
embedding_size = 32
rnn_size = 512
num_layers = 3
clip_norm = 5
learning_rate = 0.001

# build model
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_shape = (seq_len,)))
model.add(Dropout(drop_rate))
for _ in range(num_layers-1):
    model.add(LSTM(rnn_size, return_sequences=True))
    model.add(Dropout(drop_rate))
model.add(LSTM(rnn_size, return_sequences=False))
model.add(Dropout(drop_rate))
model.add(Dense(vocab_size, activation='softmax'))
#model.add(Flatten())
#model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))
print(model.summary())


2023-05-22 09:37:50.338728: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-22 09:37:50.423530: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-22 09:37:50.423753: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 32)            2272      
                                                                 
 dropout (Dropout)           (None, 64, 32)            0         
                                                                 
 lstm (LSTM)                 (None, 64, 512)           1116160   
                                                                 
 dropout_1 (Dropout)         (None, 64, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64, 512)           2099200   
                                                                 
 dropout_2 (Dropout)         (None, 64, 512)           0         
                                                                 
 lstm_2 (LSTM)               (None, 512)               2

2023-05-22 09:37:52.339486: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-22 09:37:52.341553: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-22 09:37:52.342758: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [98]:
# compile model
model.compile(loss="categorical_crossentropy", optimizer=Adam(),
              metrics=['accuracy'])
# fit model
model.fit(X_train, y_train, epochs=100, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 335s - loss: 3.7421 - acc: 0.0602
Epoch 2/100
 - 321s - loss: 3.6845 - acc: 0.0680
Epoch 3/100
 - 206s - loss: 3.6773 - acc: 0.0704
Epoch 4/100
 - 207s - loss: 3.6950 - acc: 0.0666
Epoch 5/100
 - 167s - loss: 3.6693 - acc: 0.0704
Epoch 6/100
 - 167s - loss: 3.5626 - acc: 0.0852
Epoch 7/100
 - 166s - loss: 3.4456 - acc: 0.1050
Epoch 8/100
 - 166s - loss: 3.3663 - acc: 0.1183
Epoch 9/100
 - 167s - loss: 3.3070 - acc: 0.1306
Epoch 10/100
 - 169s - loss: 3.2115 - acc: 0.1484
Epoch 11/100
 - 168s - loss: 3.1035 - acc: 0.1696
Epoch 12/100
 - 165s - loss: 3.0248 - acc: 0.1844
Epoch 13/100
 - 165s - loss: 2.9620 - acc: 0.1925
Epoch 14/100
 - 166s - loss: 2.9403 - acc: 0.2042
Epoch 15/100
 - 167s - loss: 2.8448 - acc: 0.2227
Epoch 16/100
 - 165s - loss: 2.7810 - acc: 0.2358
Epoch 17/100
 - 166s - loss: 2.7229 - acc: 0.2509
Epoch 18/100
 - 164s - loss: 2.6565 - acc: 0.2717
Epoch 19/100
 - 165s - loss: 2.6011 - acc: 0.2795
Epoch 20/10

<keras.callbacks.History at 0x7f64746f0358>

In [None]:
# save the model to file
model.save('Model/modelGen.h5')

# save the mapping
dump(mapping, open('Dict/mapping.pkl', 'wb'))

# **Generate text**

**model 2**

In [None]:
# load the model
model = load_model('Model/modelGen.h5')

# load the mapping
mapping = load(open('Dict/mapping.pkl', 'rb'))

In [None]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
	in_text = seed_text
	# generate a fixed number of characters
	for _ in range(n_chars):
		# encode the characters as integers
		encoded = [mapping[char] for char in in_text]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# one hot encode
		# encoded = to_categorical(encoded, num_classes=len(mapping))
		# predict character
		yhat = model.predict_classes(encoded, verbose=0)
		# reverse map integer to character
		out_char = ''
		for char, index in mapping.items():
			if index == yhat:
				out_char = char
				break
		# append to input
		in_text += char
	return in_text

In [219]:
a = generate_seq(model, mapping, 64, '๏จะ', 500)
print(a)

๏จะคุค้ฟั้วกบุบหยายพันจะไป
ขุนไกรได้ฟังให้แม่เข้าไป	หัวนักนั้นแล้วไปมาถึงมา
ครั้นถึงลุกนั้นเป็นเห็นนาง	เป็นเล่นหมุมขึงจะคลอน
พลิ้งโร่งคร่อยว่องเมือนสงา	พระเดชรับหวานอุกใสรรรยา	พลังนั่นมีแล้วก็ไม่
ทิ่งตระบื่นนี่ควาดรรดคร	ดังไร
พระสวนั่นอกลามอุหมาบเพียงแต้	ปู่ย่าตายายสบายใจ	ข้าไกให้เกิดมาช้า	อักเฉิดเห็นเห็นผิดตระสายร่วงาย
แล้วก็กเที่นายส่านสองไม้มา
เลี้ยงมาก็เดือนสนร้องไป	ขุนช้างเอาเห็นเห็นผิดกลับถัวนาน
นางแต่ล้วหน้ายวลยองคลอนค์ผิงดนกลายดแล้วนิน
เที่ยวถึงเย่นไม่
อยู่ในแล้วก็ได้
เที้ยงมาก็สมรับพลาน
จะ


# **Evaluate model on dev set**

In [209]:
model.evaluate(x_val, y_val)



[5.542001793510841, 0.16873996789727128]

   32/19936 [..............................] - ETA: 30s

# **Evaluate model on test set**

In [202]:
model.evaluate(X_test, y_test)



[10.327602405609326, 0.020064205457463884]