# Neural Machine Translation
## PJ : 13518117 - Muhammad Firas

In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

## Preparing Text Data

In [2]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, mode='rt', encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [3]:
# split a loaded document into sentences
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

### Cleaning Data

In [4]:
# clean a list of lines
def clean_pairs(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair = list()
		for line in pair:
			# normalize unicode characters
			line = normalize('NFD', line).encode('ascii', 'ignore')
			line = line.decode('UTF-8')
			# tokenize on white space
			line = line.split()
			# convert to lowercase
			line = [word.lower() for word in line]
			# remove punctuation from each token
			line = [word.translate(table) for word in line]
			# remove non-printable chars form each token
			line = [re_print.sub('', w) for w in line]
			# remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			# store as string
			clean_pair.append(' '.join(line))
		cleaned.append(clean_pair)
	return array(cleaned)

In [5]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [6]:
# load dataset
filename = 'ind.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-indonesia.pkl')
# spot check
for i in range(100):
	print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-indonesia.pkl
[run] => [lari]
[who] => [siapa]
[wow] => [wow]
[help] => [tolong]
[jump] => [lompat]
[jump] => [loncat]
[stop] => [berhenti]
[wait] => [tunggu]
[wait] => [tunggu]
[hurry] => [cepatlah]
[hurry] => [cepat]
[hurry] => [buruan]
[i see] => [begitu rupanya]
[smile] => [senyum]
[attack] => [serang]
[cheers] => [bersulang]
[freeze] => [jangan bergerak]
[get up] => [bangunlah]
[got it] => [aku mengerti]
[got it] => [mengerti]
[listen] => [dengar]
[no way] => [tak bisa]
[no way] => [tidak mungkin]
[really] => [benarkah]
[really] => [beneran]
[thanks] => [makasih]
[thanks] => [terima kasih]
[thanks] => [makasih]
[we try] => [kami mencoba]
[why me] => [kenapa harus saya]
[awesome] => [mengagumkan]
[come on] => [ayo]
[get out] => [keluar]
[goodbye] => [sampai jumpa]
[hold it] => [tahan]
[i agree] => [aku setuju]
[im sad] => [saya sedih]
[its ok] => [tidak apaapa]
[its me] => [ini aku]
[its me] => [ini aku]
[me too] => [aku juga]
[perfect] => [sempurna]
[see you] => [sa

### Data Splitting

In [7]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

# load dataset
dataset = load_clean_sentences('english-indonesia.pkl')

# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:7200], dataset[7200:]
# save
save_clean_data(dataset, 'english-indonesia-both.pkl')
save_clean_data(train, 'english-indonesia-train.pkl')
save_clean_data(test, 'english-indonesia-test.pkl')

Saved: english-indonesia-both.pkl
Saved: english-indonesia-train.pkl
Saved: english-indonesia-test.pkl


In [8]:
# load datasets
dataset = load_clean_sentences('english-indonesia-both.pkl')
train = load_clean_sentences('english-indonesia-train.pkl')
test = load_clean_sentences('english-indonesia-test.pkl')

In [9]:
# fit a tokenizer
def create_tokenizer(lines):
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

In [10]:
# max sentence length
def max_length(lines):
	return max(len(line.split()) for line in lines)

In [11]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare indoensia tokenizer
ind_tokenizer = create_tokenizer(dataset[:, 1])
ind_vocab_size = len(ind_tokenizer.word_index) + 1
ind_length = max_length(dataset[:, 1])
print('Indonesia Vocabulary Size: %d' % ind_vocab_size)
print('Indonesia Max Length: %d' % (ind_length))

English Vocabulary Size: 3947
English Max Length: 32
Indonesia Vocabulary Size: 4774
Indonesia Max Length: 25


In [12]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
	# integer encode sequences
	X = tokenizer.texts_to_sequences(lines)
	# pad sequences with 0 values
	X = pad_sequences(X, maxlen=length, padding='post')
	return X

In [13]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
	ylist = list()
	for sequence in sequences:
		encoded = to_categorical(sequence, num_classes=vocab_size)
		ylist.append(encoded)
	y = array(ylist)
	y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
	return y

In [14]:
# prepare training data
trainX = encode_sequences(ind_tokenizer, ind_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainX = encode_output(trainX, ind_vocab_size)
# prepare validation data
testX = encode_sequences(ind_tokenizer, ind_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testX = encode_output(testX, ind_vocab_size)

## Membuat model LSTM

In [15]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
	model = Sequential()
	model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
	model.add(LSTM(n_units))
	model.add(RepeatVector(tar_timesteps))
	model.add(LSTM(n_units, return_sequences=True))
	model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
	return model

# define model
model = define_model(eng_vocab_size, ind_vocab_size, eng_length, ind_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize defined model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 256)           1010432   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 25, 256)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 25, 256)           525312    
                                                                 
 time_distributed (TimeDistr  (None, 25, 4774)         1226918   
 ibuted)                                                         
                                                                 
Total params: 3,287,974
Trainable params: 3,287,974
Non-

### Train model

In [16]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainY, trainX, epochs=30, batch_size=64, validation_data=(testY, testX), callbacks=[checkpoint], verbose=2)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 1.52151, saving model to model.h5
113/113 - 21s - loss: 2.3768 - accuracy: 0.7832 - val_loss: 1.5215 - val_accuracy: 0.7926 - 21s/epoch - 182ms/step
Epoch 2/30

Epoch 00002: val_loss improved from 1.52151 to 1.45898, saving model to model.h5
113/113 - 7s - loss: 1.4963 - accuracy: 0.7954 - val_loss: 1.4590 - val_accuracy: 0.7990 - 7s/epoch - 62ms/step
Epoch 3/30

Epoch 00003: val_loss improved from 1.45898 to 1.43552, saving model to model.h5
113/113 - 7s - loss: 1.4435 - accuracy: 0.7963 - val_loss: 1.4355 - val_accuracy: 0.8019 - 7s/epoch - 62ms/step
Epoch 4/30

Epoch 00004: val_loss improved from 1.43552 to 1.43288, saving model to model.h5
113/113 - 7s - loss: 1.4148 - accuracy: 0.7982 - val_loss: 1.4329 - val_accuracy: 0.8013 - 7s/epoch - 62ms/step
Epoch 5/30

Epoch 00005: val_loss did not improve from 1.43288
113/113 - 7s - loss: 1.4039 - accuracy: 0.7983 - val_loss: 1.4351 - val_accuracy: 0.8020 - 7s/epoch - 61ms/step
Epoch 

<keras.callbacks.History at 0x7fe6a0762390>

In [17]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

In [18]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
	prediction = model.predict(source, verbose=0)[0]
	integers = [argmax(vector) for vector in prediction]
	target = list()
	for i in integers:
		word = word_for_id(i, tokenizer)
		if word is None:
			break
		target.append(word)
	return ' '.join(target)

### Evaluasi hasil train model

In [21]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    # translate encoded source text
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, ind_tokenizer, source)
    # print(raw_dataset[i][0])
    # raw_target, raw_src = raw_dataset[i]
    raw_target = raw_dataset[i][1]
    raw_src = raw_dataset[i][0]
    if i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    actual.append([raw_target.split()])
    predicted.append(translation.split())
  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [22]:
# load model
model = load_model('model.h5')

# test on some test sequences
print('test')
evaluate_model(model, ind_tokenizer, testY, test)

test
src=[how many people are on the list now], target=[berapa banyak orang yang ada di dalam daftar sekarang], predicted=[tom tom tom yang yang hari]
src=[tom retired from coaching in], target=[tom berhenti melatih pada tahun], predicted=[tom adalah yang di yang]
src=[you dont need this], target=[kau tidak memerlukan ini], predicted=[apa kau kamu pergi]
src=[why would you want talk to tom about that], target=[kenapa kau ingin membicarakan hal itu dengan tom], predicted=[apa tidak tidak yang yang yang]
src=[i think youre right], target=[aku pikir kamu benar], predicted=[aku tidak tidak]
src=[japanese companies generally provide their employees with uniforms], target=[perusahan jepang umumnya menyediakan seragam untuk para karyawannya], predicted=[tom itu itu dan dan yang yang yang]
src=[my father doesnt like football], target=[ayahku tidak suka sepak bola], predicted=[aku harus ke ke]
src=[could you please repeat it once again], target=[bisakah anda mengulanginya sekali lagi], predicte

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.157234
BLEU-2: 0.051310
BLEU-3: 0.021720
BLEU-4: 0.039971
