In [1]:
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
def get_tokenizer(text):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    return tokenizer

def max_length(lines):
    return max(len(x) for x in lines)

from keras.preprocessing.sequence import pad_sequences
def encode_sequences(tokenizer, length, lines):
    sequences = tokenizer.texts_to_sequences(lines)
    sequences = pad_sequences(sequences, maxlen=length, padding='post')
    return sequences

from keras.utils import to_categorical
from numpy import array
def encode_one_hot_sequences(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Dense
def define_model(src_vocab, tgt_vocab, src_timesteps, tgt_timesteps, n_units):
    model = keras.models.Sequential([
        keras.layers.Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        keras.layers.LSTM(n_units),
        keras.layers.RepeatVector(tgt_timesteps),
        keras.layers.LSTM(n_units, return_sequences=True),
        keras.layers.Dense(tgt_vocab, activation='softmax')
    ])
    #model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    #model.add(LSTM(n_units))
    #model.add(RepeatVector(tgt_timesteps))
    #model.add(LSTM(n_units, return_sequence=True))
    #model.add(TimeDistributed(Dense(tgt_vocab, activation='softmax')))
    return model

Using TensorFlow backend.


In [2]:
from snownlp import SnowNLP
def word_tokenize(text):
    result = ''
    s = SnowNLP(text)
    for w in s.words:
        word = w.strip()
        if word != '':
            result = result + ' ' + word
    return result

In [3]:
import os
def read_data():
    pairs = []
    DATA_PATH = os.path.join("../datasets", "cmn.txt")
    file = open(DATA_PATH, mode='r', encoding ='utf8')
    text = file.read()
    lines = text.strip().split('\n')
    for line in lines:
        splits = line.split('\t')
        src = line.split('\t')[0]
        tgt = word_tokenize(splits[1])
        pair = [src, tgt]
        pairs.append(pair)
    return pairs

In [4]:
from numpy.random import shuffle

N_TOTAL_SENTENCES = 10000
TRAINING_SIZE = 9000

pairs = read_data()
raw_dataset = array(pairs)
dataset = raw_dataset[:N_TOTAL_SENTENCES]
#shuffle(dataset)

train, test = dataset[:9000], dataset[9000:]
src = dataset[:, 0]
tgt = dataset[:, 1]

src_train = train[:, 0]
tgt_train = train[:, 1]

src_test = test[:, 0]
tgt_test = test[:, 1]

src_tok = get_tokenizer(src)
src_index = src_tok.word_index
src_length = max_length(src)
src_vocab_size = len(src_index) + 1

tgt_tok = get_tokenizer(tgt)
tgt_index = tgt_tok.word_index
tgt_length = max_length(tgt)
tgt_vocab_size = len(tgt_index) + 1

trainX = encode_sequences(src_tok, src_length, src_train)
trainY = encode_sequences(tgt_tok, tgt_length, tgt_train)
trainY = encode_one_hot_sequences(trainY, tgt_vocab_size)

testX = encode_sequences(src_tok, src_length, src_test)
testY = encode_sequences(tgt_tok, tgt_length, tgt_test)
testY = encode_one_hot_sequences(testY, tgt_vocab_size)

model = define_model(src_vocab_size, tgt_vocab_size, src_length, tgt_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
print(model.summary())

from keras.utils.vis_utils import plot_model
#plot_model(model, to_file='model.png', show_shapes=True)

filename='mtrans.h5'

from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=100, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)
model.save(filename)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 256)           879360    
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 33, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 33, 256)           525312    
_________________________________________________________________
dense (Dense)                (None, 33, 5205)          1337685   
Total params: 3,267,669
Trainable params: 3,267,669
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
141/141 - 82s - loss: 1.9087 - val_loss: 1.5075

Epoch 00001: val_loss improved from inf to 1.5

141/141 - 113s - loss: 0.1764 - val_loss: 1.7552

Epoch 00060: val_loss did not improve from 1.38970
Epoch 61/100
141/141 - 109s - loss: 0.1699 - val_loss: 1.7542

Epoch 00061: val_loss did not improve from 1.38970
Epoch 62/100
141/141 - 111s - loss: 0.1625 - val_loss: 1.7794

Epoch 00062: val_loss did not improve from 1.38970
Epoch 63/100
141/141 - 107s - loss: 0.1548 - val_loss: 1.8001

Epoch 00063: val_loss did not improve from 1.38970
Epoch 64/100
141/141 - 106s - loss: 0.1491 - val_loss: 1.8118

Epoch 00064: val_loss did not improve from 1.38970
Epoch 65/100
141/141 - 110s - loss: 0.1450 - val_loss: 1.8077

Epoch 00065: val_loss did not improve from 1.38970
Epoch 66/100
141/141 - 110s - loss: 0.1412 - val_loss: 1.8366

Epoch 00066: val_loss did not improve from 1.38970
Epoch 67/100
141/141 - 107s - loss: 0.1399 - val_loss: 1.8314

Epoch 00067: val_loss did not improve from 1.38970
Epoch 68/100
141/141 - 113s - loss: 0.1327 - val_loss: 1.8595

Epoch 00068: val_loss did not improve 

In [5]:
import json
from numpy import array

def read_data(file):
    pairs = []
    f = open(file, mode='r', encoding ='utf8')
    for x in f:
        sample = json.loads(x)
        pair = [sample['source'], sample['target']]
        pairs.append(pair)
    return array(pairs)

def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

from tensorflow.keras.models import load_model
from numpy import argmax

model = load_model('mtrans.h5')
myData = read_data('tiny.json')[:10]

# from ipynb.fs.full.model import encode_sequences
# from ipynb.fs.full.model import src_tok
# from ipynb.fs.full.model import src_length
# from ipynb.fs.full.model import tgt_tok

myTest = encode_sequences(src_tok, src_length, myData[:,0])

for count, source in enumerate(myTest):
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, tgt_tok, source)
    print(myData[:,0][count], source, translation, sep=':')

Tom's tired:[[112 188   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]:湯姆 累 了 。
They hugged:[[  46 1029    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]]:他们 拥抱 。
hug me:[[1018   10    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]]:为什么 是 。
i:[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]:我 我 相信 相信 相信 相信 相信 进 进 进 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入 入
Unbelievable!:[[1463    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0]]:难以置信
i ran:[[  1 309   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]:我 喜欢 了 。
Got it:[[69  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]]:你 懂 了 