In [1]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, \
  Bidirectional, , Concatenate, Activation, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

import numpy as np
import matplotlib.pyplot as plt

if len(K.tensorflow_backend._get_available_gpus()) > 0:
  from keras.layers import CuDNNLSTM as LSTM
  from keras.layers import CuDNNGRU as GRU

import jiebae

Using TensorFlow backend.


In [23]:
# make sure we do softmax over the time axis
# expected shape is N x T x D
# note: the latest version of Keras allows you to pass in axis arg
def softmax_over_time(x):
  assert(K.ndim(x) > 2)
  e = K.exp(x - K.max(x, axis=1, keepdims=True))
  s = K.sum(e, axis=1, keepdims=True)
  return e / s

In [2]:
# config
BATCH_SIZE = 64
EPOCHS = 100
LATENT_DIM = 256
LATENT_DIM_DECODER = 256 # idea: make it different to ensure things all fit together properly!
NUM_SAMPLES = 10000
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100

In [9]:
input_texts = []
target_texts = [] 
target_texts_inputs = [] 

t = 0
for line in open('../../machine_learning_examples/large_files/translation/cmn.txt', encoding='utf-8'):
  t += 1
  if t > NUM_SAMPLES:
    break

  if '\t' not in line:
    continue

  input_text, translation, _ = line.rstrip().split('\t')

  target_text = translation 
  target_text_input = translation

  input_texts.append(input_text)
  target_texts.append(target_text)
  target_texts_inputs.append(target_text_input)
print("num samples:", len(input_texts))

num samples: 10000


In [6]:
# tokenize the inputs
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

# get the word to index mapping for input language
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

Found 3472 unique input tokens.


In [7]:
max_len_input = max(len(s) for s in input_sequences)

In [10]:
#for chinese, using jieba to tokenize => establish word2idx => establish target sequences

stop_words = []
with open('../../machine_learning_examples/corpus/stop_words.txt') as f:
    for line in f:
        stop_words.append(line.rstrip())

In [14]:
target_seg = []
target_input_seg = []
for t in target_texts:
    seg = jieba.lcut(t)
    seg = [s for s in seg if s not in stop_words]
    target_seg.append(seg + ['<eos>'])
    target_input_seg.append(['<sos>'] + seg)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\AMOCH~1.LIU\AppData\Local\Temp\jieba.cache
Loading model cost 0.584 seconds.
Prefix dict has been built successfully.


In [15]:
word2idx_outputs = {}

for L in target_seg:
    for token in L + ['<sos>']:
        if token not in word2idx_outputs:
            word2idx_outputs[token] = len(word2idx_outputs) + 1

In [16]:
target_sequences = []
target_input_sequences = []

for L in target_seg:
    sequence = []
    for seg in L:
        sequence.append(word2idx_outputs.get(seg))
    target_sequences.append(sequence)

for L in target_input_seg:
    sequence = []
    for seg in L:
        sequence.append(word2idx_outputs.get(seg))
    target_input_sequences.append(sequence)

In [17]:
print('Found %s unique output tokens.' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_len_target = max(len(s) for s in target_sequences)

Found 6651 unique output tokens.


In [18]:
# pad the sequences
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])

decoder_inputs = pad_sequences(target_input_sequences, maxlen=max_len_target, padding='post')
print("decoder_inputs[0]:", decoder_inputs[0])
print("decoder_inputs.shape:", decoder_inputs.shape)

decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

encoder_inputs.shape: (10000, 9)
encoder_inputs[0]: [  0   0   0   0   0   0   0   0 911]
decoder_inputs[0]: [3 1 0 0 0 0 0 0 0 0 0 0 0]
decoder_inputs.shape: (10000, 13)


In [19]:
# store all the pre-trained word vectors
word2vec = {}
with open(os.path.join('../../machine_learning_examples/large_files/glove.6B/glove.6B.%sd.txt' % EMBEDDING_DIM), encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
    
print('Found %s word vectors.' % len(word2vec))

Found 400000 word vectors.


In [20]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
  if i < MAX_NUM_WORDS:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
      embedding_matrix[i] = embedding_vector

In [21]:
# create embedding layer
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=max_len_input
)

In [22]:
# create targets, since we cannot use sparse
# categorical cross entropy when we have sequences
decoder_targets_one_hot = np.zeros(
  (
    len(input_texts),
    max_len_target,
    num_words_output
  ),
  dtype='float32'
)

# assign the values
for i, d in enumerate(decoder_targets):
  for t, word in enumerate(d):
    decoder_targets_one_hot[i, t, word] = 1

In [24]:
##### build the model #####

# Set up the encoder
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(
  LATENT_DIM,
  return_sequences=True,
  # dropout=0.5 
))
encoder_outputs = encoder(x)


# Set up the decoder 
decoder_inputs_placeholder = Input(shape=(max_len_target,))

# this word embedding will not use pre-trained vectors
# although you could
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
######### Attention #########
# Attention layers need to be global because they will be repeated Ty times at the decoder
attn_repeat_layer = RepeatVector(max_len_input)
attn_concat_layer = Concatenate(axis=-1)
attn_dense1 = Dense(10, activation='tanh')
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1) # to perform the weighted sum of alpha[t] * h[t]

In [29]:
X = 8
N = 3

sample = np.random.randn(N, X)

input_ = Input(shape=(X, ))
dense = Dense(10, activation='tanh')
output = dense(input_)

model = Model(inputs=input_, outputs=output)
pred = model.predict(X)

AttributeError: 'int' object has no attribute 'ndim'

In [28]:
pred.shape

(1, 8, 10)