# [WORK IN PROGRESS]
# Temporal Attention Model for Neural Machine Translation
Unofficial implementation of paper: http://arxiv.org/abs/1608.02927

### Requirements:
 - [Keras](https://github.com/fchollet/keras)
 - [Tensorflow](https://github.com/tensorflow/tensorflow)
 - [Theano](https://github.com/Theano/Theano)
 - https://github.com/farizrahman4u/seq2seq Seq2Seq implemtation built on top of Keras

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2



Populating the interactive namespace from numpy and matplotlib


### Downloading (French, English) language pair.

In [2]:
from tensorflow.models.rnn.translate import data_utils
data_dir = "/data/translate" # You may need to change that or create a sympolic link
vocab_size = 20000
pathes = data_utils.prepare_wmt_data(data_dir, vocab_size, vocab_size)
en2_path, fr2_path, en2013_path, fr2013_path, en_vocab_path, fr_vocab_path = pathes

In [3]:
import re

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

# https://github.com/nicolas-ivanov/tf_seq2seq_chatbot/blob/master/tf_seq2seq_chatbot/lib/data_utils.py

_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(r"\d{3,}")

def read_vocab(vocab_path):
    vocab_list = []
    vocab_list.extend(_START_VOCAB)

    with open(vocab_path, 'br') as f:
        vocab_list.extend([s.decode("utf-8").strip() for s in f.readlines() if is_ascii(s)])

    words_to_ids = {w:i for (i, w) in enumerate(vocab_list)}
    ids_to_words = {i:w for (w, i) in words_to_ids.items()}
    return ids_to_words, words_to_ids

In [4]:
en_index, en_vocab = read_vocab(en_vocab_path)
fr_index, fr_vocab = read_vocab(fr_vocab_path)

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
    return [w.lower() for w in words if w]

def sentence_to_token_ids(sentence, vocabulary,
                          tokenizer=None, normalize_digits=True):
    """Convert a string to list of integers representing token-ids.

    For example, a sentence "I have a dog" may become tokenized into
    ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
    "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].

    Args:
    sentence: a string, the sentence to convert to token-ids.
    vocabulary: a dictionary mapping tokens to integers.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

    Returns:
    a list of integers, the token-ids for the sentence.
    """
    if tokenizer:
        words = tokenizer(sentence)
    else:
        words = basic_tokenizer(sentence)
    if not normalize_digits:
        return [vocabulary.get(w, UNK_ID) for w in words]
  
    # Normalize digits by 0 before looking words up in the vocabulary.
    return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words]

def token_ids_to_sentence(ids, vocab_index):
    maybe_words = [vocab_index.get(_id) for _id in ids]
    return " ".join([w for w in maybe_words if w])

In [5]:
#FIXME: some non-ascii charachters
en_vocab_size = len(en_vocab) + 1
fr_vocab_size = len(fr_vocab) + 1

In [6]:
# test
print(len(en_vocab))
print(len(en_index))

19905
19905


In [7]:
ids = sentence_to_token_ids("A is me strategy stratégie", en_vocab)
print(ids)
token_ids_to_sentence(ids, en_index)

[15239, 22, 1511, 614, 3]


'a is me strategy'

### Reading dataset

In [8]:
pathes

('/data/translate/giga-fren.release2.ids20000.en',
 '/data/translate/giga-fren.release2.ids20000.fr',
 '/data/translate/newstest2013.ids20000.en',
 '/data/translate/newstest2013.ids20000.fr',
 '/data/translate/vocab20000.en',
 '/data/translate/vocab20000.fr')

In [9]:
def read_data(path):
    with open(path, 'r') as f:
        return [[int(x) for x in line.split(" ")] for line in f.read().splitlines()]

In [10]:
# e.g. [59, 3, 610, 9, 6251, 4, 3, 7, 3]
en_ids = read_data(en2013_path)
fr_ids = read_data(fr2013_path)

In [11]:
# Make it the same length (= the max length of the sentences) with zeros for shorter sentences
from keras.preprocessing.sequence import pad_sequences
en_set = pad_sequences(en_ids)
fr_set = pad_sequences(fr_ids)
en_max_length = en_set.shape[1]
fr_max_length = fr_set.shape[1]

Using TensorFlow backend.


In [12]:
token_ids_to_sentence(fr_set[0], fr_index)

'base mieux génétiquement du _UNK'

In [13]:
en_max_features = max(en_vocab.values())
fr_max_features = max(fr_vocab.values())
embedding_size = 64

In [14]:
print("number of samples:", en_set.shape[0])
print("en_max_length:", en_max_length)
print("fr_max_length:", fr_max_length)
print("en_max_features:", en_max_features)
print("fr_max_features:", fr_max_features)
print("embedding_size:", embedding_size)

number of samples: 3000
en_max_length: 110
fr_max_length: 126
en_max_features: 20003
fr_max_features: 20003
embedding_size: 64


In [15]:
en_set.shape[1]

110

### Embeding layer for en and fr 

In [16]:
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Merge, Dropout, RepeatVector, Permute, Activation, recurrent, LSTM, GRU
from keras.models import Sequential
from keras.layers.wrappers import TimeDistributed, Bidirectional

In [17]:
model = Sequential()
model.add(Embedding(en_max_features, embedding_size, input_length=en_max_length, mask_zero=True))
model.compile('rmsprop', 'mse')
en_embed = model.predict(en_set)
en_embed.shape

(3000, 110, 64)

In [18]:
model = Sequential()
model.add(Embedding(fr_max_features, embedding_size, input_length=fr_max_length, mask_zero=True))
model.compile('rmsprop', 'mse')
fr_embed = model.predict(fr_set)
fr_embed.shape

(3000, 126, 64)

### Building the model

In [19]:
# Experiment  1
hidden_size = 32
model = Sequential()
model.add(GRU(hidden_size, input_shape=(en_max_length, embedding_size))) # (3000, 110, 64) -> (3000, 32)
model.add(RepeatVector(fr_max_length)) # (3000, 32) -> (3000, 126, 32)
# model.add(TimeDistributed(Dense(fr_max_length)))
model.add(Bidirectional(GRU(embedding_size, return_sequences=True), merge_mode='sum')) # (3000, 126, 32) -> (3000, 126, 64)
model.output_shape

(None, 126, 64)

In [20]:
# Experiment  2
hidden_size = 32
model = Sequential()
model.add(Embedding(en_max_features, embedding_size, input_length=en_max_length, mask_zero=True))
model.add(Bidirectional(GRU(hidden_size), merge_mode='sum'))
model.add(RepeatVector(fr_max_length))
model.add(GRU(embedding_size, return_sequences=True))
print(model.output_shape)
model.compile('rmsprop', 'mse')
model.fit(en_set, fr_embed)

(None, 126, 64)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb3fec0ebe0>

In [21]:
hidden_size = 32
model = Sequential()
model.add(GRU(hidden_size, input_shape=(en_max_length, embedding_size))) # (3000, 110, 64) -> (3000, 32)
model.add(RepeatVector(fr_max_length)) # (3000, 32) -> (3000, 126, 32)
model.add(GRU(hidden_size, return_sequences=True)) # (3000, 126, 32) -> (3000, 126, 32)
model.add(TimeDistributed(Dense(embedding_size))) # (3000, 126, 32) -> (3000, 126, 64)
model.compile('rmsprop', 'mse')

model.fit(en_embed, fr_embed)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb3c2a97390>

In [23]:
p = model.predict(en_embed[0:1])[0]

## Other trials

### Model infered from Seq2Seq
```python
model = Sequential()
model.add(LSTM(hidden_dim, return_sequences=True, mask_zero=True))
model.add(Dropout(droupout))
model.add(LSTM(hidden_dim, )) # Encoder
model.add(Dropout(droupout))
model.add(RepeatVecotr(output_lenght))
model.add(LSTM(hidden_dim, return_sequences=True, )) # Decoder
model.add(LSTM(hidden_dim, return_sequences=True, ))
model.add(Droupout(droupout))
model.add(TimeDistributed(Dense(output_dim)))
model.compile('rmsprop', 'mse')
```

### https://github.com/fchollet/keras/issues/395

In the model I listed below with the english sentence as input and the entire french sentence as output. The RNN model will maintain state across each timestep as it predicts the output sentence, no extra work required on your behalf. You will however need to one hot encode and zero pad the output sequence (the french sentence) and have it do a softmax over all possible words for the output at each time step. The ys then are 3D, each row is a matrix of height - number of french words, and width - number of time steps.

```python
embedding_size = 50
hidden_size = 512
output_size = 20
maxlen = 60

model = Sequential()
model.add(JZS1(embedding_size, hidden_size)) # try using a GRU instead, for fun
model.add(Dense(hidden_size, hidden_size))
model.add(Activation('relu'))
model.add(RepeatVector(maxlen))
model.add(JZS1(hidden_size, hidden_size, return_sequences=True))
model.add(TimeDistributedDense(hidden_size, output_size, activation="softmax"))

model.compile(loss='mse', optimizer='adam')
```

In [85]:
from keras.layers.wrappers import TimeDistributed

embedding_size = 64
hidden_size = 512
embedding_size = EN_REPRESENTATION_SIZE
MAX_LEN = fr_set.shape[1]
max_features = FR_BOUND

print('Build model...')
model = Sequential()
model.add(Embedding(EN_REPRESENTATION_SIZE, EMBED_HIDDEN_SIZE, input_length=en_set.shape[1], mask_zero=True))
model.add(GRU(hidden_size)) # try using a GRU instead, for fun
model.add(Dense(hidden_size))
model.add(Activation('relu'))
model.add(RepeatVector(MAX_LEN))
model.add(GRU(hidden_size, return_sequences=True))
model.add(TimeDistributed(Dense(max_features, activation="softmax")))

model.compile(loss='mse', optimizer='adam')

Build model...


### Dummy seq2seq model

In [40]:
import seq2seq
from seq2seq.models import SimpleSeq2seq

model = Sequential()
model.add(Embedding(EN_REPRESENTATION_SIZE, EMBED_HIDDEN_SIZE, input_length=en_set.shape[1]))
model = SimpleSeq2seq(
        input_dim=EN_REPRESENTATION_SIZE,
        input_length=en_max_length,
        hidden_dim=50,
        output_length=FR_REPRESENTATION_SIZE,
        output_dim=fr_max_length)

model.compile(loss='mse', optimizer='rmsprop')

In [30]:
# model.fit(en_embed, fr_set)

In [83]:
# RNN = GRU
# EMBED_HIDDEN_SIZE = 50

# encoder = Sequential()
# encoder.add(Embedding(en_vocab_size, EMBED_HIDDEN_SIZE, input_length=en_max_length))

# decoder = Sequential()
# decoder.add(Embedding(fr_vocab_size, EMBED_HIDDEN_SIZE, input_length=fr_max_length))

# decoder.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
# decoder.add(RepeatVector(en_max_length))

# model = Sequential()
# model.add(Merge([encoder, decoder], mode='sum'))
# model.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
# model.add(Dropout(0.3))
# model.add(Dense(fr_vocab_size, activation='softmax'))

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [87]:
# p = model.predict([en_set, fr_set])

In [86]:
# import tensorflow as tf
# outputs = [int(np.argmax(logit, axis=0)) for logit in p[0]][0:fr_input_length]
# token_ids_to_sentence(outputs, fr_index)
# # [fr_ids[output] for output in outputs]
# # print(" ".join([tf.compat.as_str(fr_ids[output]) for output in outputs]))