# [WORK IN PROGRESS]
# Temporal Attention Model for Neural Machine Translation
Unofficial implementation of paper: http://arxiv.org/abs/1608.02927

### Requirements:
 - [Keras](https://github.com/fchollet/keras)
 - [Tensorflow](https://github.com/tensorflow/tensorflow)
 - [Theano](https://github.com/Theano/Theano)
 - https://github.com/farizrahman4u/seq2seq Seq2Seq implemtation built on top of Keras

In [5]:
%pylab inline
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Downloading (French, English) language pair.

In [6]:
from tensorflow.models.rnn.translate import data_utils
data_dir = "/data/translate" # You may need to change that or create a sympolic link
vocab_size = 20000
pathes = data_utils.prepare_wmt_data(data_dir, vocab_size, vocab_size)
en2_path, fr2_path, en2013_path, fr2013_path, en_vocab_path, fr_vocab_path = pathes

In [7]:
import re

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

# https://github.com/nicolas-ivanov/tf_seq2seq_chatbot/blob/master/tf_seq2seq_chatbot/lib/data_utils.py

_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

# Regular expressions used to tokenize.
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE = re.compile(r"\d{3,}")

def read_vocab(vocab_path):
    vocab_list = []
    vocab_list.extend(_START_VOCAB)

    with open(vocab_path, 'br') as f:
        vocab_list.extend([s.decode("utf-8").strip() for s in f.readlines() if is_ascii(s)])

    words_to_ids = {w:i for (i, w) in enumerate(vocab_list)}
    ids_to_words = {i:w for (w, i) in words_to_ids.items()}
    return ids_to_words, words_to_ids

In [8]:
en_index, en_vocab = read_vocab(en_vocab_path)
fr_index, fr_vocab = read_vocab(fr_vocab_path)

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
    return [w.lower() for w in words if w]

def sentence_to_token_ids(sentence, vocabulary,
                          tokenizer=None, normalize_digits=True):
    """Convert a string to list of integers representing token-ids.

    For example, a sentence "I have a dog" may become tokenized into
    ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
    "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].

    Args:
    sentence: a string, the sentence to convert to token-ids.
    vocabulary: a dictionary mapping tokens to integers.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.

    Returns:
    a list of integers, the token-ids for the sentence.
    """
    if tokenizer:
        words = tokenizer(sentence)
    else:
        words = basic_tokenizer(sentence)
    if not normalize_digits:
        return [vocabulary.get(w, UNK_ID) for w in words]
  
    # Normalize digits by 0 before looking words up in the vocabulary.
    return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words]

def token_ids_to_sentence(ids, vocab_index):
    maybe_words = [vocab_index.get(_id) for _id in ids]
    return " ".join([w for w in maybe_words if w])

In [9]:
#FIXME: some non-ascii charachters
en_vocab_size = len(en_vocab) + 1
fr_vocab_size = len(fr_vocab) + 1

In [10]:
# test
print(len(en_vocab))
print(len(en_index))

19905
19905


In [11]:
ids = sentence_to_token_ids("A is me strategy stratégie", en_vocab)
print(ids)
token_ids_to_sentence(ids, en_index)

[15239, 22, 1511, 614, 3]


'a is me strategy'

### Reading dataset

In [12]:
pathes

('/data/translate/giga-fren.release2.ids20000.en',
 '/data/translate/giga-fren.release2.ids20000.fr',
 '/data/translate/newstest2013.ids20000.en',
 '/data/translate/newstest2013.ids20000.fr',
 '/data/translate/vocab20000.en',
 '/data/translate/vocab20000.fr')

In [13]:
def read_data(path):
    with open(path, 'r') as f:
        return [[int(x) for x in line.split(" ")] for line in f.read().splitlines()]

In [14]:
# e.g. [59, 3, 610, 9, 6251, 4, 3, 7, 3]
en_ids = read_data(en2013_path)
fr_ids = read_data(fr2013_path)

In [15]:
# Make it the same length (= the max length of the sentences) with zeros for shorter sentences
from keras.preprocessing.sequence import pad_sequences
en_set = pad_sequences(en_ids)
fr_set = pad_sequences(fr_ids)
en_max_length = en_set.shape[1]
fr_max_length = fr_set.shape[1]

Using Theano backend.
Using gpu device 0: GeForce GTX 980M (CNMeM is disabled, cuDNN 5005)


In [82]:
token_ids_to_sentence(fr_set[0], fr_index)

'base mieux génétiquement du _UNK'

### Building the model

In [20]:
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Merge, Dropout, RepeatVector, Permute, Activation, recurrent, LSTM, GRU
from keras.models import Sequential

### Embeding layer for en and fr 

In [70]:
EN_BOUND = max(en_vocab.values())
model = Sequential()
model.add(Embedding(EN_BOUND, EMBED_HIDDEN_SIZE, input_length=en_set.shape[1]))
model.compile('rmsprop', 'mse')
en_embed = model.predict(en_set)
en_embed.shape

(3000, 110, 50)

In [71]:
FR_BOUND = max(fr_vocab.values())
model = Sequential()
model.add(Embedding(FR_BOUND, EMBED_HIDDEN_SIZE, input_length=fr_set.shape[1]))
model.compile('rmsprop', 'mse')
fr_embed = model.predict(fr_set)
fr_embed.shape

(3000, 126, 50)

### Dummy seq2seq model

In [88]:
import seq2seq
from seq2seq.models import SimpleSeq2seq

model = SimpleSeq2seq(input_dim=en_max_length, hidden_dim=50, output_length=fr_vocab_size, output_dim=fr_max_length)
model.compile(loss='mse', optimizer='rmsprop')

In [96]:
model.fit(en_embed, fr_embed)

Exception: Error when checking model input: expected lstm_input_4 to have shape (None, None, 110) but got array with shape (3000, 110, 50)

In [83]:
# RNN = GRU
# EMBED_HIDDEN_SIZE = 50

# encoder = Sequential()
# encoder.add(Embedding(en_vocab_size, EMBED_HIDDEN_SIZE, input_length=en_max_length))

# decoder = Sequential()
# decoder.add(Embedding(fr_vocab_size, EMBED_HIDDEN_SIZE, input_length=fr_max_length))

# decoder.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
# decoder.add(RepeatVector(en_max_length))

# model = Sequential()
# model.add(Merge([encoder, decoder], mode='sum'))
# model.add(RNN(EMBED_HIDDEN_SIZE, return_sequences=False))
# model.add(Dropout(0.3))
# model.add(Dense(fr_vocab_size, activation='softmax'))

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [87]:
# p = model.predict([en_set, fr_set])

In [86]:
# import tensorflow as tf
# outputs = [int(np.argmax(logit, axis=0)) for logit in p[0]][0:fr_input_length]
# token_ids_to_sentence(outputs, fr_index)
# # [fr_ids[output] for output in outputs]
# # print(" ".join([tf.compat.as_str(fr_ids[output]) for output in outputs]))