In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np 
import time
import random
# Switch TensorFlow to eager mode: later cells iterate `dataset` with a plain
# Python `for` loop and call `.numpy()` on tensors, which relies on it.
tf.enable_eager_execution()

In [None]:
# Load the parallel corpus, one sentence per line; line k of the `.en` file is
# paired with line k of the `.de` file. `with` closes both handles
# deterministically instead of leaving them to the garbage collector.
with open("data/deu-tok.en", "r", encoding="utf-8") as en_file:
    en = [line.strip() for line in en_file]
with open("data/deu-tok.de", "r", encoding="utf-8") as de_file:
    de = [line.strip() for line in de_file]

# Remove the 10 longest pairs, measured by the number of space-separated
# tokens in the `en` sentence. Deleting the same index from both lists keeps
# them aligned. `min(...)` avoids calling max() on an empty range when the
# corpus holds fewer than 10 sentences.
for _ in range(min(10, len(en))):
    i_mx = max(range(len(en)), key=lambda j: len(en[j].split(" ")))
    del en[i_mx]
    del de[i_mx]

In [None]:
# Histogram (log-scaled counts) of `en` sentence lengths in space-separated tokens.
plt.hist(list(map(lambda sentence: len(sentence.split(" ")), en)), bins='auto', log=True);

In [None]:
# Histogram (log-scaled counts) of `de` sentence lengths in space-separated tokens.
plt.hist(list(map(lambda sentence: len(sentence.split(" ")), de)), bins='auto', log=True);

In [None]:
def easy_subset(D1, D2, n):
    """
    Select the n sentence pairs whose rarest word is the most frequent.

    Word frequencies are counted separately for each side over the paired
    sentences. A pair's score is the smallest frequency of any word occurring
    in either of its two sentences; pairs are ranked by that score, highest
    first (ties keep their original order), and the first n are returned.

    :param D1: Sequence of sentences (space-separated words), first side.
    :param D2: Sequence of sentences aligned index-by-index with D1.
    :param n: Number of pairs to keep.
    :return: Tuple of two numpy arrays holding the selected sentences of D1
             and D2, in ranking order.
    """
    c1, c2 = {}, {}
    for d1, d2 in zip(D1, D2):
        for w in d1.split(" "):
            c1[w] = c1.get(w, 0) + 1
        for w in d2.split(" "):
            c2[w] = c2.get(w, 0) + 1

    def rarest_word_count(i):
        # Frequency of the least common word on either side of pair i.
        # str.split(" ") never returns an empty list, so both min() calls are safe.
        return min(min(c1[w] for w in D1[i].split(" ")),
                   min(c2[w] for w in D2[i].split(" ")))

    ranked = sorted(range(len(D1)), key=rarest_word_count, reverse=True)[:n]

    # Force an integer dtype: np.array([]) defaults to float64, which np.take
    # rejects as an index array when the selection is empty (e.g. n == 0).
    inds = np.array(ranked, dtype=np.intp)
    return np.take(D1, inds), np.take(D2, inds)

In [None]:
# Keep only the 20000 highest-ranked pairs. This rebinds `en`/`de`, so
# re-running the cell operates on the already-reduced data.
en, de = easy_subset(D1=en, D2=de, n=20000)

In [None]:
def to_ints(D, max_vocab_size):
    """
    Encode sentences as lists of integer word ids.

    Ids 0-3 are reserved for "<PAD>", "<START>", "<END>" and "<UNK>". The
    max_vocab_size most frequent remaining words get ids 4, 5, ... in order of
    decreasing frequency (ties keep first-seen order). Every encoded sentence
    is wrapped in <START> ... <END>, and words outside the vocabulary map to
    <UNK>.

    :param D: Iterable of sentences (space-separated words). It is consumed
              only once, so a generator is accepted.
    :param max_vocab_size: Maximum number of non-reserved words to keep.
    :return: (Didx, w2idx, idx2w) - the encoded sentences, the word -> id
             mapping and its inverse.
    """
    reserved = ("<PAD>", "<START>", "<END>", "<UNK>")

    # Tokenise once so the same data serves both counting and encoding.
    tokenised = [s.split(" ") for s in D]

    counts = {}
    for words in tokenised:
        for w in words:
            counts[w] = counts.get(w, 0) + 1

    # Reserved tokens that literally occur in the data must not take a regular
    # slot: their id is overwritten below, which would leave a hole in the id
    # range and push the largest id past len(w2idx) - 1.
    candidates = [item for item in counts.items() if item[0] not in reserved]
    kept = sorted(candidates, reverse=True, key=lambda p: p[1])[:max_vocab_size]

    w2idx = {word: len(reserved) + i for i, (word, _) in enumerate(kept)}
    for i, token in enumerate(reserved):
        w2idx[token] = i

    idx2w = {v: k for k, v in w2idx.items()}

    start, end, unk = w2idx["<START>"], w2idx["<END>"], w2idx["<UNK>"]
    Didx = [[start] + [w2idx.get(w, unk) for w in words] + [end] for words in tokenised]

    return Didx, w2idx, idx2w
    

In [None]:
# Integer-encode both sides; the second argument caps the number of
# non-reserved words kept in each vocabulary.
en_idxs, en_w2idx, en_idx2w = to_ints(D=en, max_vocab_size=100000)
de_idxs, de_w2idx, de_idx2w = to_ints(D=de, max_vocab_size=50000)

# Longest encoded sentence on each side (lengths include the <START>/<END> ids).
en_maxlen = max(len(seq) for seq in en_idxs)
de_maxlen = max(len(seq) for seq in de_idxs)

# Vocabulary sizes, reserved tokens included.
en_vocab_size, de_vocab_size = len(en_w2idx), len(de_w2idx)

In [None]:
# Unpadded length of every encoded sentence (source side, then target side).
inp_lengths = list(map(len, en_idxs))
tar_lengths = list(map(len, de_idxs))

In [None]:
# Right-pad every encoded sentence to the longest length on its own side.
inp, tar = (
    tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=limit, padding="post")
    for seqs, limit in ((en_idxs, en_maxlen), (de_idxs, de_maxlen))
)

In [None]:
# Model / optimiser hyperparameters.
embedding_dim = 32
units = 64
learning_rate = 0.001
BATCH_SIZE = 64

# Number of full batches per epoch, and a shuffle buffer as large as the data set.
N_BATCH = len(inp) // BATCH_SIZE
BUFFER_SIZE = len(inp)

# Each element is (source ids, source length, target length, target ids).
# drop_remainder=True makes every batch exactly BATCH_SIZE rows, which the
# decoder's fixed-batch reshape and the fixed-size initial state rely on.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((inp, inp_lengths, tar_lengths, tar))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)



In [None]:
def extract_axis_1(data, ind):
    """
    Pick one entry along axis 1 for every position along axis 0.

    For each b this returns data[b, ind[b]]; any axes after the second are
    kept as they are.

    :param data: Tensorflow tensor that will be subsetted (rank >= 2).
    :param ind: Integer indices into axis 1, one for each element along axis 0
                of data. Any integer dtype is accepted.
    :return: Subsetted tensor.
    """

    batch_range = tf.range(tf.shape(data)[0])

    # tf.stack needs both columns to share a dtype; align `ind` with the
    # dtype tf.range produced so int64 indices do not have to be converted by
    # the caller.
    ind = tf.cast(ind, batch_range.dtype)

    # Row b of `indices` is the coordinate pair (b, ind[b]).
    indices = tf.stack([batch_range, ind], axis=1)

    return tf.gather_nd(data, indices)

In [None]:
class Encoder(tf.keras.Model):
    """
    Embedding + GRU encoder that summarises each input sequence as the GRU
    output at that sequence's last real (unpadded) time step.
    """

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super().__init__()

        self.units = units
        self.batch_size = batch_size

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # All per-step outputs are requested (and no separate final state) so
        # that call() can read the output at each sequence's own length.
        self.gru = tf.keras.layers.CuDNNGRU(units,
                                            recurrent_initializer='glorot_uniform',
                                            return_sequences=True,
                                            return_state=False)

    def call(self, x, hidden, sequence_lengths):
        """
        :param x: Batch of word-id sequences.
        :param hidden: Initial GRU state.
        :param sequence_lengths: Unpadded length of each sequence in the batch.
        :return: GRU output at index sequence_lengths - 1 for every sequence.
        """
        embedded = self.embedding(x)
        per_step_outputs = self.gru(embedded, initial_state=hidden)

        # Index of the last real token of every sequence.
        last_step = sequence_lengths - 1
        return extract_axis_1(per_step_outputs, last_step)

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_size, units)."""
        state_shape = (self.batch_size, self.units)
        return tf.zeros(state_shape)


In [None]:
class Decoder(tf.keras.Model):
    """
    Embedding + GRU decoder that emits vocabulary logits for every time step.
    """

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Decoder, self).__init__()
        
        self.batch_size = batch_size
        self.units = units
        self.vocab_size = vocab_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.CuDNNGRU(units,
                                        return_sequences = True,
                                        return_state = True,
                                        recurrent_initializer = 'glorot_uniform')
        
        # Projects every GRU output onto the target vocabulary.
        self.logits = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x, hidden):
        """
        :param x: Batch of word-id sequences fed to the decoder.
        :param hidden: Initial GRU state.
        :return: (logits, state) - logits laid out as (batch, time, vocab_size)
                 and the final GRU state.
        """
        x = self.embedding(x)
        
        outputs, state = self.gru(x, initial_state=hidden)
        
        outputs = self.logits(outputs)

        # Keep the (batch, time, vocab) layout, but take the batch dimension
        # from the data rather than from the construction-time batch_size.
        # Pinning it to self.batch_size fails for other batch sizes, or
        # silently folds rows into the time axis for multiples of it.
        outputs = tf.reshape(outputs, [tf.shape(outputs)[0], -1, self.vocab_size])
        
        return outputs, state
    
    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_size, units)."""
        return tf.zeros((self.batch_size, self.units))    

In [None]:
# Encoder reads the `en` vocabulary, decoder writes the `de` vocabulary; both
# share the embedding size, GRU width and batch size.
encoder = Encoder(vocab_size=en_vocab_size, embedding_dim=embedding_dim,
                  units=units, batch_size=BATCH_SIZE)
decoder = Decoder(vocab_size=de_vocab_size, embedding_dim=embedding_dim,
                  units=units, batch_size=BATCH_SIZE)

In [None]:
# Training

optimizer = tf.train.AdamOptimizer(learning_rate)

epochs = 10

for epoch in range(epochs):
    
    start = time.time()
    epoch_loss = 0
    
    hidden_init = encoder.initialize_hidden_state()
    
    # The batch_* names are deliberate: reusing `inp` / `inp_lengths` as loop
    # variables would overwrite the notebook-level arrays the dataset cell is
    # built from, breaking any re-run of that cell after training.
    for (batch, (batch_inp, batch_inp_lengths, batch_tar_lengths, batch_tar)) in enumerate(dataset):
        
        # Trim both sides to the longest sequence in this batch so no time is
        # spent on columns that are padding for every row.
        max_inp_length = np.max(batch_inp_lengths)
        batch_inp = batch_inp[:, :max_inp_length]

        max_tar_length = np.max(batch_tar_lengths)
        batch_tar = batch_tar[:, :max_tar_length]
        
        with tf.GradientTape() as tape:
            
            enc_hidden = encoder(batch_inp, hidden_init, batch_inp_lengths)
            
            # Teacher forcing: the decoder sees the target shifted right and
            # has to predict the token that follows each position.
            dec_inputs = batch_tar[:, :-1]
            dec_targets = batch_tar[:, 1:]
            
            logits, dec_hidden = decoder(dec_inputs, hidden = enc_hidden)
            
            loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=dec_targets, logits=logits)

            # Id 0 is <PAD>; padded positions must not contribute to the loss.
            mask = tf.cast(tf.not_equal(dec_targets, 0), loss_.dtype)

            # Average over real tokens only. A plain mean over all positions
            # would make the loss scale depend on how much padding a batch has.
            # Every target row contains <END>, so the denominator is never 0.
            batch_loss = tf.reduce_sum(loss_ * mask) / tf.reduce_sum(mask)
        
        # Collected after the forward pass because Keras creates the layer
        # variables lazily on the first call.
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(batch_loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        epoch_loss += batch_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.8f}'.format(epoch+1, batch, batch_loss.numpy()))
            
    print('Epoch loss {:.8f}'.format(epoch_loss.numpy()))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    
    # Qualitative check on the last batch of the epoch: show the first two
    # source sentences next to the decoder's teacher-forced predictions.
    print("TEST")
    preds = tf.argmax(logits, axis = 2)
    for i in range(2):
        print(" ".join(en_idx2w[x.numpy()] for x in batch_inp[i, : batch_inp_lengths[i]]), "   =>   ", end="")
        
        print("<START> " + " ".join(de_idx2w[x.numpy()] for x in preds[i]))
        
    print("\n\n")


In [None]:
# Peek at a bounded sample of the source vocabulary instead of dumping every key.
list(en_w2idx)[:20]

In [None]:
# Greedy decoding of a single sentence. The sentence is replicated BATCH_SIZE
# times because the encoder/decoder are driven with fixed-size batches here;
# only row 0 of every prediction is read.
s = "<START> May I go with Tom ? <END>"
tokens = s.split(" ")

print("Sentence: ", s)
print("Unks: ", [w for w in tokens if w not in en_w2idx])
print() 

# Upper bound on the number of decoding steps.
max_target_length = len(tokens) * 2

# int32 to match the length tensors the encoder receives during training.
inp_length = np.full(BATCH_SIZE, len(tokens), dtype=np.int32)

# All rows are the same sentence, so no padding is needed. A hard-coded
# pad/truncate length would cut longer sentences while `inp_length` still
# pointed past the end of the truncated input.
token_ids = [en_w2idx.get(w, en_w2idx["<UNK>"]) for w in tokens]
enc_input = tf.convert_to_tensor(np.tile(np.asarray(token_ids, dtype=np.int32), (BATCH_SIZE, 1)))

init_hidden = tf.zeros((BATCH_SIZE, units))

enc_hidden = encoder(enc_input, init_hidden, inp_length)

dec_hidden = enc_hidden

dec_input = tf.expand_dims([de_w2idx["<START>"]]*BATCH_SIZE, 1)

result = "<START>"
for t in range(max_target_length):
    
    # One decoding step: feed the previous token, carry the GRU state forward.
    preds, dec_hidden = decoder(dec_input, dec_hidden)
    
    preds = tf.reshape(preds, [BATCH_SIZE, -1])

    # Most likely next word for row 0 (every row holds the same sentence).
    pred = tf.argmax(preds, axis = 1)[0].numpy()
    
    # Chosen id and the probability the model assigns to it.
    print(pred, tf.nn.softmax(preds)[0, pred].numpy())
    
    result += " " + de_idx2w[pred]
    
    if pred == de_w2idx["<END>"]: break
    
    dec_input = tf.expand_dims([pred]*BATCH_SIZE,1)
    

print("\nResult:", result)