In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Open glove and tokenize-pad

In [2]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print( "Done.",len(model)," words loaded!")
    return model

In [3]:
model_glove = loadGloveModel('../glove/glove.6B.100d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [4]:
glove_df = pd.DataFrame(model_glove).T

unk_pad_df = pd.DataFrame(columns=glove_df.columns)
unk_pad_df.loc['<PAD>'] = np.zeros(glove_df.shape[1])
unk_pad_df.loc['<UNK>'] = glove_df.mean()

glove_unk_df = pd.concat([unk_pad_df,glove_df])

In [5]:
import os
files = []
for i in os.listdir('data/'):
    if i.endswith('.txt'):
        files.append(i)



In [6]:
df = pd.DataFrame(columns=['dialogue'])
for i in files:
    df_loop = pd.read_table('data/'+i, sep="\n", header=None)
    df_loop = df_loop.rename(columns={0:'dialogue'})
    df = df.append(df_loop)


In [7]:
df = df.reset_index(drop=True)

In [8]:
import re

def delete_blank(x):
    if x=='':
        return None
    else:
        return x
df.dialogue = df.dialogue.map(delete_blank)

In [9]:
df = df.dropna().reset_index(drop=True)

In [10]:
def delete_parentheses(x):
    return re.sub(r'\(.*\)|\[.*\]', '', x)


In [11]:
df.dialogue = df.dialogue.map(lambda x: delete_parentheses(x))
df.dialogue = df.dialogue.map(delete_blank)
df = df.dropna().reset_index(drop=True)

In [12]:
def separate_punctuation(x):
#     x = re.sub(r'\.',' . ', x)
#     x = re.sub(r'\,',' , ', x)
#     x = re.sub(r'\!',' ! ', x)
#     x = re.sub(r'\?',' ? ', x)
#     x = re.sub(r'\:',' : ', x) 
    x = re.sub(r'\.',' ', x)
    x = re.sub(r'\,',' ', x)
    x = re.sub(r'\!',' ', x)
    x = re.sub(r'\?',' ', x)
#     x = re.sub(r'\:',' ', x) 
    return x

def delete_large_spaces(x):
    return re.sub(r'\s{2,}', ' ', x)

In [13]:
df.dialogue = df.dialogue.map(lambda x: separate_punctuation(x))

In [14]:
df.dialogue = df.dialogue.str.lower()

In [15]:
def change_words(x):
    x = re.sub("let's", 'let us', x)
    x = re.sub("let’s", 'let us', x)
    x = re.sub("c'mon", 'come on', x)
    x = re.sub("c’mon", 'come on', x)
    x = re.sub("there’s", 'there is', x)
    x = re.sub("there's", 'there is', x)
    x = re.sub("you're", 'you are', x)
    x = re.sub("you’re", 'you are', x)
    x = re.sub("we're", 'we are', x)
    x = re.sub("we’re", 'we are', x)
    x = re.sub("i'm", 'i am', x)
    x = re.sub("i’m", 'i am', x)
    x = re.sub("y'", 'you', x)
    x = re.sub("y’", 'you', x)
    x = re.sub("how'd", 'how did', x)
    x = re.sub("how’d", 'how did', x)
    x = re.sub("\'ll", ' will', x)
    x = re.sub("\’t", ' not', x)
    x = re.sub("\'t", ' not', x)
    x = re.sub("\'s", '  is', x)
    x = re.sub("\’s", '  is', x)
    x = re.sub("\'re", '  are', x)
    x = re.sub("\’re", '  are', x)
    x = re.sub("\'", ' ', x)
    x = re.sub('\"', ' ', x)
    x = re.sub('-', ' ', x)
    x = re.sub('pheebs', 'phoebe', x)
    x = re.sub('wasn', 'was not', x)
    x = re.sub('noo', 'no', x)
    x = re.sub("didn", 'did', x)
    
    
    return x

In [16]:
df.dialogue = df.dialogue.map(change_words)

In [17]:
def delete_no_dialogue(x):
    if ':' in x:
        return x
    

In [18]:
df = df.head(1000)

In [19]:
df.dialogue = df.dialogue.map(delete_no_dialogue)


In [20]:
df = df.dropna()

In [21]:
def delete_names(x):
    x = re.sub(r'[a-z]{2,}:','', x)
    return x

def delete_semicol(x):
    x = re.sub(r'\:','', x)
    return x

In [22]:
df.dialogue = df.dialogue.map(delete_names)
df.dialogue = df.dialogue.map(delete_large_spaces)


In [23]:
df = df.dropna().reset_index(drop=True)
df.dialogue = df.dialogue.map(delete_semicol)

In [24]:
# get only same kind of length conversations

max_len = 10
df['answer'] = df.dialogue.shift(-1)
df['counts_dialogue'] = df.dialogue.map(lambda x: len(x.split()))
df = df.drop(len(df)-1)
df['counts_answer'] = df.answer.map(lambda x: len(x.split()))
# df = df[(df.counts_dialogue<max_len+1)&(df.counts_answer<max_len+1)]
df = df.reset_index(drop=True)

#### for memory reasons, use just the vocabulary from friends

In [25]:
top_words = 5000

script_string = list(df.answer.as_matrix().flatten())
most_common_words = pd.Series(''.join(script_string).split()).value_counts().head(top_words)
vocab_friends = pd.Series(most_common_words.index)
vocab_friends = pd.Series(['<PAD>', '<UNK>']).append(vocab_friends)


In [26]:
glove_unk_friends_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]
voc_df = pd.DataFrame(glove_unk_friends_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')
voc_dic = voc_df.to_dict()['index']

### Tokenize and pad

In [27]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class TokenizerCustom(Tokenizer):
    def __init__(self, voc, max_len=max_len, *args, **kwargs):
        super(TokenizerCustom, self).__init__(*args, **kwargs)
        self.max_len = max_len
        self.word_index = voc
        self.oov_token = '<unk>'
        self.filters = '#$%&()*+-/<=>@[\]^_`{|}~.,'
    def pad_string(self, x):
        return pad_sequences(x, maxlen=self.max_len)
    
    def tokenize_string(self, x):
        tok_str = self.texts_to_sequences(pd.Series(x).values)
        return self.pad_string(tok_str)[0]

        

Using TensorFlow backend.


In [28]:
# test unk
tc = TokenizerCustom(voc=voc_dic, oov_token=voc_dic['<unk>'], max_len=max_len)
tc.tokenize_string(df.answer.loc[9])

array([   0,    0,    0, 1457,   90,    1,    1, 1033,  623,   65],
      dtype=int32)

##### tokenize data

In [29]:
voc_dic_inv = {voc_dic[x]:x for x in voc_dic}

X = df.dialogue.map(lambda x: tc.tokenize_string(x))
X = np.array(X.tolist())
y = df.answer.map(lambda x: tc.tokenize_string(x))
y = np.array(y.tolist())

In [30]:
pd.Series(y[0]).map(voc_dic_inv)

0       you
1      guys
2       can
3        we
4    please
5       not
6     watch
7      this
8       all
9     right
dtype: object

In [31]:
def remap_words_overall(y, times, myid, voc_dic_inv):
    len_shape = len(voc_dic_inv)
    new_index = len_shape
    i = 0
    row, col = np.where(y==myid)
    for pos in range(times+1):
        row_loop, col_loop = row[i:i+int(len(row)/times)], col[i:i+int(len(row)/times)]
        y[row_loop, col_loop] = new_index
        new_index += 1
        i += int(len(row)/times)
        voc_dic_inv[new_index] = voc_dic_inv[myid]

    return y, voc_dic_inv



num_col = y.shape[1]
voc_dic_inv_copy = voc_dic_inv.copy()
counts_ser = pd.Series(y.flatten()).value_counts()
data_words = pd.Series(y.flatten()).value_counts().index
min_count = pd.Series(y.flatten()).value_counts().iloc[-1]

for i in range(len(data_words)):
    word = data_words[i]
    y, voc_dic_inv = remap_words_overall(y, int(counts_ser.loc[word]/min_count), word, voc_dic_inv)


In [32]:
# def remap_words(y, times, myid, voc_dic_inv):
#     len_shape = len(voc_dic_inv)
#     new_index = len_shape
#     i = 0
#     row = np.where(y==myid)[0]
#     for pos in range(times):
#         row_loop = row[i:i+int(len(row)/times)]
#         y[row_loop] = new_index
#         i += int(len(row)/times)
#         voc_dic_inv[new_index] = voc_dic_inv[myid]
#         new_index += 1
#     return y, voc_dic_inv


# num_col = y.shape[1]
# dic_all_dic = {}
# for j in range(num_col):
#     voc_dic_inv_copy = voc_dic_inv.copy()
#     counts_ser = pd.Series(y[:,j]).value_counts()
#     data_words = pd.Series(y[:,j]).value_counts().index
#     min_count = pd.Series(y[:,j]).value_counts().iloc[-1]
#     print(min_count)
#     for i in range(len(data_words)):
#         word = data_words[i]
#         y[:,j], voc_dic_inv_copy = remap_words(y[:,j], int(counts_ser.loc[word]/min_count), word, voc_dic_inv_copy)
#     dic_all_dic[j] = voc_dic_inv_copy

In [33]:
# for i in range(len(y[0])):
#     print(dic_all_dic[i][y[0][i]])


# Train model

In [34]:
max_seq_len = len(X[0])
emb_dim = glove_unk_friends_df.shape[1]
vocab_dim = glove_unk_friends_df.shape[0]
# vocab_out_dim = len(voc_dic_inv)
vocab_out_dim = y.flatten().max()+1

In [35]:
from keras.utils import to_categorical
y

array([[ 5014,  8629,  7334, ...,  6904,  7846,  7809],
       [ 1464,  1465,  1466, ...,  1471,  1472, 11938],
       [ 9272, 11812, 10860, ..., 10965,  6579, 10797],
       ...,
       [ 5002,  5003,  5004, ...,  9152,  8082,  9912],
       [ 5005,  5006,  5007, ...,  7469,  6835,  7470],
       [ 5012,  7425,  7332, ...,  9816,  9882,  7844]], dtype=int32)

In [36]:
from keras.utils import to_categorical

# # one hot encode target sequence
# def encode_output(sequences, vocab_size):
#     ylist = list()
#     for sequence in sequences:
#         encoded = to_categorical(sequence, num_classes=vocab_size)
#         ylist.append(encoded)
#     y = np.array(ylist, dtype=np.uint8)
# #     y = y.reshape(y.shape[0], vocab_size, y.shape[1])

#     return y

# y_enc = encode_output(y, vocab_out_dim)


y_enc = to_categorical(y)

In [37]:
# y_enc = y_enc_copy.reshape(y_enc_copy.shape[0], vocab_dim, y_enc_copy.shape[1])

In [38]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, LSTM, Dropout, RepeatVector, Flatten, Activation, Permute, Reshape
from keras.layers.wrappers import TimeDistributed

from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, SGD
from keras import regularizers



# inp = Input(shape=(max_seq_len,))
# emb = Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
#                 input_length=max_seq_len, trainable=False, mask_zero=True)(inp)


# lstm_in = LSTM(500, dropout=0.0)(emb)
# rep_vec = RepeatVector(max_seq_len)(lstm_in)
# rep_vec = TimeDistributed(Dense(vocab_dim, activation='softmax'))(rep_vec)
# lstm_out = LSTM(500, dropout=0.0, return_sequences=True)(rep_vec)
# out = TimeDistributed(Dense(vocab_dim, activation='softmax'))(lstm_out)


# out = AttentionDecoder(150, vocab_dim)(lstm_in)


In [39]:
from keras.layers import multiply, BatchNormalization

inp = Input(shape=(max_seq_len,))
emb = Embedding(vocab_dim, 100, weights=[glove_unk_friends_df], 
                input_length=max_seq_len, trainable=False, mask_zero=True)(inp)
emb = BatchNormalization()(emb)
lstm_in = LSTM(200, dropout=0, return_sequences=True)(emb)
lstm_in = LSTM(200, dropout=0)(lstm_in)

rep_vec = RepeatVector(max_seq_len)(lstm_in)
rep_vec = TimeDistributed(Dense(vocab_out_dim, activation='relu', W_regularizer=regularizers.l2(0)))(rep_vec)

# lstm_in_2 = LSTM(400, dropout=0)(emb)
# rep_vec_2 = RepeatVector(max_seq_len)(lstm_in)
# rep_vec_2 = TimeDistributed(Dense(vocab_out_dim, activation='relu', W_regularizer=regularizers.l2(0.01)))(rep_vec_2)

# rep_vec = multiply([rep_vec, rep_vec_2])

lstm_out = LSTM(200, dropout=0, return_sequences=True)(rep_vec)
# lstm_out = LSTM(200, dropout=0, return_sequences=True)(lstm_out)

out = TimeDistributed(Dense(vocab_out_dim, activation='softmax', W_regularizer=regularizers.l2(0)))(lstm_out)


In [40]:
# from keras.models import Sequential

# model = Sequential()
# model.add( Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
#                 input_length=max_seq_len, trainable=False, mask_zero=False))
# model.add(LSTM(200, return_sequences=True))
# model.add(LSTM(200, return_sequences=True))
# if True:
#     model.add(Dropout(0.5))
# model.add(TimeDistributed(Dense(vocab_dim)))
# model.add(Activation('softmax'))

In [41]:
from keras.callbacks import ReduceLROnPlateau

earlystop = EarlyStopping(monitor='acc', min_delta=0.001, patience=5)
reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.2,
                              patience=5, min_lr=0.00001, verbose=1)
model = Model(inputs=inp, outputs=out)
# model = Model(inputs=inp, outputs=out)
# model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0001), metrics=['accuracy'])
# model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.00001), metrics=['accuracy'], sample_weight_mode='temporal')
# model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'], sample_weight_mode='temporal')
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0005), metrics=['accuracy'])


In [42]:
# from keras.utils.vis_utils import plot_model

# plot_model(model, show_shapes=True, show_layer_names=True)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 100)           146400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 10, 100)           400       
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 200)           240800    
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               320800    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 10, 200)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 12483)         2509083   
__________

In [None]:
# model.fit(X, np.array(y.tolist()), epochs = 10, callbacks=[earlystop], batch_size=10)
model.fit(X, y_enc, epochs = 5000, callbacks=[reduce_lr])


Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000

In [None]:
# # encoder and decoder gloabal LSTM variables with 300 units
# LSTM_cell = LSTM(300, return_state = True)
# LSTM_decoder = LSTM(300, return_state = True, return_sequences = True)
# # final dense layer that uses TimeDistributed wrapper to generate 'vocab_size' softmax outputs for each time step in the decoder lstm
# dense = TimeDistributed(Dense(vocab_size, activation = 'softmax'))

# input_context = Input(shape = (maxLen, ), dtype = 'int32', name = 'input_context')
# input_target = Input(shape = (maxLen, ), dtype = 'int32', name = 'input_target')

# # pass the inputs into the embedding layer
# input_ctx_embed = embed_layer(input_context)
# input_tar_embed = embed_layer(input_target)

# # pass the embeddings into the corresponding LSTM layers
# encoder_lstm, context_h, context_c = LSTM_cell(input_ctx_embed)
# # the decoder lstm uses the final states from the encoder lstm as the initial state
# decoder_lstm, _, _ = LSTM_decoder(input_tar_embed, initial_state = [context_h, context_c],)0

In [None]:
a = np.array(pd.Series(['<pad>','what','my','name','is','ana','what','is','your','name']).map(voc_dic).fillna(1))

In [None]:
result = model.predict(a.reshape(1, X.shape[1]))[0]
result_df = pd.DataFrame(result).idxmax(axis=1, skipna=True)
print(np.array(result_df.map(voc_dic_inv)))

In [None]:
voc_dic_inv[419]

In [None]:

# for i in range(len(result)):
#     print(dic_all_dic[i][result_df.iloc[i]])

In [None]:
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector

# generate a sequence of random integers
def generate_sequence(length, n_unique):
	return [randint(0, n_unique-1) for _ in range(length)]

# one hot encode sequence
def one_hot_encode(sequence, n_unique):
	encoding = list()
	for value in sequence:
		vector = [0 for _ in range(n_unique)]
		vector[value] = 1
		encoding.append(vector)
	return array(encoding)

# decode a one hot encoded string
def one_hot_decode(encoded_seq):
	return [argmax(vector) for vector in encoded_seq]

# prepare data for the LSTM
def get_pair(n_in, n_out, cardinality):
	# generate random sequence
	sequence_in = generate_sequence(n_in, cardinality)
	sequence_out = sequence_in[:n_out] + [0 for _ in range(n_in-n_out)]
	# one hot encode
	X = one_hot_encode(sequence_in, cardinality)
	y = one_hot_encode(sequence_out, cardinality)
	# reshape as 3D
	X = X.reshape((1, X.shape[0], X.shape[1]))
	y = y.reshape((1, y.shape[0], y.shape[1]))
	return X,y

# configure problem
n_features = 50
n_timesteps_in = 5
n_timesteps_out = 2
# define model
# model = Sequential()
# model.add(LSTM(150, input_shape=(n_timesteps_in, n_features)))
# model.add(RepeatVector(n_timesteps_in))
# model.add(LSTM(150, return_sequences=True))
# model.add(TimeDistributed(Dense(n_features, activation='softmax')))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

inp = Input(shape=(n_timesteps_in, n_features))

lstm_in = LSTM(200, dropout=0.1)(inp)
rep_vec = RepeatVector(n_timesteps_in)(lstm_in)
rep_vec = TimeDistributed(Dense(vocab_dim, activation='softmax', W_regularizer=regularizers.l2(0.01)))(rep_vec)

lstm_in_2 = LSTM(200, dropout=0.1)(inp)
rep_vec_2 = RepeatVector(n_timesteps_in)(lstm_in)
rep_vec_2 = TimeDistributed(Dense(vocab_dim, activation='relu', W_regularizer=regularizers.l2(0.01)))(rep_vec_2)

rep_vec = multiply([rep_vec, rep_vec_2])
lstm_out = LSTM(200, dropout=0.1, return_sequences=True)(rep_vec)
out = TimeDistributed(Dense(n_features, activation='softmax', W_regularizer=regularizers.l2(0.01)))(lstm_out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# train LSTM
for epoch in range(5000):
	# generate new random sequence
	X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
	X = X.reshape(X.shape[2],X.shape[1])
	# fit model for one epoch on this sequence
	model.fit(X, y, epochs=1, verbose=2)
# evaluate LSTM
total, correct = 100, 0
for _ in range(total):
	X,y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
	yhat = model.predict(X, verbose=0)
	if array_equal(one_hot_decode(y[0]), one_hot_decode(yhat[0])):
		correct += 1
print('Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
# spot check some examples
for _ in range(10):
	X,y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
	yhat = model.predict(X, verbose=0)
	print('Expected:', one_hot_decode(y[0]), 'Predicted', one_hot_decode(yhat[0]))
    

In [None]:
for _ in range(3):
    X,y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
    yhat = model.predict(X, verbose=0)
    print(X)
    print(one_hot_decode(X[0]))
    print('Expected:', one_hot_decode(y[0]), 'Predicted', one_hot_decode(yhat[0]))

In [None]:
pd.Series(X[100]).map(voc_dic_inv)

In [None]:
glove_unk_friends_df.head()

In [None]:
import tensorflow as tf
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec


import keras.backend as K


def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.
    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: wether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.
    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)

    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x



tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d)

class AttentionDecoder(Recurrent):

    def __init__(self, units, output_dim,
                 activation='tanh',
                 return_probabilities=False,
                 name='AttentionDecoder',
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """
        Implements an AttentionDecoder that takes in a sequence encoded by an
        encoder and outputs the decoded states
        :param units: dimension of the hidden state and the attention matrices
        :param output_dim: the number of labels in the output space

        references:
            Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
            "Neural machine translation by jointly learning to align and translate."
            arXiv preprint arXiv:1409.0473 (2014).
        """
        self.units = units
        self.output_dim = output_dim
        self.return_probabilities = return_probabilities
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        super(AttentionDecoder, self).__init__(**kwargs)
        self.name = name
        self.return_sequences = True  # must return sequences

    def build(self, input_shape):
        """
          See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
          for model details that correspond to the matrices here.
        """

        self.batch_size, self.timesteps, self.input_dim = input_shape

        if self.stateful:
            super(AttentionDecoder, self).reset_states()

        self.states = [None, None]  # y, s

        """
            Matrices for creating the context vector
        """

        self.V_a = self.add_weight(shape=(self.units,),
                                   name='V_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.W_a = self.add_weight(shape=(self.units, self.units),
                                   name='W_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.U_a = self.add_weight(shape=(self.input_dim, self.units),
                                   name='U_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.b_a = self.add_weight(shape=(self.units,),
                                   name='b_a',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the r (reset) gate
        """
        self.C_r = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_r = self.add_weight(shape=(self.units, self.units),
                                   name='U_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_r = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_r = self.add_weight(shape=(self.units, ),
                                   name='b_r',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        """
            Matrices for the z (update) gate
        """
        self.C_z = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_z = self.add_weight(shape=(self.units, self.units),
                                   name='U_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_z = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_z = self.add_weight(shape=(self.units, ),
                                   name='b_z',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the proposal
        """
        self.C_p = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_p = self.add_weight(shape=(self.units, self.units),
                                   name='U_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_p = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_p = self.add_weight(shape=(self.units, ),
                                   name='b_p',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for making the final prediction vector
        """
        self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
                                   name='C_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_o = self.add_weight(shape=(self.units, self.output_dim),
                                   name='U_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
                                   name='W_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_o = self.add_weight(shape=(self.output_dim, ),
                                   name='b_o',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        # For creating the initial state:
        self.W_s = self.add_weight(shape=(self.input_dim, self.units),
                                   name='W_s',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)

        self.input_spec = [
            InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
        self.built = True

    def call(self, x):
        # store the whole sequence so we can "attend" to it at each timestep
        self.x_seq = x

        # apply the a dense layer over the time dimension of the sequence
        # do it here because it doesn't depend on any previous steps
        # thefore we can save computation time:
        self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                             input_dim=self.input_dim,
                                             timesteps=self.timesteps,
                                             output_dim=self.units)

        return super(AttentionDecoder, self).call(x)

    def get_initial_state(self, inputs):
        # apply the matrix on the first time step to get the initial s0.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

        # from keras.layers.recurrent to initialize a vector of (batchsize,
        # output_dim)
        y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
        y0 = K.sum(y0, axis=(1, 2))  # (samples, )
        y0 = K.expand_dims(y0)  # (samples, 1)
        y0 = K.tile(y0, [1, self.output_dim])

        return [y0, s0]

    def step(self, x, states):

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiplty the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)

        # new hidden state:
        st = (1-zt)*stm + zt * s_tp

        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]

    def compute_output_shape(self, input_shape):
        """
            For Keras internal compatability checking
        """
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)

    def get_config(self):
        """
            For rebuilding models on load time.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))