In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [557]:
import os
files = []
for i in os.listdir('data/'):
    if i.endswith('.txt'):
        files.append(i)



In [561]:
files[-1]

'port.txt'

In [574]:
df = pd.DataFrame(columns=['dialogue'])
for i in files[-1:]:
    print(i)
    df_loop = pd.read_table('data/'+i, sep="\n", header=None)
    df_loop = df_loop.rename(columns={0:'dialogue'})
    df = df.append(df_loop)


port.txt


In [575]:
df = df.reset_index(drop=True)

In [611]:
df = pd.read_table('data/ai.txt', sep="\n", header=None)

In [612]:
df = df.rename(columns={0:'dialogue'})

In [613]:
import re

def delete_blank(x):
    if x=='':
        return None
    else:
        return x
df.dialogue = df.dialogue.map(delete_blank)

In [614]:
df = df.dropna().reset_index(drop=True)

In [615]:
def delete_parentheses(x):
    return re.sub(r'\(.*\)|\[.*\]', '', x)


In [616]:
df.dialogue = df.dialogue.map(lambda x: delete_parentheses(x))
df.dialogue = df.dialogue.map(delete_blank)
df = df.dropna().reset_index(drop=True)

In [617]:
def separate_punctuation(x):
#     x = re.sub(r'\.',' . ', x)
#     x = re.sub(r'\,',' , ', x)
#     x = re.sub(r'\!',' ! ', x)
#     x = re.sub(r'\?',' ? ', x)
#     x = re.sub(r'\:',' : ', x) 
    x = re.sub(r'\.',' ', x)
    x = re.sub(r'\,',' ', x)
    x = re.sub(r'\!',' ', x)
    x = re.sub(r'\?',' ', x)
#     x = re.sub(r'\:',' ', x) 
    return x

def delete_large_spaces(x):
    return re.sub(r'\s{2,}', ' ', x)

In [618]:
df.dialogue = df.dialogue.map(lambda x: separate_punctuation(x))

In [619]:
df.dialogue = df.dialogue.str.lower()

In [620]:
def change_words(x):
    x = re.sub("let's", 'let us', x)
    x = re.sub("c'mon", 'come on', x)
    x = re.sub("there's", 'there is', x)
    x = re.sub("you're", 'you are', x)
    x = re.sub("we're", 'we are', x)
    x = re.sub("i'm", 'i am', x)
    x = re.sub("y'", 'you', x)
    x = re.sub("how'd", 'how did', x)
    x = re.sub("\'ll", ' will', x)
    x = re.sub("\'t", ' not', x)
    x = re.sub("\'s", '  is', x)
    x = re.sub("\'re", '  are', x)
    x = re.sub("\'", ' ', x)
    x = re.sub('\"', ' ', x)
    x = re.sub('-', ' ', x)
    
    return x

In [621]:
df.dialogue = df.dialogue.map(change_words)

In [622]:
def delete_no_dialogue(x):
    if ':' in x:
        return x
    

In [623]:
df.head()

Unnamed: 0,dialogue
0,what is ai
1,artificial intelligence is the branch of e...
2,what is ai
3,ai is the field of science which concerns ...
4,are you sentient


In [624]:
# df.dialogue = df.dialogue.map(delete_no_dialogue)


In [625]:
df = df.dropna()

In [626]:
def delete_names(x):
    x = re.sub(r'[a-z]{2,}:','', x)
    return x

def delete_semicol(x):
    x = re.sub(r'\:','', x)
    return x

In [627]:
df.dialogue = df.dialogue.map(delete_names)
df.dialogue = df.dialogue.map(delete_large_spaces)


In [628]:
df = df.dropna().reset_index(drop=True)
df.dialogue = df.dialogue.map(delete_semicol)

# Open glove and tokenize-pad

In [160]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print( "Done.",len(model)," words loaded!")
    return model

In [161]:
model_glove = loadGloveModel('glove/glove.6B.100d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [24]:
glove_df = pd.DataFrame(model_glove).T

unk_pad_df = pd.DataFrame(columns=glove_df.columns)
unk_pad_df.loc['<PAD>'] = np.zeros(glove_df.shape[1])
unk_pad_df.loc['<UNK>'] = glove_df.mean()

glove_unk_df = pd.concat([unk_pad_df,glove_df])

#### for memory reasons, use just the vocabulary from friends

In [630]:
top_words = 5000

script_string = list(df.as_matrix().flatten())
most_common_words = pd.Series(''.join(script_string).split()).value_counts().head(top_words)
vocab_friends = pd.Series(most_common_words.index)
vocab_friends = pd.Series(['<PAD>', '<UNK>']).append(vocab_friends)


In [631]:
glove_unk_friends_df = glove_unk_df[glove_unk_df.index.isin(vocab_friends)]
voc_df = pd.DataFrame(glove_unk_friends_df.index, columns=['voc']).reset_index()
voc_df.voc = voc_df.voc.str.lower()
voc_df = voc_df.set_index('voc')
voc_dic = voc_df.to_dict()['index']

In [632]:
class_weight = pd.Series(''.join(script_string).split()).value_counts().head(top_words)
class_weight.index = class_weight.index.map(voc_dic)
class_weight_dic = class_weight.to_dict()

### Tokenize and pad

In [661]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class TokenizerCustom(Tokenizer):
    def __init__(self, voc, max_len=12, *args, **kwargs):
        super(TokenizerCustom, self).__init__(*args, **kwargs)
        self.max_len = max_len
        self.word_index = voc
        self.oov_token = '<unk>'
        self.filters = '#$%&()*+-/<=>@[\]^_`{|}~.,'
    def pad_string(self, x):
        return pad_sequences(x, maxlen=self.max_len)
    
    def tokenize_string(self, x):
        tok_str = self.texts_to_sequences(pd.Series(x).values)
        return self.pad_string(tok_str)[0]

        

In [662]:
# test unk
tc = TokenizerCustom(voc=voc_dic, oov_token=voc_dic['<unk>'])
tc.tokenize_string("monica odpsk")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int32)

##### tokenize data

In [635]:
df_tok = df.copy()
df_tok.dialogue = df_tok.dialogue.map(lambda x: tc.tokenize_string(x))


In [636]:
df_tok_long = pd.DataFrame([df_tok.dialogue[:len(df_tok)-1], df_tok.shift(-1).dialogue[:len(df_tok)-1]]).T
df_tok_long.columns = ['dialogue','answer']

In [637]:
df_tok_long = df_tok_long[df_tok_long.dialogue.map(lambda x: 0 not in x)]
df_tok_long = df_tok_long[df_tok_long.answer.map(lambda x: 0 not in x)]

In [638]:
[0,2,0]

[0, 2, 0]

# Train model

In [668]:
# X = df_tok.dialogue[:len(df_tok)-1]
X = df.dialogue.iloc[0::2]
X = X.map(lambda x: tc.tokenize_string(x))

X = np.array(X.tolist())

In [669]:
y = df.dialogue.iloc[1::2]
y = y.map(lambda x: tc.tokenize_string(x))
# y = df_tok.shift(-1).dialogue[:len(df_tok)-1]
# y = np.array(y.tolist())


In [672]:
max_seq_len = len(X[0])
emb_dim = glove_unk_friends_df.shape[1]
vocab_dim = glove_unk_friends_df.shape[0]

In [673]:
from keras.utils import to_categorical

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist, dtype=np.uint8)
    print(y.shape)
#     y = y.reshape(y.shape[0], vocab_size, y.shape[1])
    print(y.shape)
    return y

y_enc = encode_output(y, vocab_dim)

(105, 12, 395)
(105, 12, 395)


In [674]:
y_enc_copy = y_enc

In [675]:
# y_enc = y_enc_copy.reshape(y_enc_copy.shape[0], vocab_dim, y_enc_copy.shape[1])

In [684]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, LSTM, Dropout, RepeatVector, Flatten, Activation, Permute, Reshape
from keras.layers.wrappers import TimeDistributed

from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, SGD
from keras import regularizers



inp = Input(shape=(max_seq_len,))
emb = Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
                input_length=max_seq_len, trainable=False, mask_zero=False)(inp)


lstm_in = LSTM(500, dropout=0.0)(emb)
rep_vec = RepeatVector(max_seq_len)(lstm_in)
rep_vec = TimeDistributed(Dense(vocab_dim, activation='softmax'))(rep_vec)
lstm_out = LSTM(500, dropout=0.0, return_sequences=True)(rep_vec)
out = TimeDistributed(Dense(vocab_dim, activation='softmax'))(lstm_out)


# out = AttentionDecoder(150, vocab_dim)(lstm_in)


In [685]:
# from keras.layers import multiply

# lstm_in = LSTM(500, dropout=0.0)(emb)
# rep_vec = RepeatVector(max_seq_len)(lstm_in)
# rep_vec = TimeDistributed(Dense(vocab_dim, activation='softmax'))(rep_vec)

# lstm_in_2 = LSTM(500, dropout=0.0)(emb)
# rep_vec_2 = RepeatVector(max_seq_len)(lstm_in)
# rep_vec_2 = TimeDistributed(Dense(vocab_dim, activation='relu'))(rep_vec_2)


# rep_vec = multiply([rep_vec, rep_vec_2])
# lstm_out = LSTM(500, dropout=0.0, return_sequences=True)(rep_vec)
# out = TimeDistributed(Dense(vocab_dim, activation='softmax'))(lstm_out)

In [704]:
from keras.models import Sequential

model = Sequential()
model.add( Embedding(vocab_dim, emb_dim, weights=[glove_unk_friends_df], 
                input_length=max_seq_len, trainable=False, mask_zero=False))
model.add(LSTM(200, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
if True:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocab_dim)))
model.add(Activation('softmax'))

In [705]:
from keras.callbacks import ReduceLROnPlateau

earlystop = EarlyStopping(monitor='acc', min_delta=0.001, patience=5)
reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.2,
                              patience=5, min_lr=0.00001, verbose=1)
model = Model(inputs=inp, outputs=out)
# model = Model(inputs=inp, outputs=out)
# model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0001), metrics=['accuracy'])
# model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.00001), metrics=['accuracy'], sample_weight_mode='temporal')
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01), metrics=['accuracy'], sample_weight_mode='temporal')


In [706]:
# from keras.utils.vis_utils import plot_model

# plot_model(model, show_shapes=True, show_layer_names=True)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_38 (InputLayer)        (None, 12)                0         
_________________________________________________________________
embedding_38 (Embedding)     (None, 12, 100)           39500     
_________________________________________________________________
lstm_115 (LSTM)              (None, 500)               1202000   
_________________________________________________________________
repeat_vector_58 (RepeatVect (None, 12, 500)           0         
_________________________________________________________________
time_distributed_97 (TimeDis (None, 12, 395)           197895    
_________________________________________________________________
lstm_116 (LSTM)              (None, 12, 500)           1792000   
_________________________________________________________________
time_distributed_98 (TimeDis (None, 12, 395)           197895    
Total para

In [707]:
# model.fit(X, np.array(y.tolist()), epochs = 10, callbacks=[earlystop], batch_size=10)
model.fit(X, y_enc, epochs = 10, callbacks=[earlystop, reduce_lr])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


KeyboardInterrupt: 

In [702]:
voc_dic_inv = {voc_dic[x]:x for x in voc_dic}

result = model.predict(X[100].reshape(1, X.shape[1]))[0]
result_df = pd.DataFrame(result).idxmax(axis=1, skipna=True)
result_df.map(voc_dic_inv)

0     <pad>
1     <pad>
2     <pad>
3     <pad>
4     <pad>
5     <pad>
6     <pad>
7     <pad>
8     <pad>
9     <pad>
10    <pad>
11    <pad>
dtype: object

In [510]:
pd.Series(X[100]).map(voc_dic_inv)

0     grab
1        a
2    spoon
dtype: object

In [193]:
glove_unk_friends_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
<PAD>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<UNK>,0.052098,-0.097114,-0.138076,0.110753,-0.027228,-0.003264,0.031764,-0.050769,0.153216,-0.023674,...,0.016906,0.075768,0.075961,-0.108,0.208303,-0.078413,0.086636,0.123813,-0.234345,-0.009255
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062
of,-0.1529,-0.24279,0.89837,0.16996,0.53516,0.48784,-0.58826,-0.17982,-1.3581,0.42541,...,0.18712,-0.018488,-0.26757,0.727,-0.59363,-0.34839,-0.56094,-0.591,1.0039,0.20664
to,-0.1897,0.050024,0.19084,-0.049184,-0.089737,0.21006,-0.54952,0.098377,-0.20135,0.34241,...,-0.13134,0.058617,-0.31869,-0.61419,-0.62393,-0.41548,-0.038175,-0.39804,0.47647,-0.15983


In [310]:
import tensorflow as tf
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec


import keras.backend as K


def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.
    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: wether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.
    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)

    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x



tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d)

class AttentionDecoder(Recurrent):

    def __init__(self, units, output_dim,
                 activation='tanh',
                 return_probabilities=False,
                 name='AttentionDecoder',
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """
        Implements an AttentionDecoder that takes in a sequence encoded by an
        encoder and outputs the decoded states
        :param units: dimension of the hidden state and the attention matrices
        :param output_dim: the number of labels in the output space

        references:
            Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
            "Neural machine translation by jointly learning to align and translate."
            arXiv preprint arXiv:1409.0473 (2014).
        """
        self.units = units
        self.output_dim = output_dim
        self.return_probabilities = return_probabilities
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        super(AttentionDecoder, self).__init__(**kwargs)
        self.name = name
        self.return_sequences = True  # must return sequences

    def build(self, input_shape):
        """
          See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
          for model details that correspond to the matrices here.
        """

        self.batch_size, self.timesteps, self.input_dim = input_shape

        if self.stateful:
            super(AttentionDecoder, self).reset_states()

        self.states = [None, None]  # y, s

        """
            Matrices for creating the context vector
        """

        self.V_a = self.add_weight(shape=(self.units,),
                                   name='V_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.W_a = self.add_weight(shape=(self.units, self.units),
                                   name='W_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.U_a = self.add_weight(shape=(self.input_dim, self.units),
                                   name='U_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.b_a = self.add_weight(shape=(self.units,),
                                   name='b_a',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the r (reset) gate
        """
        self.C_r = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_r = self.add_weight(shape=(self.units, self.units),
                                   name='U_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_r = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_r = self.add_weight(shape=(self.units, ),
                                   name='b_r',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        """
            Matrices for the z (update) gate
        """
        self.C_z = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_z = self.add_weight(shape=(self.units, self.units),
                                   name='U_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_z = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_z = self.add_weight(shape=(self.units, ),
                                   name='b_z',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the proposal
        """
        self.C_p = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_p = self.add_weight(shape=(self.units, self.units),
                                   name='U_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_p = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_p = self.add_weight(shape=(self.units, ),
                                   name='b_p',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for making the final prediction vector
        """
        self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
                                   name='C_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_o = self.add_weight(shape=(self.units, self.output_dim),
                                   name='U_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
                                   name='W_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_o = self.add_weight(shape=(self.output_dim, ),
                                   name='b_o',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        # For creating the initial state:
        self.W_s = self.add_weight(shape=(self.input_dim, self.units),
                                   name='W_s',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)

        self.input_spec = [
            InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
        self.built = True

    def call(self, x):
        # store the whole sequence so we can "attend" to it at each timestep
        self.x_seq = x

        # apply the a dense layer over the time dimension of the sequence
        # do it here because it doesn't depend on any previous steps
        # thefore we can save computation time:
        self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                             input_dim=self.input_dim,
                                             timesteps=self.timesteps,
                                             output_dim=self.units)

        return super(AttentionDecoder, self).call(x)

    def get_initial_state(self, inputs):
        # apply the matrix on the first time step to get the initial s0.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

        # from keras.layers.recurrent to initialize a vector of (batchsize,
        # output_dim)
        y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
        y0 = K.sum(y0, axis=(1, 2))  # (samples, )
        y0 = K.expand_dims(y0)  # (samples, 1)
        y0 = K.tile(y0, [1, self.output_dim])

        return [y0, s0]

    def step(self, x, states):

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiplty the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)

        # new hidden state:
        st = (1-zt)*stm + zt * s_tp

        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]

    def compute_output_shape(self, input_shape):
        """
            For Keras internal compatability checking
        """
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)

    def get_config(self):
        """
            For rebuilding models on load time.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))