# Encoder-Transformer-Decoder
Encoder and decoder taken from 'e2e_enc_dec.py'
Transformer framework from 'TransformerEncDec-Unpolished.py'

In [1]:
#!usr/bin/python3
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras as keras
from keras.layers import Flatten, Dense, Reshape

import numpy as np
import pickle, random, string
import matplotlib.pyplot as plt
%matplotlib inline

# Visualization
from IPython.display import display



In [2]:
# Build a single-device distribution strategy pinned to the first CPU.
# NOTE(review): `strategy` is not referenced again in the visible notebook (no
# `strategy.scope()` is entered), so as written it does not appear to control
# where the models below are created — confirm.
strategy = tf.distribute.OneDeviceStrategy('cpu:0') # Chooses what to run on: CPU or GPU - Can use GPU:0, GPU:1, ...

# Prepare input

In [3]:
# Load the word corpora (train/test) and the role sequences (train/test).
# Each file is opened in a `with` block so its handle is closed as soon as
# np.loadtxt has consumed it; the handles were previously left open for the
# lifetime of the kernel.
with open('../data/len5_10000-train.txt') as corpus_file:
    corpus = np.loadtxt(corpus_file, dtype=object)

with open('../data/len5_10000-test.txt') as corpus_test_file:
    corpus_test = np.loadtxt(corpus_test_file, dtype=object)

with open('../data/SG-10-train.txt') as training_file:
    trainingSet = np.loadtxt(training_file, dtype=str)

with open('../data/SG-10-test.txt') as testing_file:
    testingSet = np.loadtxt(testing_file, dtype=str)

Length of sequences to model

In [4]:
# Nominal (unpadded / padded) sequence lengths.
# NOTE(review): within the visible notebook these two values are only used to
# seed input_length/output_length below — confirm they are still needed.
length = 10
padded_length = 20

# Note we are making these the same, but they don't -have- to be!
# Both names are reassigned to 5+1 in the "Model parameters" cell before any
# model is built, so the values set here are not the ones the models see.
input_length = padded_length
output_length = padded_length

# Vocabulary sizes...
# decoder_vocab_size sets the width of the decoder's output Dense layer.
# encoder_vocab_size is not used in the visible cells (the embedding layers
# are sized with len(mapping) instead).
encoder_vocab_size = 30 # blank, a, b, c, ... j (bc/SG-10)
decoder_vocab_size = 30 # blank, a, b, c, ... j, start, stop

Encode the words/letters using integers

In [5]:
# Import roles
#training_roles = np.loadtxt(trainingSet, dtype=object) # sys.argv[2]
#testing_roles  = np.loadtxt(testingSet, dtype=object)  # sys.argv[3]

Define a mapping function from characters to integers

In [6]:
def letter_to_int(char_array):
    """Encode words as integer sequences and build the decoder input/target arrays.

    Args:
        char_array: Iterable of equal-length words made of the lowercase
            letters 'a'..'z'; each word is iterated symbol by symbol.

    Returns:
        A tuple ``(X, Y, preY, postY, mapping)`` where
        * ``X`` is the integer encoding of the words, one row per word,
        * ``Y`` is ``X`` with the 'start' code prepended and the 'stop' code
          appended to every row,
        * ``preY`` is ``Y`` without its last column (decoder input),
        * ``postY`` is ``Y`` without its first column (decoder target),
        * ``mapping`` is the symbol -> integer dictionary that was used.

    Raises:
        KeyError: If a word contains a symbol that is not in the mapping.
    """
    # --- Create a dictionary for all the letters & start/stops ---
    # Codes run from 1 to 30; 0 is never assigned to a symbol.
    codes = np.arange(1, 31)
    mapping = {chr(ord('a') + k): codes[k] for k in range(26)}
    mapping['start'] = codes[26]
    mapping['stop'] = codes[27]
    mapping['STARTSENTENCE'] = codes[28]  # key was previously misspelled 'STARTSETNENCE'
    mapping['STOPSENTENCE'] = codes[29]

    # --- Map the characters in the input array to integers ---
    X = np.array([[mapping[sym] for sym in word] for word in char_array])

    # --- Create Y, preY, postY ---
    # Take the delimiters from the mapping rather than repeating 27/28 so the
    # two can never drift apart.
    start = np.array([mapping['start']])
    stop = np.array([mapping['stop']])
    Y = np.array([np.concatenate((start, word, stop), axis=0) for word in X])
    preY = Y[:, :-1]
    postY = Y[:, 1:]

    return X, Y, preY, postY, mapping

Define reverse map function

In [7]:
def int_to_letter(encoding, mapping):
    """Decode an integer array back into symbols using the inverse of `mapping`.

    Args:
        encoding: NumPy array of integer codes, any shape.
        mapping: Symbol -> code dictionary (as returned by `letter_to_int`).

    Returns:
        NumPy array of symbols with the same shape as `encoding`.

    Raises:
        ValueError: If `encoding` contains a code that no symbol maps to.
    """
    # Invert the mapping once instead of scanning the value list for every
    # element. setdefault keeps the first symbol for a duplicated code, which
    # is what list.index did.
    symbol_for = {}
    for symbol, code in mapping.items():
        symbol_for.setdefault(code, symbol)

    symbols = []
    for code in encoding.flatten():  # Flatten array to just one dimension
        try:
            symbols.append(symbol_for[code])
        except KeyError:
            raise ValueError(f"{code!r} is not a code in the mapping") from None

    return np.reshape(np.array(symbols), encoding.shape)

In [None]:
X, Y, preY, postY, mapping = letter_to_int(corpus)

print(len(mapping))

print(int_to_letter(Y[0:5], mapping))

Select 10 random words from corpus and encode them

In [None]:
# Assign each of the ten role letters 'a'..'j' one randomly chosen corpus word,
# stored as its integer encoding.
# random.choice is not seeded here, so the assignment differs from run to run.
roles_to_corpus = {}
for role in string.ascii_lowercase[:10]:
    roles_to_corpus[role] = np.array([mapping[letter] for letter in random.choice(corpus)])

Create training and testing set

In [None]:
print("X.shape=", X.shape)
print("Y.shape=", Y.shape)
print("preY.shape=", preY.shape)
print("postY.shape=", postY.shape)

# Make transformer blocks

In [None]:
class TransformerBlock(keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Size of the last axis of the input; also passed as the
            attention layer's `key_dim` and used as the output width of the
            feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras layer arguments (`name`, `dtype`, ...). The
            config returned by `get_config()` contains these keys, so the
            constructor has to accept them for `from_config` to work.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                   key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="gelu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

    def call(self, inputs, training=None):
        """Apply self-attention and the feed-forward network to `inputs`."""
        # Self-attention sub-layer (query and value are both `inputs`).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Position-wise feed-forward sub-layer.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'num_heads': self.num_heads, 'embed_dim': self.embed_dim,
            'ff_dim': self.ff_dim, 'rate': self.rate
        })
        return config

In [None]:
class MaskedTransformerBlock(keras.layers.Layer):
    """Transformer decoder block.

    Three sub-layers, each followed by dropout, a residual connection and its
    own layer normalization:
      1. causally masked self-attention over the decoder sequence,
      2. attention from the decoder sequence over the encoder output,
      3. a position-wise feed-forward network.

    The layer is called with a two-element list
    ``[decoder_sequence, encoder_output]``.

    Args:
        embed_dim: Size of the last axis of the decoder sequence; also passed
            as `key_dim` to both attention layers.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras layer arguments (`name`, `dtype`, ...). The
            config returned by `get_config()` contains these keys, so the
            constructor has to accept them for `from_config` to work.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(MaskedTransformerBlock, self).__init__(**kwargs)
        self.att1 = keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                    key_dim=embed_dim)
        self.att2 = keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                    key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="gelu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)

        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.
        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.

        Returns a tensor of shape (batch_size, n_dest, n_src) cast to `dtype`.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        # Repeat the single mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, inputs, training=None):
        """Run the block on ``inputs = [decoder_sequence, encoder_output]``."""
        decoder_seq, encoder_out = inputs[0], inputs[1]
        input_shape = tf.shape(decoder_seq)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        mask = self.causal_attention_mask(batch_size,
                                          seq_len, seq_len,
                                          tf.bool)
        # 1. Causally masked self-attention.
        attn_output1 = self.att1(decoder_seq, decoder_seq,
                                 attention_mask=mask)
        attn_output1 = self.dropout1(attn_output1, training=training)
        out1 = self.layernorm1(decoder_seq + attn_output1)
        # 2. Attention over the encoder output. This residual gets its own
        #    normalization; layernorm1 was previously reused here, which tied
        #    the parameters of two sub-layers and left layernorm3 unused.
        attn_output2 = self.att2(out1, encoder_out)
        attn_output2 = self.dropout2(attn_output2, training=training)
        out2 = self.layernorm2(out1 + attn_output2)
        # 3. Feed-forward network.
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'num_heads': self.num_heads, 'embed_dim': self.embed_dim,
            'ff_dim': self.ff_dim, 'rate': self.rate
        })
        return config

In [None]:
class PositionEmbedding(keras.layers.Layer):
    """Adds a learned position embedding to an already-embedded sequence.

    Args:
        maxlen: Number of distinct positions the embedding table holds.
        embed_dim: Size of each position vector (must match the last axis of
            the input for the addition to work).
        **kwargs: Standard Keras layer arguments (`name`, `dtype`, ...). The
            config returned by `get_config()` contains these keys, so the
            constructor has to accept them for `from_config` to work.
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super(PositionEmbedding, self).__init__(**kwargs)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen,
                                              output_dim=embed_dim)
        self.maxlen = maxlen
        self.embed_dim = embed_dim

    def call(self, x):
        """Return `x` plus the embeddings of positions 0 .. tf.shape(x)[1]-1."""
        # Use the runtime length of axis 1 so sequences shorter than `maxlen`
        # work too. (Debug print calls that used to live here were removed.)
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        return x + positions

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'maxlen': self.maxlen,
            'embed_dim': self.embed_dim
        })
        return config

In [None]:
# Per-token cross-entropy, left unreduced so padding can be masked out below.
# from_logits is False because the decoder's output Dense layer applies
# activation='softmax': the predictions are already probabilities, and
# from_logits=True would apply a second softmax on top of them.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

def MaskedSparseCategoricalCrossentropy(real, pred):
    """Mean cross-entropy over the positions whose target is not 0.

    Args:
        real: Integer target ids; 0 marks positions to ignore.
        pred: Predicted class probabilities, one distribution per position.

    Returns:
        Scalar loss averaged over the non-zero targets (0 if there are none).
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    # divide_no_nan returns 0 instead of NaN when every target is 0.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

def MaskedSparseCategoricalAccuracy(real, pred):
    """Fraction of positions with a non-zero target that are predicted correctly.

    Args:
        real: Integer target ids; 0 marks positions to ignore.
        pred: Per-class scores with the class axis at index 2.

    Returns:
        Scalar accuracy over the non-zero targets (0 if there are none).
    """
    accuracies = tf.equal(tf.cast(real,tf.int64), tf.argmax(pred, axis=2))
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    # divide_no_nan returns 0 instead of NaN when every target is 0.
    return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))

In [None]:
class MaskedTokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding, with id 0 as padding.

    Positions are numbered from 1; wherever the token id is 0 the position id
    is forced to 0 as well, so both embedding tables (built with
    ``mask_zero=True``) see index 0 at the same places.

    Args:
        maxlen: Longest sequence length supported; the position table holds
            ``maxlen + 1`` rows (index 0 plus positions 1..maxlen).
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard Keras layer arguments (`name`, `dtype`, ...). The
            config returned by `get_config()` contains these keys, so the
            constructor has to accept them for `from_config` to work.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(MaskedTokenAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size,
                                                output_dim=embed_dim,
                                                mask_zero=True)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen+1,
                                              output_dim=embed_dim,
                                              mask_zero=True)
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, x):
        """Embed token ids `x` and add the matching position embeddings."""
        seq_len = tf.shape(x)[-1]
        # Position ids 1..seq_len, broadcast against x.
        positions = tf.range(start=1, limit=seq_len+1, delta=1)
        # sign(x) is 0 where the token id is 0, which zeroes the position id there.
        positions = positions * tf.cast(tf.sign(x), tf.int32)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim
        })
        return config

# Model parameters

In [None]:
# Size of the gestalt, context representations...
# These override the input_length/output_length set from padded_length earlier
# in the notebook. They are passed as `maxlen` to the embedding layers, and
# output_length also bounds the greedy-decoding loops further down.
# NOTE(review): presumably 5 letters per word plus one delimiter — confirm.
input_length  = 5+1 # Length of each word?
output_length = 5+1 # Length of word plus start/stop?

# NOTE(review): HIDDEN_SIZE is not referenced in the visible cells — confirm it is still needed.
HIDDEN_SIZE = 1024 # 1024 in Blake's
BATCH_SIZE  = 100  # batch size passed to model.fit
EPOCHS      = 5    # number of epochs passed to model.fit

# Create encoder/decoder 
Encoder

In [None]:
embed_dim = 300  # Embedding size for each token (orig. 32)
num_heads = 4  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer (orig. 32)
stack = 1  # Number of transformer blocks stacked in the encoder and in the decoder
wd = 0.01  # Weight decay passed to the AdamW optimizer

# Make the layers
# Variable-length sequence of token ids.
encoder_input = keras.layers.Input(shape=(None,)) # Orig. (None,)

# Earlier variant with separate token and position embeddings, kept for reference:
# encoder_embedding = keras.layers.Embedding(
#                     input_dim=len(mapping), output_dim=embed_dim)(encoder_input)

# encoder_pos_embedding = PositionEmbedding(
#                         maxlen=input_length,
#                         embed_dim=embed_dim)(encoder_embedding)

# NOTE(review): letter_to_int assigns codes 1..30, so len(mapping) == 30 leaves
# no row for code 30 once index 0 is counted. The visible data only uses codes
# up to 28, but vocab_size should probably be len(mapping) + 1 — confirm.
encoder_embedding = MaskedTokenAndPositionEmbedding(maxlen = input_length, vocab_size = len(mapping), embed_dim = embed_dim)(encoder_input)

# Throwaway model that stops at the embedding; `model` is reassigned to the
# full encoder-decoder model in a later cell.
model = keras.Model(encoder_input, encoder_embedding)

In [None]:
x = encoder_embedding
for _ in range(stack):
    x = TransformerBlock(embed_dim=embed_dim,
                         num_heads=num_heads,
                         ff_dim=ff_dim)(x)
encoder_state = x

Decoder Construction

In [None]:
# Set up the decoder blocks, using `encoder_state` as initial state.
# Variable-length sequence of decoder token ids.
decoder_input = keras.layers.Input(shape=(None,)) # Orig. (None,)

# Earlier variant with separate token and position embeddings, kept for reference:
# decoder_embedding = keras.layers.Embedding(
#                     input_dim=len(mapping), output_dim=embed_dim)(decoder_input)

# decoder_pos_embedding = PositionEmbedding(
#                         maxlen=output_length,
#                         embed_dim=embed_dim)(decoder_embedding)
    
    
decoder_embedding = MaskedTokenAndPositionEmbedding(maxlen = output_length, vocab_size = len(mapping), embed_dim = embed_dim)(decoder_input)

# Keep the block objects in `decoder_blocks` so the same (trained) layers can be
# re-applied later when the standalone decoder model is built.
x = decoder_embedding
decoder_blocks = []
for _ in range(stack):
    decoder_blocks += [MaskedTransformerBlock(embed_dim=embed_dim,
                                                  num_heads=num_heads,
                                                  ff_dim=ff_dim)]
    x = decoder_blocks[-1]([x,encoder_state])

# The output layer applies softmax, so the model emits per-token class
# probabilities (not logits); the loss must be configured accordingly.
decoder_dense = keras.layers.Dense(decoder_vocab_size,
                                       activation='softmax') # Orig. decoder vocab size
decoder_output = decoder_dense(x)

Assemble model and compile it

In [None]:
model = keras.Model([encoder_input, decoder_input],
                         decoder_output)

In [None]:
# Compile with the masked loss/accuracy defined above (targets equal to 0 are
# ignored) and AdamW with weight decay `wd`.
model.compile(loss=MaskedSparseCategoricalCrossentropy,
                  optimizer=tfa.optimizers.AdamW(weight_decay=wd,learning_rate=.001), #.0001
                  metrics=[MaskedSparseCategoricalAccuracy])
model.summary()
    
# Visualization - wish we could see the recurrent weights!
# Writes the architecture diagram to EncTransformDec.png in the working directory.
keras.utils.plot_model(model,to_file='EncTransformDec.png',
                       show_shapes=True,expand_nested=True)

In [None]:
history = model.fit([X,preY], postY,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              verbose=0)

In [None]:
print('Accuracy:',model.evaluate([X,preY],postY)[1]*100.0,'%')

In [None]:
plt.figure(1)  
# summarize history for accuracy 
plt.subplot(211)  
plt.plot(history.history['MaskedSparseCategoricalAccuracy'])  
plt.title('model accuracy')  
plt.ylabel('accuracy')  
plt.xlabel('epoch')  
# summarize history for loss  
plt.subplot(212)  
plt.plot(history.history['loss'])  
plt.title('model loss')  
plt.ylabel('loss')  
plt.xlabel('epoch')  
plt.tight_layout()
plt.show()  

In [None]:
# Make just a model out of the encoder
# input = encoder_input (Input layer)
# output = encoder_state (output of the encoder transformer stack)
encoder_model = keras.Model(encoder_input, encoder_state)

# Part 2 - make the decoder

# Make just a model out of the decoder
# The already-trained decoder blocks and output Dense layer are re-applied to a
# fresh Input that stands in for the encoder output, so the decoder can be run
# on a precomputed context.
# Note: this reassigns the notebook-level names `x` and `decoder_output`.
x = decoder_embedding
encoder_state_input = keras.layers.Input(shape=(None,embed_dim))
for i in range(stack):
    x = decoder_blocks[i]([x,encoder_state_input])
decoder_output = decoder_dense(x)
decoder_model = keras.Model([decoder_input, encoder_state_input],
                             decoder_output)

# try to pass things through the outer model 

In [None]:
# Separated Decoder Model
keras.utils.plot_model(decoder_model,
                       show_shapes=True,expand_nested=True)

In [None]:
#with teacher forcing
i = 0 
context = encoder_model.predict(X[i:i+1])
context.shape

In [None]:
result = np.argmax(decoder_model.predict([preY[i:i+1,:],context]),-1)

In [None]:
result

In [None]:
#decodeseq

In [None]:
#without teacher forcing 

In [None]:
result = preY[0:1,0:1]
result

In [None]:
# Greedy decoding without teacher forcing for sample i.
# `result` is a fixed-size input buffer: zeros everywhere except the start
# token in column 0.
context = encoder_model.predict(X[i:i+1])
result = np.zeros_like(preY[i:i+1])
result[0:1,0:1] = preY[i:i+1,0:1] # Start only...
# Each pass re-runs the decoder on the whole buffer and copies the prediction
# for position j into input slot j+1.
# NOTE(review): if preY has output_length columns, slot j+1 on the last pass is
# past the end of the buffer and the assignment targets an empty slice — confirm.
for j in range(output_length): #output length is 5+1 
    tokens = np.argmax(decoder_model.predict([result,context]),-1)
    result[0:1,j+1:j+2] = tokens[0:1, j:j+1]
# Replace the input buffer with the predictions of the final pass.
result = tokens # Remove start token
result

In [None]:
context.shape 

In [None]:
preY[i:i+1,0:1]

In [None]:
# Same greedy decoding as the single-sample cell, run on a batch of three
# samples (rows i .. i+2) at once.
i=0
context = encoder_model.predict(X[i:i+3])
result = np.zeros_like(preY[i:i+3])
result[:,0:1] = preY[i:i+3,0:1] # Start only...
# Each pass re-runs the decoder on the whole buffer and copies the prediction
# for position j into input slot j+1 of every row.
for j in range(output_length): #output length is 5+1 
    tokens = np.argmax(decoder_model.predict([result,context]),-1)
    result[:,j+1:j+2] = tokens[:, j:j+1]
# Replace the input buffer with the predictions of the final pass.
result = tokens # Remove start token
result

In [None]:
postY[i:i+3]

In [None]:
# Greedy decoding that grows the decoder input by one token per step instead
# of writing into a fixed-size buffer.
# NOTE(review): this cell was marked "doesn't work". Two defects are fixed
# here: the loop reused `i`, clobbering the sample index that later cells read
# (postY[i], X[i:i+1]), and the start token was taken from row 0 instead of
# row i. Re-check the output against postY[i] after re-running.
context = encoder_model.predict(X[i:i+1])
result = preY[i:i+1,0:1] # Start only...
for _ in range(output_length):
    tokens = np.argmax(decoder_model.predict([result,context]),-1)
    # Keep only the prediction for the newest position and append it.
    result = np.hstack([result,tokens[:,-1:]])
result = np.int32(result[:,1:]) # Remove start token
print(result[0])
#decode_seq(result[0],itos)

In [None]:
result

In [None]:
postY[i]

In [None]:
model.predict([X[i:i+1], preY[i:i+1]]).argmax(axis=-1)

# Export Model (commented out so it doesn't make a new one)

In [None]:
# if (True):
#     encoder_model_json = encoder_model.to_json()
#     decoder_model_json = decoder_model.to_json()
#     with open("models/encoder_len53.json", "w") as encoder_file, open("models/decoder_len53.json", "w") as decoder_file:
#         encoder_file.write(encoder_model_json)
#         decoder_file.write(decoder_model_json)
#     encoder_model.save_weights("models/encoder_len53.h5")
#     decoder_model.save_weights("models/decoder_len53.h5")

# Try to link it with the nested enc dec

In [None]:
corpus

In [None]:
outer_encoder = encoder_model
outer_decoder = decoder_model

In [None]:
keras.utils.plot_model(outer_encoder, show_shapes=True)
keras.utils.plot_model(outer_decoder, show_shapes=True)

In [None]:
keras.utils.plot_model(model,show_shapes=True)

In [None]:
temp,_,_,_,_ = letter_to_int(trainingSet)

In [None]:
temp[0]

In [None]:
#[x for x in temp]

In [None]:
len(corpus)

In [None]:
# Sample 10 distinct corpus indices (without replacement).
# Drawing 20 indices with randint and then sub-sampling 10 of them could still
# yield duplicates, because the 20 candidates themselves may repeat.
corIdx = np.random.choice(len(corpus), size=10, replace=False)

outer_x = X[corIdx]
outer_preY = preY[corIdx]
outer_postY = postY[corIdx]

# One embedding per sampled word, taken from the outer encoder.
outer_embeddings = outer_encoder.predict(outer_x)

trainingSet_int = letter_to_int(trainingSet)[0]

# Replace every role code r (1-based) by the embedding of sampled word r-1.
# Indexing with the whole integer array does this for all sequences at once.
inner_x = outer_embeddings[trainingSet_int - 1]

# make inner_preY and inner_postY
# A single all-zero timestep, shaped like one word embedding, is prepended to
# form the decoder input and appended to form the decoder target.
dog = np.zeros((1,) + outer_embeddings.shape[1:])
inner_preY = np.array([np.concatenate((dog,inner_x[i])) for i in range(len(inner_x))])
inner_postY = np.array([np.concatenate((inner_x[i],dog)) for i in range(len(inner_x))])

In [None]:
corpus[corIdx]

In [None]:
trainingSet[0]

In [None]:
trainingSet_int[0]

In [None]:
outer_embeddings[0]

In [None]:
inner_x[0]

In [None]:
outer_x[trainingSet_int[0]-1]

In [None]:
corpus[9676]

In [None]:
corIdx

In [None]:
outer_x.shape

In [None]:
inner_x.shape

In [None]:
corIdx

In [None]:
#inner encoder construction
hidden_size = 300
inner_encoder_input = keras.layers.Input(shape=(None,) + inner_x.shape[2:], name="inner_encoder_input")
# Flatten each timestep's word embedding into one vector for the LSTM.
# The width is derived from inner_x instead of being hard-coded (it was 1500).
flat_embedding_size = int(np.prod(inner_x.shape[2:]))
inner_encoder_reshape = keras.layers.Reshape((-1, flat_embedding_size))(inner_encoder_input)
# return_state=True so the final h and c states can seed the inner decoder.
inner_encoder_hidden = keras.layers.LSTM(hidden_size, return_state=True, name="inner_encoder") #pass layers in here

In [None]:
encoder_output, enc_state_h, enc_state_c = inner_encoder_hidden(inner_encoder_reshape)
encoder_states = [enc_state_h, enc_state_c]

In [None]:
inner_preY.shape

In [None]:
#inner decoder construction
# Two decoder inputs per timestep: the previous word embedding and a 2-element
# start/stop flag vector. They are concatenated before entering the LSTM.
inner_decoder_input_1 = keras.layers.Input(shape=(None,) + inner_preY.shape[2:], name="inner_dec_token_1")
# NOTE(review): 1500 is hard-coded; it has to equal the product of
# inner_preY.shape[2:] (5 * 300 with the current settings) — confirm if shapes change.
inner_decoder_reshape = keras.layers.Reshape((-1,1500))(inner_decoder_input_1)
inner_decoder_input_2 = keras.layers.Input(shape=(None, 2), name="dec_start/stop")
inner_decoder_concat = keras.layers.Concatenate()([inner_decoder_reshape,inner_decoder_input_2])

# return_sequences=True gives one output per timestep; return_state=True also
# returns the final h and c states.
inner_decoder_hidden = keras.layers.LSTM(hidden_size,return_sequences=True,return_state=True,name="inner_decoder") #need initial state for h and c

In [None]:
#tie it together 
decoder_hidden_output, decoder_state_h, decoder_state_c = inner_decoder_hidden(inner_decoder_concat,
                                                                         initial_state=encoder_states)

In [None]:
inner_postY.shape

In [None]:
# Two output heads on the inner decoder's hidden sequence:
#   "token_1"    - linear regression onto the flattened word embedding, reshaped
#                  back to the per-timestep shape of inner_postY;
#   "start/stop" - two sigmoid units for the start/stop flags.
# Note: decoder_dense_t1 / decoder_dense_t2 hold the layers' OUTPUT TENSORS
# here, not the layer objects, because each Dense is called immediately.
decoder_dense_t1 = keras.layers.Dense(inner_encoder_reshape.shape[-1], activation='linear', name="token_1")(decoder_hidden_output)
inner_output_reshape = keras.layers.Reshape((-1,inner_postY.shape[2],inner_postY.shape[3]))(decoder_dense_t1)
decoder_dense_t2 = keras.layers.Dense(2, activation='sigmoid', name="start/stop")(decoder_hidden_output)

In [None]:
inner_decoder_output = [inner_output_reshape, decoder_dense_t2]

In [None]:
inner_model = keras.Model([inner_encoder_input,inner_decoder_input_1, inner_decoder_input_2],
                         inner_decoder_output)

In [None]:
# Build the start/stop flag sequences that accompany the inner decoder's input
# and target: one 2-element flag vector per decoder timestep.
s_s = {"start":[1,0],"stop":[0,1], "none":[0,0]}
# Take the number of timesteps from inner_preY instead of hard-coding 4, so the
# flags stay aligned if the role-sequence length changes.
n_steps = inner_preY.shape[1]
pre_start = np.zeros((inner_x.shape[0], n_steps, 2))
post_stop = np.zeros((inner_x.shape[0], n_steps, 2))
pre_start[:,0,:] = s_s["start"]   # first decoder input step carries the start flag
post_stop[:,-1,:] = s_s["stop"]   # last decoder target step carries the stop flag

In [None]:
post_stop.shape

In [None]:
# One loss per output, in output order: MSE for the regressed word embedding
# and binary cross-entropy for the start/stop flags.
# NOTE(review): 'accuracy' is also reported for the MSE (regression) output,
# where it is unlikely to be a meaningful number — confirm it is wanted.
inner_model.compile(loss = [keras.losses.MSE,keras.losses.binary_crossentropy],
               optimizer=keras.optimizers.Nadam(),
               metrics=['accuracy'])

In [None]:
#inner_model_input = {"inner_encoder_input":inner_x,"inner_enc_token_1":inner_preY,3                     "dec_start/stop":pre_start}

In [None]:
# Inputs keyed by the names given to the inner model's Input layers
# ("inner_encoder_input", "inner_dec_token_1", "dec_start/stop"). The previous
# keys "input_1"/"input_2" did not match any named input of the model.
inner_model_input = {"inner_encoder_input":inner_x,"inner_dec_token_1":inner_preY,
                     "dec_start/stop":pre_start}

In [None]:
inner_model_target = {"inner_dec_token_1":inner_postY,"start/stop": post_stop}

In [None]:
inner_x.shape

In [None]:
keras.utils.plot_model(inner_model,show_shapes=True)

In [None]:
#try to train it 
inner_batch_size = 25
inner_epochs = 40
inner_history = inner_model.fit([inner_x,inner_preY, pre_start], [inner_postY,post_stop],
                         batch_size=inner_batch_size,
                         epochs=inner_epochs,
                         verbose=0)

In [None]:
inner_history.history.keys()

In [None]:
plt.figure(1)  
# summarize history for accuracy 
plt.subplot(211)  
plt.plot(inner_history.history['reshape_2_accuracy'])  
plt.title('model accuracy')  
plt.ylabel('reshape_2_accuracy')  
plt.xlabel('epoch')  
# summarize history for loss  
plt.subplot(212)  
plt.plot(inner_history.history['loss'])  
plt.title('model loss')  
plt.ylabel('loss')  
plt.xlabel('epoch')  
plt.tight_layout()
plt.show()  

In [None]:
accuracy = inner_model.evaluate([inner_x,inner_preY, pre_start], [inner_postY,post_stop])

In [None]:
#inner_model.predict([inner_x[0:1],inner_preY[0:1],pre_start[0:1]])[0]

In [None]:
#inner_postY[0:1] 

In [None]:
#np.sum(keras.losses.MSE(y_pred=inner_model.predict([inner_x[0:1],inner_preY[0:1],pre_start[0:1]])[0], y_true= inner_postY[0:1] ))

# Remove teacher forcing

In [None]:
#encoder 
inner_encoder_model = keras.Model(inner_encoder_input, encoder_states)

In [None]:
keras.utils.plot_model(inner_encoder_model, show_shapes=True)

In [None]:
#Decoder 
inner_decoder_state_input_h = keras.layers.Input(shape=(hidden_size,),
                                                name='inner_states_input_h')

inner_decoder_state_input_c = keras.layers.Input(shape=(hidden_size,),
                                                name='inner_states_input_c')

In [None]:
# Connect hidden to input(s)
inner_decoder_states_input = [inner_decoder_state_input_h,
                             inner_decoder_state_input_c]

In [None]:
inner_decoder_hidden_output, inner_decoder_state_h, inner_decoder_state_c = inner_decoder_hidden(inner_decoder_concat,
              initial_state=inner_decoder_states_input)


In [None]:
inner_decoder_hidden_output

In [None]:
inner_decoder_states = [inner_decoder_state_h, inner_decoder_state_c]

In [None]:
inner_decoder_states

In [None]:
# Reuse the TRAINED output layers of inner_model for the standalone decoder.
# Creating new Dense layers here (as was done before) gives randomly
# initialized heads, so the inference decoder's outputs would not reflect
# training. The layers are looked up by the names they were created with.
decoder_dense_t1 = inner_model.get_layer("token_1")
#inner_output_reshape = keras.layers.Reshape((-1,inner_postY.shape[2],inner_postY.shape[3]))(decoder_dense_t1)
decoder_dense_t2 = inner_model.get_layer("start/stop")

In [None]:
inner_output_reshape_2 = keras.layers.Reshape((-1,inner_postY.shape[2],inner_postY.shape[3]))(decoder_dense_t1(inner_decoder_hidden_output))

In [None]:
# Connect output to hidden(s)
inner_decoder_output = [inner_output_reshape_2, decoder_dense_t2(inner_decoder_hidden_output)] 

In [None]:
inner_decoder_output + inner_decoder_states

In [None]:
#need to fix
inner_decoder_model = keras.Model([inner_decoder_input_1, inner_decoder_input_2] + inner_decoder_states_input,
                                 inner_decoder_output + inner_decoder_states)

In [None]:
keras.utils.plot_model(inner_decoder_model,to_file="decouple_inner_model.png",show_shapes=True)

In [None]:
embedding = inner_encoder_model.predict(inner_x[0:1])
embedding[1].shape

In [None]:
inner_preY[0:1].shape

In [None]:
teachOut = inner_decoder_model.predict([inner_preY[0:1], pre_start[0:1]] + embedding)

In [None]:
teachOut[1]

In [None]:
post_stop[0:1]

In [None]:
teachOut[0].shape

In [None]:
inner_postY[0:1].shape

In [None]:
#without teacher forcing 
input_tokens = np.zeros_like(inner_preY[0:1])
input_tokens.shape

In [None]:
output_tokens = inner_decoder_model.predict([input_tokens, pre_start[0:1]] + embedding)

In [None]:
output_tokens[0].shape

In [None]:
output_tokens[0][:,0:1,:,:].shape #replace on input_tokens timestep one 

In [None]:
input_tokens[:,1:2,:,:] = output_tokens[0][:,0:1,:,:]

In [None]:
output_tokens = inner_decoder_model.predict([input_tokens, pre_start[0:1]] + embedding)

In [None]:
#get the context
# [h, c] states of the inner encoder; they seed the decoder LSTM on every pass.
embedding = inner_encoder_model.predict(inner_x[0:1])
#without teacher forcing

# Input buffers: all zeros except the start flag at timestep 0.
input_tokens = np.zeros_like(inner_preY[0:1])

input_start = np.zeros_like(pre_start[0:1])

input_start[0,0:1,:] = pre_start[0,0:1,:]

# Every pass re-runs the decoder over the WHOLE sequence from timestep 0, so
# the initial state must stay the encoder state. (It was previously replaced by
# the decoder's end-of-sequence state after each pass, which then got applied
# at timestep 0 of the next pass.) A dedicated loop variable is used so the
# notebook-level `i` is not clobbered, and the step count comes from the buffer
# instead of a hard-coded 3.
n_steps = input_tokens.shape[1]
for step in range(n_steps):
    output_tokens = inner_decoder_model.predict([input_tokens, input_start] + embedding)

    # Feed the prediction for this step back in as the next step's input; the
    # final pass only refreshes output_tokens with every input slot filled.
    if step + 1 < n_steps:
        input_tokens[:,step+1:step+2,:,:] = output_tokens[0][:,step:step+1,:,:] #replace on input_tokens timestep
        input_start[:,step+1:step+2,:] = output_tokens[1][:,step:step+1,:]
        

In [None]:
#with teacher forcing 
#output_tokens = inner_model.predict([inner_x[0:1], inner_preY[0:1], pre_start[0:1]])

In [None]:
# Make just a model out of the decoder
"""
x = decoder_embedding
encoder_state_input = keras.layers.Input(shape=(None,embed_dim))
for i in range(stack):
    x = decoder_blocks[i]([x,encoder_state_input])
decoder_output = decoder_dense(x)
decoder_model = keras.Model([decoder_input, encoder_state_input],
                             decoder_output)
"""

In [None]:
output_tokens[1].shape

In [None]:
input_tokens.shape

In [None]:
keras.utils.plot_model(decoder_model, show_shapes=True)

In [None]:
type(output_tokens)

In [None]:
result = pre_start[0:1] #start 
result

# Plug inner decoder's output into outer decoder

In [None]:
# Stand-in for the inner decoder's output: the ground-truth targets of the
# first role sequence are used here, so this cell checks the outer decoder
# alone.
output_tokens = [inner_postY[0:1], post_stop[0:1]]

# For each of the first three timesteps, take that step's word embedding as the
# context and greedily decode it back into letters with the outer decoder.
for k in range(0,3):
    context = output_tokens[0][0][k:k+1,:,:]
    # Row 0 of preY is only used as a buffer template and for its start token.
    i=0
    result = np.zeros_like(preY[i:i+1])
    result[0:1,0:1] = preY[i:i+1,0:1] # Start only...

    # Each pass re-runs the decoder on the whole buffer and copies the
    # prediction for position j into input slot j+1.
    for j in range(output_length): #output length is 5+1 
        tokens = np.argmax(outer_decoder.predict([result,context]),-1)
        result[0:1,j+1:j+2] = tokens[0:1, j:j+1]
    # Replace the input buffer with the predictions of the final pass.
    result = tokens # Remove start token
    print(int_to_letter(result, mapping))

In [None]:
int_to_letter(outer_x[trainingSet_int[0]-1], mapping)

In [None]:
trainingSet[0]

In [None]:
corIdx[8]

In [None]:
corpus[corIdx]

In [None]:
outer_postY[trainingSet_int[0]-1]

# Check the output against postY

In [None]:
inner_x[0:1].shape

In [None]:
outer_postY

Do this three times in a loop, where the output is fed back in as the non-teacher-forced input. The generated output tokens then need to be passed into the
outer decoder: take each (5, 300) output and feed it in one at a time, non-teacher-forced (provided as the embedding for the outer decoder). We need to unfold for the 6 timesteps and produce 6 integers at the end. Use MSE and binary cross-entropy for the stitched-together inner model.

# Testing 