# 1. Load Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from math import cos, sin, pi
from tqdm import tqdm

#algorithms
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras import Model
from keras.layers import *
from keras.layers import Activation,Conv2D,MaxPooling2D,UpSampling2D,Dense,BatchNormalization,Input,Reshape,multiply,add,Dropout,AveragePooling2D,GlobalAveragePooling2D,concatenate
from keras.layers.convolutional import Conv2DTranspose
from keras.models import Model  
import keras.backend as K
from keras.regularizers import l2
from keras.engine import Layer,InputSpec
from keras.utils import conv_utils

# 2. Defining trainer

In [2]:
def Encoder_decoder_trainer(max_len, K, latent_dim = 1000, dim_embed = 100):
    #training encoder 
    encoder_input = Input(shape = (max_len,), name = 'encoder_input')
    enc_emb = Embedding(input_dim = K, output_dim = dim_embed, name = 'encoder_embedding')(encoder_input)
    m_x = GlobalAveragePooling1D(name = 'm_x')(enc_emb)
    h_enc = GRU(latent_dim, activation = 'tanh', name = 'encoder_GRU')(enc_emb) # returns only last step output (dimension of n_hidden_units

    #training decoder
    decoder_input = Input(shape = (max_len,), name = 'decoder_input')
    dec_emb = Embedding(input_dim = K, output_dim = dim_embed, name = 'decoder_embedding')(decoder_input)
    s_dec = GRU(latent_dim, activation = 'tanh', return_sequences=True, name = 'decoder_GRU')(dec_emb, initial_state = h_enc) # returns outputs of each step (dim of max_len * n_hidden_units)
    O_h = TimeDistributed(Dense(2*latent_dim, activation = 'linear', name = 'O_h'))(s_dec) 
    O_y = TimeDistributed(Dense(2*latent_dim, activation = 'linear', name = 'O_y'))(dec_emb)
    O_c = Dense(2*latent_dim, activation = 'linear', name = 'O_c')(h_enc)
    O_m = Dense(2*latent_dim, activation = 'linear', name = 'O_m')(m_x)
    s = Add()([O_h, O_y, O_c, O_m]) # add them all together
    s = Reshape([max_len, 2*latent_dim, 1])(s)
    s = MaxPool2D(pool_size=(1,2), strides=(1,2))(s) # max-out activation
    s = Reshape([max_len, latent_dim])(s) # dimension of max_len * n_hidden_units
    output = TimeDistributed(Dense(K, activation = 'softmax'))(s) #dimension of max_len * K(vocab_size) -> probability distribution of each words
    
    return Model(inputs = [encoder_input, decoder_input], outputs = output)

In [17]:
trainer = Encoder_decoder_trainer(10, 100000)
trainer.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, 10)]         0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 10, 100)      10000000    encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 10, 100)      10000000    decoder_input[0][0]              
____________________________________________________________________________________________

# 3. Model Training

## 3.1. Loading and Preprocessing data

In [None]:
# Traing parameters of each layers

## 3.2. Training

# 4. Define Encoder and Decoder for inference

In [18]:
#encoder for inference
def encoder(trainer, max_len):
    encoder_embedding_layer = trainer.layers[2]
    encoder_GRU = trainer.layers[4]
    
    input = Input(shape = (max_len,))
    enc_emb = encoder_embedding_layer(input)
    h_enc = encoder_GRU(enc_emb)
    m_x = GlobalAveragePooling1D()(enc_emb)
    
    return Model(inputs = input, outputs = [h_enc, m_x])

In [19]:
infer_encoder = encoder(trainer, max_len = 10)
infer_encoder.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 10)]         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 10, 100)      10000000    input_11[0][0]                   
__________________________________________________________________________________________________
encoder_GRU (GRU)               (None, 1000)         3306000     encoder_embedding[1][0]          
__________________________________________________________________________________________________
global_average_pooling1d_2 (Glo (None, 100)          0           encoder_embedding[1][0]          
Total params: 13,306,000
Trainable params: 13,306,000
Non-trainable params: 0
______________

In [67]:
#decoder for inference

def decoder(trainer, latent_dim, dim_embed):
    decoder_embedding_layer = trainer.layers[3]
    decoder_GRU = trainer.layers[5]
    O_h_dense = trainer.layers[7]
    O_y_dense = trainer.layers[8]
    O_c_dense = trainer.layers[9]
    O_m_dense = trainer.layers[10]
    output_dense = trainer.layers[15]

    h_enc = Input(shape = (latent_dim))
    m_x = Input(shape = (dim_embed))
    decoder_input = Input(shape = (1,))
    decoder_initial_state = Input(shape=(latent_dim,)) #will be 
    dec_emb = decoder_embedding_layer(decoder_input)
    hidden_state = decoder_GRU(dec_emb, initial_state = decoder_initial_state)
    O_h = O_h_dense(hidden_state)
    O_y = O_y_dense(dec_emb)
    O_c = O_c_dense(h_enc)
    O_m = O_m_dense(m_x)
    s = Add()([O_h, O_y, O_c, O_m]) # add them all together
    s = Reshape([1, 2*latent_dim, 1])(s)
    s = MaxPool2D(pool_size=(1,2), strides=(1,2))(s) # max-out activation
    s = Reshape([1, latent_dim])(s) # dimension of max_len * n_hidden_units
    output = output_dense(s)
    hidden_state = Reshape([1000])(hidden_state)

    return Model(inputs = [h_enc, m_x, decoder_input, decoder_initial_state], outputs = [output, hidden_state])

In [68]:
infer_decoder = decoder(trainer, latent_dim = 1000, dim_embed = 100)
infer_decoder.summary()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_38 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   multiple             10000000    input_38[0][0]                   
__________________________________________________________________________________________________
input_39 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
decoder_GRU (GRU)               multiple             3306000     decoder_embedding[7][0]          
                                                                 input_39[0][0]            

# 5. Sentence translator given source sentence tokens

In [22]:
def generate_sentence(source_sentence, encoder = infer_encoder, decoder = infer_decoder, max_length = 30):
    '''
        source_sentence : array shape of (1, max_len)
        encoder, decoder : predictive model
        max_length : int
    '''
    sample_tokens = [0]
    length = 0
    h_enc, m_x = encoder.predict(source_sentence)
    terminate_condition = False
    while terminate_condition == False:
        if length == 0:
            output, hs = decoder.predict([h_enc, m_x, np.array([sample_tokens[-1]]), h_enc])
        else:
            output, hs = decoder.predict([h_enc, m_x, np.array([sample_tokens[-1]]), hs])
        next_token = np.argmax(output)
        sample_tokens.append(next_token)
        length += 1

        if sample_tokens[-1] == 0 or length > max_length:
            terminate_condition = True

    return sample_tokens

In [66]:
source_sentence = np.array([[10, 1, 20, 40, 6000, 400, 50000, 15000, 0, 0]])
generate_sentence(source_sentence, infer_encoder, infer_decoder, max_length = 10)

[0,
 54506,
 29638,
 86782,
 88338,
 11509,
 15430,
 79068,
 97911,
 47687,
 36436,
 17620]