In [1]:
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Concatenate, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BahdanauAttention import AttentionLayer
import numpy as np
import random
import eng_to_ipa as ipa
import pickle

In [2]:
class s2s_model:
    """Character-level encoder-decoder model: a 3-layer LSTM encoder feeding one LSTM decoder.

    Construction builds the training graph (``self.training_model``). Call
    ``compile()`` and ``fit()`` to train, ``build_inference_model()`` to derive
    and save the step-wise encoder/decoder models, and ``translate()`` to
    decode a single input word.
    """

    def __init__(self, max_encoder_len, max_decoder_len, build_inference_model_encoder_vocab, num_decoder_vocab):
        """Store hyper-parameters and build the training model.

        Args:
            max_encoder_len: fixed length of the (padded) encoder input sequences.
            max_decoder_len: upper bound on the number of tokens emitted by
                ``decode_sequence``.
            build_inference_model_encoder_vocab: size of the encoder vocabulary
                (input dimension of the encoder embedding). The parameter name
                is kept as-is so existing keyword callers keep working.
            num_decoder_vocab: size of the decoder vocabulary (input dimension
                of the decoder embedding and width of the softmax output).
        """
        self.latent_dim = 256
        self.embedding_dim = 200
        self.max_encoder_len = max_encoder_len
        self.max_decoder_len = max_decoder_len
        # Read the encoder vocabulary size from the constructor argument. The
        # previous code read an undeclared name `num_encoder_vocab`, which only
        # resolved when a module-level variable of that name happened to exist.
        self.num_encoder_vocab = build_inference_model_encoder_vocab
        self.num_decoder_vocab = num_decoder_vocab

        self.build_encoder()
        self.build_decoder()

        self.training_model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)

    def build_encoder(self):
        """Create the encoder: embedding followed by three stacked LSTM layers."""
        self.encoder_inputs = Input(shape=(self.max_encoder_len, ))
        self.encoder_embed = Embedding(self.num_encoder_vocab, self.embedding_dim, trainable=True)(self.encoder_inputs)
        self.encoder_LSTM1 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout = 0.4, recurrent_dropout = 0.3)
        self.encoder_output1, self.state_h1, self.state_c1 = self.encoder_LSTM1(self.encoder_embed)

        # Each encoder LSTM consumes the full output sequence of the previous one.
        self.encoder_LSTM2 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.3)
        self.encoder_output2, self.state_h2, self.state_c2 = self.encoder_LSTM2(self.encoder_output1)

        # Outputs and states of the last layer are what the decoder receives.
        self.encoder_LSTM3 = LSTM(self.latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.3)
        self.encoder_output, self.state_h, self.state_c = self.encoder_LSTM3(self.encoder_output2)

    def build_decoder(self):
        """Create the decoder: embedding, one LSTM seeded with the encoder states, softmax output."""
        self.decoder_inputs = Input(shape=(None,))

        # The layer objects are kept on `self` so build_inference_model() can
        # re-apply the same (trained) layers to new inputs.
        self.decoder_embed_layer = Embedding(self.num_decoder_vocab, self.embedding_dim, trainable=True)
        self.decoder_embed = self.decoder_embed_layer(self.decoder_inputs)

        # Decoder LSTM, initialised with the final encoder layer's states.
        # NOTE: the two state attributes below receive the LSTM's h and c states
        # (the same positions unpacked as state_h/state_c in build_encoder),
        # despite their "fwd"/"back" names.
        self.decoder_LSTM = LSTM(self.latent_dim, return_sequences=True, return_state= True, dropout=0.4, recurrent_dropout=0.2)
        self.decoder_outputs, self.decoder_fwd_state, self.decoder_back_state = self.decoder_LSTM(self.decoder_embed, initial_state=[self.state_h, self.state_c])

        # Output layer: the same Dense softmax is applied at every time step
        # through TimeDistributed (index 1 of the input is the temporal axis).
        self.decoder_dense = TimeDistributed(Dense(self.num_decoder_vocab, activation='softmax'))
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)

    def compile(self):
        """Compile the training model (rmsprop, sparse categorical cross-entropy, accuracy metric)."""
        self.training_model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics = ['acc'])

    def fit(self, x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep, batch_size):
        """Train the model with early stopping and best-checkpoint saving.

        Args:
            x_tr, y_tr_in, y_tr_out: training encoder inputs, decoder inputs
                and decoder targets.
            x_test, y_test_in, y_test_out: the same three arrays for the
                validation set.
            ep: maximum number of epochs.
            batch_size: mini-batch size.
        """
        # Stop once val_loss has not improved for 2 consecutive epochs.
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
        # Keep only the checkpoint with the highest val_acc seen during this call.
        ck = ModelCheckpoint(filepath='segmenter_best_weights.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        callbacks = [es, ck]
        self.training_model.fit([x_tr,y_tr_in], y_tr_out, epochs = ep, callbacks=callbacks, batch_size = batch_size, validation_data=(([x_test,y_test_in]), y_test_out))

    def build_inference_model(self):
        """Build and save the encoder and single-step decoder models used for decoding.

        Writes 'final_encoder_model_segmenter.h5' and
        'final_decoder_model_segmenter.h5' to the working directory.
        """
        self.inference_encoder_model = Model(inputs= self.encoder_inputs, outputs=[self.encoder_output, self.state_h, self.state_c])

        self.inference_encoder_model.save('final_encoder_model_segmenter.h5')

        # Decoder setup: the recurrent states become explicit inputs so they can
        # be fed back step by step.
        self.decoder_state_input_h = Input(shape=(self.latent_dim,))
        self.decoder_state_input_c = Input(shape=(self.latent_dim,))
        # Accepted as a model input so decode_sequence can pass the encoder
        # output sequence; no layer built here consumes it.
        self.decoder_hidden_state_input = Input(shape=(self.max_encoder_len, self.latent_dim))

        self.decoder_embed_i = self.decoder_embed_layer(self.decoder_inputs)

        self.decoder_output_i, self.state_h_i, self.state_c_i = self.decoder_LSTM(self.decoder_embed_i, initial_state = [self.decoder_state_input_h, self.decoder_state_input_c])

        self.decoder_output_i = self.decoder_dense(self.decoder_output_i)

        # Final decoder inference model.
        self.inference_decoder_model = Model([self.decoder_inputs] + [self.decoder_hidden_state_input, self.decoder_state_input_h, self.decoder_state_input_c], [self.decoder_output_i] + [self.state_h_i, self.state_c_i])

        # Save the final inference model.
        self.inference_decoder_model.save('final_decoder_model_segmenter.h5')

    def decode_sequence(self, input_seq, i2o, o2i):
        """Greedily decode one encoded input sequence.

        Requires build_inference_model() to have been called.

        Args:
            input_seq: encoder input passed to the inference encoder's predict().
            i2o: mapping from output token index to output token.
            o2i: mapping from output token to index; must contain '<'.

        Returns:
            List of decoded tokens, excluding the '>' stop token.
        """
        e_out,e_h, e_c = self.inference_encoder_model.predict(input_seq, verbose = 0)
        # Seed the decoder with the '<' start token.
        target_seq = np.zeros((1,1))
        target_seq[0,0] = o2i['<']

        stop_condition = False
        decoded_sentence = []

        while not stop_condition:
            (output_tokens, h, c) = self.inference_decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose = 0)

            # Pick the most probable token for the last time step.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_token = i2o[sampled_token_index]

            if sampled_token != '>':
                decoded_sentence += [sampled_token]

            # Exit condition: either hit max length or find the stop token.
            if (sampled_token == '>') or (len(decoded_sentence) >= self.max_decoder_len):
                stop_condition = True

            # The token just produced becomes the next decoder input (length 1).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            # Carry the decoder states into the next step.
            (e_h, e_c) = (h, c)
        return decoded_sentence

    def word2seq(self, a2i, input_word):
        """Map each character of `input_word` through `a2i` and post-pad to max_encoder_len.

        Raises:
            KeyError: if a character of `input_word` is not a key of `a2i`.
        """
        final_seq = [a2i[c] for c in input_word]
        final_seq = pad_sequences([final_seq], maxlen=self.max_encoder_len, padding='post')[0]
        return final_seq

    def translate(self, input_word, a2i, i2o, o2i):
        """Encode `input_word` with `a2i` and return the decoded token list."""
        seq = self.word2seq(a2i, input_word).reshape(1, self.max_encoder_len)
        return self.decode_sequence(seq, i2o, o2i)

In [3]:
class bi_model:
    """Character-level encoder-decoder model with a bidirectional LSTM encoder and attention.

    Construction builds the training graph (``self.training_model``). Call
    ``compile()`` and ``fit()`` to train, ``build_inference_model()`` to derive
    and save the step-wise encoder/decoder models, and ``translate()`` to
    decode a single input word.
    """

    def __init__(self, max_encoder_len, max_decoder_len, build_inference_model_encoder_vocab, num_decoder_vocab):
        """Store hyper-parameters and build the training model.

        Args:
            max_encoder_len: fixed length of the (padded) encoder input sequences.
            max_decoder_len: upper bound on the number of tokens emitted by
                ``decode_sequence``.
            build_inference_model_encoder_vocab: size of the encoder vocabulary
                (input dimension of the encoder embedding). The parameter name
                is kept as-is so existing keyword callers keep working.
            num_decoder_vocab: size of the decoder vocabulary (input dimension
                of the decoder embedding and width of the softmax output).
        """
        self.latent_dim = 256
        self.embedding_dim = 200
        self.max_encoder_len = max_encoder_len
        self.max_decoder_len = max_decoder_len
        # Read the encoder vocabulary size from the constructor argument. The
        # previous code read an undeclared name `num_encoder_vocab`, which only
        # resolved when a module-level variable of that name happened to exist.
        self.num_encoder_vocab = build_inference_model_encoder_vocab
        self.num_decoder_vocab = num_decoder_vocab

        self.build_encoder()
        self.build_decoder()
        self.training_model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)

    def build_encoder(self):
        """Create the encoder: embedding followed by one bidirectional LSTM."""
        self.encoder_inputs = Input(shape=(self.max_encoder_len,))
        # Embedding layer; the output width is self.embedding_dim.
        self.enc_emb = Embedding(self.num_encoder_vocab, self.embedding_dim, trainable = True)(self.encoder_inputs)

        # Bidirectional LSTM layer: returns the output sequence plus the h and c
        # states of the forward and the backward LSTM.
        self.enc_lstm1 = Bidirectional(LSTM(self.latent_dim,return_sequences=True,return_state=True))
        self.encoder_outputs1, self.forw_state_h, self.forw_state_c, self.back_state_h, self.back_state_c = self.enc_lstm1(self.enc_emb)

        # Concatenate forward and backward h, and forward and backward c.
        self.final_enc_h = Concatenate()([self.forw_state_h,self.back_state_h])
        self.final_enc_c = Concatenate()([self.forw_state_c,self.back_state_c])

        # These concatenated states initialise the decoder LSTM.
        self.encoder_states =[self.final_enc_h, self.final_enc_c]

    def build_decoder(self):
        """Create the decoder: embedding, LSTM, attention over encoder outputs, softmax output."""
        self.decoder_inputs = Input(shape=(None,))

        # Decoder embedding with the same width as the encoder embedding. The
        # layer object is kept so build_inference_model() can re-apply it.
        self.dec_emb_layer = Embedding(self.num_decoder_vocab, self.embedding_dim)
        self.dec_emb = self.dec_emb_layer(self.decoder_inputs)

        # The encoder states are forward+backward concatenations, so the single
        # decoder LSTM needs twice the units (latent_dim * 2) to accept them as
        # its initial state.
        self.decoder_lstm = LSTM(self.latent_dim*2, return_sequences=True, return_state=True)
        self.decoder_outputs, _, _ = self.decoder_lstm(self.dec_emb, initial_state=self.encoder_states)

        # Attention layer (AttentionLayer comes from the local BahdanauAttention
        # module); it is called on [encoder outputs, decoder outputs].
        self.attention_layer = AttentionLayer()
        self.attention_result, self.attention_weights = self.attention_layer([self.encoder_outputs1, self.decoder_outputs])

        # Concatenate the attention output with the decoder LSTM output.
        self.decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([self.decoder_outputs, self.attention_result])

        # Dense layer with softmax over the decoder vocabulary.
        self.decoder_dense = Dense(self.num_decoder_vocab, activation='softmax')
        self.decoder_outputs = self.decoder_dense(self.decoder_concat_input)

    def compile(self):
        """Compile the training model (rmsprop, sparse categorical cross-entropy, accuracy metric)."""
        self.training_model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics = ['acc'])

    def fit(self, x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep, batch_size):
        """Train the model with early stopping and best-checkpoint saving.

        Args:
            x_tr, y_tr_in, y_tr_out: training encoder inputs, decoder inputs
                and decoder targets.
            x_test, y_test_in, y_test_out: the same three arrays for the
                validation set.
            ep: maximum number of epochs.
            batch_size: mini-batch size.
        """
        # Stop once val_loss has not improved for 2 consecutive epochs.
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
        # Keep only the checkpoint with the highest val_acc seen during this call.
        ck = ModelCheckpoint(filepath='segmenter_best_weights.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        callbacks = [es, ck]
        self.training_model.fit([x_tr,y_tr_in], y_tr_out, epochs = ep, callbacks=callbacks, batch_size = batch_size, validation_data=(([x_test,y_test_in]), y_test_out))

    def build_inference_model(self):
        """Build and save the encoder and single-step decoder models used for decoding.

        Writes 'final_encoder_model_segmenter.h5' and
        'final_decoder_model_segmenter.h5' to the working directory.
        """
        self.encoder_model_inference = Model(self.encoder_inputs, outputs = [self.encoder_outputs1, self.final_enc_h, self.final_enc_c])
        self.encoder_model_inference.save('final_encoder_model_segmenter.h5')

        # Decoder inference: the state inputs must match the number of units of
        # the trained decoder LSTM (latent_dim * 2).
        self.decoder_state_h = Input(shape=(self.latent_dim*2,))
        self.decoder_state_c = Input(shape=(self.latent_dim*2,))

        # The attention layer needs the encoder output sequence; its shape is
        # (max_encoder_len, latent_dim * 2), matching what the encoder produces.
        self.decoder_hidden_state_input = Input(shape=(self.max_encoder_len,self.latent_dim*2))
        # Decoder states fed in from the previous step.
        self.dec_states = [self.decoder_state_h, self.decoder_state_c]

        # Re-apply the trained embedding and LSTM layers.
        self.dec_emb2 = self.dec_emb_layer(self.decoder_inputs)
        self.decoder_outputs2, self.state_h2, self.state_c2 = self.decoder_lstm(self.dec_emb2, initial_state=self.dec_states)

        # Attention at inference time, using the fed-in encoder outputs.
        self.attention_result_inf, self.attention_weights_inf = self.attention_layer([self.decoder_hidden_state_input, self.decoder_outputs2])
        self.decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([self.decoder_outputs2, self.attention_result_inf])

        self.dec_states2= [self.state_h2, self.state_c2]
        self.decoder_outputs2 = self.decoder_dense(self.decoder_concat_input_inf)

        # Decoder model: (token, encoder outputs, h, c) -> (probabilities, h, c).
        self.decoder_model_inference= Model(
                            [self.decoder_inputs] + [self.decoder_hidden_state_input, self.decoder_state_h, self.decoder_state_c],
                             [self.decoder_outputs2]+ self.dec_states2)
        self.decoder_model_inference.save('final_decoder_model_segmenter.h5')

    def decode_sequence(self, input_seq, i2o, o2i):
        """Greedily decode one encoded input sequence.

        Requires build_inference_model() to have been called.

        Args:
            input_seq: encoder input passed to the inference encoder's predict().
            i2o: mapping from output token index to output token.
            o2i: mapping from output token to index; must contain '<'.

        Returns:
            List of decoded tokens, excluding the '>' stop token.
        """
        e_out ,e_h, e_c = self.encoder_model_inference.predict(input_seq, verbose = 0)
        # Seed the decoder with the '<' start token.
        target_seq = np.zeros((1,1))
        target_seq[0,0] = o2i['<']

        stop_condition = False
        decoded_sentence = []

        while not stop_condition:
            (output_tokens, h, c) = self.decoder_model_inference.predict([target_seq] + [e_out, e_h, e_c], verbose = 0)

            # Pick the most probable token for the last time step.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_token = i2o[sampled_token_index]

            if sampled_token != '>':
                decoded_sentence += [sampled_token]

            # Exit condition: either hit max length or find the stop token.
            if (sampled_token == '>') or (len(decoded_sentence) >= self.max_decoder_len):
                stop_condition = True

            # The token just produced becomes the next decoder input (length 1).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            # Carry the decoder states into the next step.
            (e_h, e_c) = (h, c)
        return decoded_sentence

    def word2seq(self, a2i, input_word):
        """Map each character of `input_word` through `a2i` and post-pad to max_encoder_len.

        Raises:
            KeyError: if a character of `input_word` is not a key of `a2i`.
        """
        final_seq = [a2i[c] for c in input_word]
        final_seq = pad_sequences([final_seq], maxlen=self.max_encoder_len, padding='post')[0]
        return final_seq

    def translate(self, input_word, a2i, i2o, o2i):
        """Encode `input_word` with `a2i` and return the decoded token list."""
        seq = self.word2seq(a2i, input_word).reshape(1, self.max_encoder_len)
        return self.decode_sequence(seq, i2o, o2i)

In [15]:
# prepare parameters

# fetch metadata: only the last tab-separated field of each line is used.
# `with` guarantees the handle is closed even if reading or parsing raises.
with open('metadata.txt') as metadata_file:
    metadata = [line.strip('\n').split('\t')[-1] for line in metadata_file]

max_encoder_len = int(metadata[0])
max_decoder_len = int(metadata[1])
num_encoder_vocab = int(metadata[2])
num_decoder_vocab = int(metadata[3])

# fetch dictionaries
# NOTE(security): pickle.load can execute arbitrary code; only load .pkl files
# that come from a trusted source.
with open('e2i.pkl', 'rb') as e2i_file:
    e2i = pickle.load(e2i_file)
with open('i2e.pkl', 'rb') as i2e_file:
    i2e = pickle.load(i2e_file)
with open('d2i.pkl', 'rb') as d2i_file:
    d2i = pickle.load(d2i_file)
with open('i2d.pkl', 'rb') as i2d_file:
    i2d = pickle.load(i2d_file)

In [21]:
# fetch training data
# Read inside `with` so the file handle is always closed (it was never closed before).
with open('training_data.txt') as training_data_file:
    raw_data = training_data_file.readlines()

# NOTE(review): no random seed is set in this cell, so the shuffled order (and
# therefore the later train/validation split) presumably differs between runs.
random.shuffle(raw_data)
raw_data = [line.strip('\n') for line in raw_data]
# Each line is "<word>\t<syllabified word>".
raw_split_data = [line.split('\t') for line in raw_data]
eng_words_list = [pair[0] for pair in raw_split_data]
syllabified_list = [pair[1] for pair in raw_split_data]

# Stored as arrays of shape (1, n); later cells iterate over row 0.
eng_words_list = np.array(eng_words_list).reshape((1,len(eng_words_list)))
syllabified_list = np.array(syllabified_list).reshape((1,len(syllabified_list)))

In [22]:
# create x_train data
# Encode every word character by character with e2i, then post-pad each
# sequence to max_encoder_len.
x_tr = []
for eng_word in eng_words_list[0]:
    x_tr.append([e2i[ch] for ch in eng_word])
x_tr = pad_sequences(x_tr, maxlen=max_encoder_len, padding='post')

In [23]:
# create y_train data
# Encode every syllabified string character by character with d2i, then
# post-pad each sequence to max_decoder_len.
y_tr = []
for syllabified in syllabified_list[0]:
    y_tr.append([d2i[ch] for ch in syllabified])
y_tr = pad_sequences(y_tr, maxlen=max_decoder_len, padding='post')

In [24]:
# split into training and validation sets
# The first 80% of the rows are kept for training; the remainder is held out.
# NOTE: this cell rebinds x_tr and y_tr to their training slices, so running it
# a second time without re-running the cells that build them shrinks them again.
split_index = int(len(x_tr) * .8)

x_tr, x_test = x_tr[:split_index], x_tr[split_index:]
y_tr, y_test = y_tr[:split_index], y_tr[split_index:]

# Decoder input drops the last column; decoder target drops the first column,
# i.e. the target is the input shifted by one position.
y_tr_in, y_tr_out = y_tr[:, :-1], y_tr[:, 1:]
y_test_in, y_test_out = y_test[:, :-1], y_test[:, 1:]

In [9]:
# Build the bidirectional attention model from the sizes read from metadata.txt.
segmenter = bi_model(
    max_encoder_len,
    max_decoder_len,
    num_encoder_vocab,
    num_decoder_vocab,
)

In [None]:
# Train from scratch: up to 50 epochs, mini-batches of 128.
segmenter.compile()
segmenter.fit(
    x_tr, y_tr_in, y_tr_out,
    x_test, y_test_in, y_test_out,
    ep=50,
    batch_size=128,
)

In [25]:
# resume training
# Requires 'segmenter_best_weights.h5' from a previous fit() to exist.
segmenter.compile()
segmenter.training_model.load_weights('segmenter_best_weights.h5')
segmenter.fit(x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep=50, batch_size=128)

# fit() stops via EarlyStopping (patience=2) without restoring the best epoch,
# so the in-memory weights are those of the last epoch. Reload the checkpoint
# that ModelCheckpoint saved for the best val_acc before deriving and saving
# the inference models, so they use the best weights rather than the last ones.
segmenter.training_model.load_weights('segmenter_best_weights.h5')
segmenter.build_inference_model()

Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.98491, saving model to segmenter_best_weights.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.98491 to 0.98626, saving model to segmenter_best_weights.h5
Epoch 3/50

Epoch 00003: val_acc improved from 0.98626 to 0.98712, saving model to segmenter_best_weights.h5
Epoch 4/50

Epoch 00004: val_acc improved from 0.98712 to 0.98983, saving model to segmenter_best_weights.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.98983 to 0.99074, saving model to segmenter_best_weights.h5
Epoch 6/50

Epoch 00006: val_acc improved from 0.99074 to 0.99188, saving model to segmenter_best_weights.h5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.99188
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.99188
Epoch 00008: early stopping


In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/

In [26]:
# Interactive segmentation prompt.
# Requires the cells defining in_data and seq2word (further down in this
# notebook) to have been run first.
word = ""
while True:
    word = input("Enter a word to be segmented, or 'quit' to exit: ")
    if word == 'quit':
        print("Goodbye!")
        break
    # A character without an entry in e2i would raise KeyError inside
    # translate() and end the whole session; report it and ask again instead.
    # (Assumes e2i supports `in`, i.e. is a dict-like mapping.)
    unknown_chars = sorted({ch for ch in word if ch not in e2i})
    if unknown_chars:
        print("Cannot segment: unsupported character(s) " + ", ".join(repr(ch) for ch in unknown_chars))
        continue
    if in_data(word, x_tr):
        print("Warning: Given word is in the original training dataset")
    print(seq2word(segmenter.translate(word, e2i, i2d, d2i)))

Enter a word to be segmented, or 'quit' to exit:  hugsy


hugs;y


Enter a word to be segmented, or 'quit' to exit:  modelifier


mod;e;lif;ier


Enter a word to be segmented, or 'quit' to exit:  charger


charg;er


Enter a word to be segmented, or 'quit' to exit:  kulipha


ku;liph;a


Enter a word to be segmented, or 'quit' to exit:  miphia


mi;phia


Enter a word to be segmented, or 'quit' to exit:  miphial


mi;phial


Enter a word to be segmented, or 'quit' to exit:  postdoctorate


post;doc;tor;ate


Enter a word to be segmented, or 'quit' to exit:  quit


Goodbye!


In [12]:
def in_data(word, data):
    """Return True if `word` occurs in the loaded word list, else False.

    The lookup is made against row 0 of the module-level `eng_words_list`;
    the `data` argument is accepted but not consulted.
    """
    # `[word] in <array>` is kept as-is: on the NumPy array built when the
    # training data is loaded, the one-element list is compared element-wise
    # against every stored word.
    return [word] in eng_words_list[0]

In [13]:
def seq2word(array):
    """Concatenate the string tokens of `array`, in order, into a single string."""
    return "".join(array)