In [22]:
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Concatenate, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences
from BahdanauAttention import AttentionLayer
import numpy as np
import random
import eng_to_ipa as ipa
import pickle

In [23]:
class bi_model:
    """Character-level encoder-decoder translator with attention.

    The encoder is an embedding followed by a bidirectional LSTM. The decoder
    is an embedding followed by a single LSTM (twice the encoder's
    per-direction width) whose output is concatenated with the result of
    ``AttentionLayer`` and projected through a softmax ``Dense`` layer.

    ``training_model`` is built in ``__init__``. The step-by-step inference
    models are built on demand by ``build_inference_model`` and are required
    before ``decode_sequence`` / ``translate`` can be used.
    """

    def __init__(self, max_encoder_len, max_decoder_len, build_inference_model_encoder_vocab, num_decoder_vocab):
        """Store the hyper-parameters and build the training graph.

        Args:
            max_encoder_len: fixed length of the padded encoder input sequences.
            max_decoder_len: maximum number of tokens ``decode_sequence`` emits.
            build_inference_model_encoder_vocab: size of the encoder vocabulary
                (input dimension of the encoder embedding). The parameter name
                is kept unchanged so existing callers keep working.
            num_decoder_vocab: size of the decoder vocabulary (input dimension
                of the decoder embedding and width of the softmax output).
        """
        self.latent_dim = 256
        self.embedding_dim = 200
        self.max_encoder_len = max_encoder_len
        self.max_decoder_len = max_decoder_len
        # Take the value from the constructor argument rather than from a
        # same-named notebook global, which may be missing or stale.
        self.num_encoder_vocab = build_inference_model_encoder_vocab
        self.num_decoder_vocab = num_decoder_vocab

        self.build_encoder()
        self.build_decoder()
        self.training_model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)

    def build_encoder(self):
        """Create the encoder: input -> embedding -> bidirectional LSTM.

        Sets ``encoder_inputs``, ``encoder_outputs1`` (per-timestep outputs
        consumed by the attention layer) and ``encoder_states`` (the
        concatenated forward/backward ``h`` and ``c`` states used to
        initialise the decoder LSTM).
        """
        self.encoder_inputs = Input(shape=(self.max_encoder_len,))
        # Trainable embedding of width self.embedding_dim.
        self.enc_emb = Embedding(self.num_encoder_vocab, self.embedding_dim, trainable = True)(self.encoder_inputs)

        # Bidirectional LSTM; with return_state=True it yields the sequence
        # output followed by the forward (h, c) and backward (h, c) states.
        self.enc_lstm1 = Bidirectional(LSTM(self.latent_dim,return_sequences=True,return_state=True))
        self.encoder_outputs1, self.forw_state_h, self.forw_state_c, self.back_state_h, self.back_state_c = self.enc_lstm1(self.enc_emb)

        # Concatenate the forward and backward states so they match the
        # decoder LSTM width (latent_dim * 2).
        self.final_enc_h = Concatenate()([self.forw_state_h,self.back_state_h])
        self.final_enc_c = Concatenate()([self.forw_state_c,self.back_state_c])

        # Initial state handed to the decoder.
        self.encoder_states =[self.final_enc_h, self.final_enc_c]

    def build_decoder(self):
        """Create the decoder: input -> embedding -> LSTM -> attention -> softmax.

        The embedding, LSTM, attention and dense layers are stored on ``self``
        because ``build_inference_model`` reuses the same (trained) layers.
        After this call ``decoder_outputs`` holds the softmax output tensor.
        """
        self.decoder_inputs = Input(shape=(None,))

        # Decoder embedding uses the same width as the encoder embedding. The
        # layer object is kept so it can be re-applied at inference time.
        self.dec_emb_layer = Embedding(self.num_decoder_vocab, self.embedding_dim)
        self.dec_emb = self.dec_emb_layer(self.decoder_inputs)

        # The encoder is bidirectional, so its concatenated states are
        # latent_dim * 2 wide; the single decoder LSTM must match that width
        # to accept them as its initial state.
        self.decoder_lstm = LSTM(self.latent_dim*2, return_sequences=True, return_state=True)
        self.decoder_outputs, _, _ = self.decoder_lstm(self.dec_emb, initial_state=self.encoder_states)

        # Attention over the encoder outputs, driven by the decoder outputs.
        self.attention_layer = AttentionLayer()
        self.attention_result, self.attention_weights = self.attention_layer([self.encoder_outputs1, self.decoder_outputs])

        # Concatenate the attention result with the decoder LSTM output.
        self.decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([self.decoder_outputs, self.attention_result])

        # Per-timestep softmax over the decoder vocabulary. From here on,
        # self.decoder_outputs refers to the softmax output, not the LSTM output.
        self.decoder_dense = Dense(self.num_decoder_vocab, activation='softmax')
        self.decoder_outputs = self.decoder_dense(self.decoder_concat_input)

    def compile(self):
        """Compile ``training_model`` (rmsprop, sparse categorical cross-entropy, accuracy)."""
        self.training_model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics = ['acc'])

    def fit(self, x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep, batch_size):
        """Train ``training_model`` with early stopping and best-weights checkpointing.

        Args:
            x_tr, y_tr_in, y_tr_out: training encoder inputs, decoder inputs
                and decoder targets.
            x_test, y_test_in, y_test_out: the same three arrays for validation.
            ep: maximum number of epochs.
            batch_size: mini-batch size.

        Training stops once ``val_loss`` has not improved for 2 epochs. The
        weights with the best ``val_acc`` are written to
        ``syllable_translator_best_weights.h5``.
        """
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
        ck = ModelCheckpoint(filepath='syllable_translator_best_weights.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        Callbacks = [es, ck]
        self.training_model.fit([x_tr,y_tr_in], y_tr_out, epochs = ep, callbacks=Callbacks, batch_size = batch_size, validation_data=(([x_test,y_test_in]), y_test_out))

    def build_inference_model(self):
        """Build and save the encoder and decoder models used for step-wise decoding.

        Reuses the layers created by ``build_encoder`` / ``build_decoder`` so
        the inference models share the trained weights. Writes
        ``s2p_encoder_inference_model.h5`` and ``s2p_decoder_inference_model.h5``.
        """
        self.encoder_model_inference = Model(self.encoder_inputs, outputs = [self.encoder_outputs1, self.final_enc_h, self.final_enc_c])

        self.encoder_model_inference.save('s2p_encoder_inference_model.h5')

        # Decoder inference inputs: the previous step's LSTM states. Their
        # width must equal the number of units of the trained decoder LSTM.
        self.decoder_state_h = Input(shape=(self.latent_dim*2,))
        self.decoder_state_c = Input(shape=(self.latent_dim*2,))

        # The attention layer also needs the encoder's per-timestep outputs,
        # shaped (max_encoder_len, latent_dim * 2) to match the encoder.
        self.decoder_hidden_state_input = Input(shape=(self.max_encoder_len,self.latent_dim*2))
        self.dec_states = [self.decoder_state_h, self.decoder_state_c]

        # Re-apply the trained embedding and LSTM, seeded with the fed-in states.
        self.dec_emb2 = self.dec_emb_layer(self.decoder_inputs)
        self.decoder_outputs2, self.state_h2, self.state_c2 = self.decoder_lstm(self.dec_emb2, initial_state=self.dec_states)

        # Attention at inference time, using the fed-in encoder outputs.
        self.attention_result_inf, self.attention_weights_inf = self.attention_layer([self.decoder_hidden_state_input, self.decoder_outputs2])
        self.decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([self.decoder_outputs2, self.attention_result_inf])

        self.dec_states2= [self.state_h2, self.state_c2]
        self.decoder_outputs2 = self.decoder_dense(self.decoder_concat_input_inf)

        # Decoder step model: (token, encoder outputs, h, c) -> (probabilities, h, c).
        self.decoder_model_inference= Model(
                            [self.decoder_inputs] + [self.decoder_hidden_state_input, self.decoder_state_h, self.decoder_state_c],
                             [self.decoder_outputs2]+ self.dec_states2)
        self.decoder_model_inference.save('s2p_decoder_inference_model.h5')

    def decode_sequence(self, input_seq, i2o, o2i):
        """Greedily decode one encoded input sequence into a list of output tokens.

        Requires ``build_inference_model`` to have been called.

        Args:
            input_seq: encoder input passed straight to the encoder inference model.
            i2o: mapping from output index to output token.
            o2i: mapping from output token to output index; must contain the
                start token ``'<'``.

        Returns:
            The decoded tokens, without the stop token ``'>'``. Decoding ends
            at the first ``'>'`` or once ``max_decoder_len`` tokens are produced.
        """
        e_out ,e_h, e_c = self.encoder_model_inference.predict(input_seq, verbose = 0)
        # Start decoding from the start-of-sequence token.
        target_seq = np.zeros((1,1))
        target_seq[0,0] = o2i['<']

        stop_condition = False
        decoded_sentence = []

        while not stop_condition:
            (output_tokens, h, c) = self.decoder_model_inference.predict([target_seq] + [e_out, e_h, e_c], verbose = 0)

            # Greedy choice: most probable token at the last timestep.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_token = i2o[sampled_token_index]

            if sampled_token != '>':
                decoded_sentence += [sampled_token]

            # Exit condition: either hit max length or find the stop token.
            if (sampled_token == '>') or (len(decoded_sentence) >= self.max_decoder_len):
                stop_condition = True

            # Feed the sampled token back in as the next decoder input (length 1).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            # Carry the decoder states over to the next step.
            (e_h, e_c) = (h, c)
        return decoded_sentence

    def word2seq(self, a2i, input_word):
        """Encode ``input_word`` as a post-padded integer sequence of length ``max_encoder_len``.

        Args:
            a2i: mapping from input character to integer index.
            input_word: the string to encode.

        Raises:
            KeyError: if ``input_word`` contains a character missing from ``a2i``.

        Note:
            ``pad_sequences`` is called with its default truncation, so inputs
            longer than ``max_encoder_len`` are shortened (Keras' default
            ``truncating='pre'`` drops leading items).
        """
        final_seq = [a2i[c] for c in input_word]
        final_seq = pad_sequences([final_seq], maxlen=self.max_encoder_len, padding='post')[0]
        return final_seq

    def translate(self, input_word, a2i, i2o, o2i):
        """Encode ``input_word`` and decode it; returns the list of output tokens."""
        seq = self.word2seq(a2i, input_word).reshape(1, self.max_encoder_len)
        return self.decode_sequence(seq, i2o, o2i)

In [24]:
# prepare parameters

# fetch metadata: each line is tab-separated and only its last field is used.
# Line order: max encoder length, max decoder length, encoder vocabulary size,
# decoder vocabulary size.
# `with` closes the file even if reading or parsing raises.
with open('metadata_ipa.txt') as metadata_file:
    metadata = [line.strip('\n').split('\t')[-1] for line in metadata_file]

max_encoder_len = int(metadata[0])
max_decoder_len = int(metadata[1])
num_encoder_vocab = int(metadata[2])
num_decoder_vocab = int(metadata[3])

# fetch dictionaries
# e2i / i2e: encoder-side character <-> index lookups
# d2i / i2d: decoder-side character <-> index lookups
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('ipa_e2i.pkl', 'rb') as e2i_file:
    e2i = pickle.load(e2i_file)
with open('ipa_i2e.pkl', 'rb') as i2e_file:
    i2e = pickle.load(i2e_file)
with open('ipa_d2i.pkl', 'rb') as d2i_file:
    d2i = pickle.load(d2i_file)
with open('ipa_i2d.pkl', 'rb') as i2d_file:
    i2d = pickle.load(i2d_file)

In [25]:
# fetch training data: one "<syllable>\t<ipa>" pair per line.
# `with` guarantees the file is closed (it was previously left open).
with open('training_data_ipa.txt', encoding = 'UTF-8') as training_data_file:
    raw_data = [line.strip('\n') for line in training_data_file]

# Skip empty lines (e.g. a trailing blank line), which have no second field
# and would otherwise raise IndexError below.
raw_split_data = [line.split('\t') for line in raw_data if line]
syllable_list = [pair[0] for pair in raw_split_data]
ipa_list = [pair[1] for pair in raw_split_data]

# Stored as (1, n) arrays; later cells index them with [0].
syllable_list = np.array(syllable_list).reshape((1,len(syllable_list)))
ipa_list = np.array(ipa_list).reshape((1,len(ipa_list)))

In [26]:
# create x_train data
# Each syllable becomes a list of integer ids (one per character, looked up in
# e2i); all rows are then post-padded to max_encoder_len.
x_tr = pad_sequences(
    [[e2i[ch] for ch in syllable] for syllable in syllable_list[0]],
    maxlen=max_encoder_len,
    padding='post',
)

In [27]:
# create y_train data
# Each IPA string becomes a list of integer ids (one per character, looked up
# in d2i); all rows are then post-padded to max_decoder_len.
y_tr = pad_sequences(
    [[d2i[ch] for ch in ipa_string] for ipa_string in ipa_list[0]],
    maxlen=max_decoder_len,
    padding='post',
)

In [28]:
# split into training (first 80% of rows) and validation (remaining rows) sets
# NOTE: x_tr and y_tr are overwritten with their training slices, so running
# this cell a second time would shrink them again.
split_index = int(len(x_tr) * .8)

# encoder inputs
x_tr, x_test = x_tr[:split_index], x_tr[split_index:]

# decoder sequences
y_tr, y_test = y_tr[:split_index], y_tr[split_index:]

# decoder input is the sequence without its last step; the target is the same
# sequence shifted one step to the left
y_tr_in, y_tr_out = y_tr[:, :-1], y_tr[:, 1:]
y_test_in, y_test_out = y_test[:, :-1], y_test[:, 1:]

In [37]:
# number of training rows left after the split
print(x_tr.shape[0])

3914


In [29]:
# Build the encoder-decoder network from the sizes read from the metadata file.
syllable_translator = bi_model(
    max_encoder_len,
    max_decoder_len,
    num_encoder_vocab,
    num_decoder_vocab,
)

In [30]:
# Compile the training model, then train for at most 50 epochs in batches of 128,
# validating on the held-out split.
syllable_translator.compile()
syllable_translator.fit(
    x_tr, y_tr_in, y_tr_out,
    x_test, y_test_in, y_test_out,
    ep=50,
    batch_size=128,
)

Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.71336, saving model to syllable_translator_best_weights.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.71336 to 0.77988, saving model to syllable_translator_best_weights.h5
Epoch 3/50

Epoch 00003: val_acc did not improve from 0.77988
Epoch 4/50

Epoch 00004: val_acc improved from 0.77988 to 0.84704, saving model to syllable_translator_best_weights.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.84704 to 0.87909, saving model to syllable_translator_best_weights.h5
Epoch 6/50

Epoch 00006: val_acc improved from 0.87909 to 0.91081, saving model to syllable_translator_best_weights.h5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.91081
Epoch 8/50

Epoch 00008: val_acc improved from 0.91081 to 0.95365, saving model to syllable_translator_best_weights.h5
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.95365
Epoch 10/50

Epoch 00010: val_acc improved from 0.95365 to 0.95870, saving model to syllable_translator

In [36]:
# Restore the checkpointed weights into the training model. The path is the
# same file that the ModelCheckpoint callback in bi_model.fit writes to.
# NOTE(review): the commented-out calls below (for resuming training) refer to
# `segmenter`, which is not defined anywhere in this notebook; presumably they
# should use `syllable_translator`. Confirm before re-enabling them.
# segmenter.compile()
syllable_translator.training_model.load_weights('syllable_translator_best_weights.h5')
# segmenter.fit(x_tr, y_tr_in, y_tr_out, x_test, y_test_in, y_test_out, ep=50, batch_size=128)
# Build the encoder/decoder inference models (this also saves both to .h5
# files); required before translate() can be called.
syllable_translator.build_inference_model()



In [35]:
# Interactive loop: read syllables from the user and print the predicted IPA
# until 'quit' is entered.
word = ""
while True:
    # Strip surrounding whitespace so a stray space is not treated as a character.
    word = input("Enter a syllable to be translated, or 'quit' to exit: ").strip()
    if word == 'quit':
        print("Goodbye!")
        break
    if not word:
        # Nothing to translate; ask again.
        continue

    # Reject characters the encoder vocabulary does not know instead of
    # letting the lookup inside translate() abort the loop with a KeyError.
    unknown_chars = sorted({c for c in word if c not in e2i})
    if unknown_chars:
        print("Cannot translate: unsupported character(s) " + ", ".join(repr(c) for c in unknown_chars))
        continue

    # pad_sequences would silently cut over-long input down to the encoder
    # length, so the model would translate a different syllable than typed.
    if len(word) > syllable_translator.max_encoder_len:
        print("Cannot translate: input is longer than " + str(syllable_translator.max_encoder_len) + " characters")
        continue

    if in_data(word, x_tr):
        print("Warning: Given syllable is in the original training dataset")
    print(seq2word(syllable_translator.translate(word, e2i, i2d, d2i)))

Enter a syllable to be translated, or 'quit' to exit:  sus


səs


Enter a syllable to be translated, or 'quit' to exit:  ses


sɛs


Enter a syllable to be translated, or 'quit' to exit:  ing


ɪŋ


Enter a syllable to be translated, or 'quit' to exit:  kint


nɪnt


Enter a syllable to be translated, or 'quit' to exit:  nint


nɪnt


Enter a syllable to be translated, or 'quit' to exit:  grint


grɪnt


Enter a syllable to be translated, or 'quit' to exit:  ung


əŋ


Enter a syllable to be translated, or 'quit' to exit:  aigh


eɪg


Enter a syllable to be translated, or 'quit' to exit:  aight


eɪt


Enter a syllable to be translated, or 'quit' to exit:  ight


aɪt


Enter a syllable to be translated, or 'quit' to exit:  ment


mɛnt


Enter a syllable to be translated, or 'quit' to exit:  pin


pɪn


Enter a syllable to be translated, or 'quit' to exit:  quit


Goodbye!


In [34]:
def in_data(word, data):
    """Return True if ``word`` is one of the syllables in the global ``syllable_list``.

    Args:
        word: the syllable string to look up.
        data: unused; the lookup always goes against the module-level
            ``syllable_list`` (built from the whole data file, before the
            train/validation split).
    """
    return word in syllable_list[0]

In [21]:
def seq2word(array):
    """Join an iterable of string tokens into a single string, in order."""
    return "".join(array)