In [1]:
from tensorflow import keras
import numpy as np
import random

In [2]:
# Potential issue with the current dataset and training: every example is an onset + rime pair.
# What about words that consist of a rime only (no onset)?
# Should such a word be encoded as a blank onset + rime, or as a rime alone?

# If there exists a rime whose pronunciation depends on the type or presence of the onset, then an explicit blank onset is required.

# Prepare and clean the data.
# Each line of split_words.txt holds three tab-separated fields:
#   field 0 = word
#   field 1 = onset
#   field 2 = rime
# The context manager closes the file handle as soon as the lines are read.
with open("split_words.txt") as word_file:
    word_list = word_file.readlines()

# Drop the trailing newline, then skip blank lines (e.g. an empty last line),
# which would otherwise fail the three-way unpack below.
word_list = [word.strip("\n") for word in word_list]
word_list = [entry for entry in word_list if entry.strip()]
random.shuffle(word_list)

# Collect rows in plain Python lists and convert once at the end; calling
# np.vstack per row re-copies the whole array on every iteration.
eng_rows = []
or_rows = []

for sets in word_list:
    word, onset, rime = sets.split('\t')
    eng_rows.append([word])
    # '<' and '>' are the start / stop markers of the decoder sequence.
    or_rows.append(['<', onset, rime, '>'])

# reshape keeps the (n, 1) and (n, 4) layouts even when there are no rows.
eng_list = np.array(eng_rows, dtype=str).reshape(-1, 1)
or_list = np.array(or_rows, dtype=str).reshape(-1, 4)

    
# pairs = list(zip(eng_list,or_list))
# random.shuffle(pairs)

In [3]:
# create dictionaries
# alpha_vocab      : every distinct character occurring in the input words
# onset_rime_vocab : every distinct decoder token ('<', '>', onsets, rimes)

# in this attempt, we won't distinguish onsets and rimes categorically

# eng_list has shape (n, 1): each row wraps a single word string, so the word
# is taken out with word[0] before iterating over its characters. A set
# removes duplicates without scanning a growing list for every candidate.
alpha_vocab = sorted({c for word in eng_list for c in word[0]})

# or_list rows are already sequences of tokens, so each element is one token.
onset_rime_vocab = sorted({c for e in or_list for c in e})

# Ids start at 1; id 0 is left unassigned (pad_sequences pads with 0 by default).
alpha_to_int = {a: i for i, a in enumerate(alpha_vocab, 1)}
or_to_int = {a: i for i, a in enumerate(onset_rime_vocab, 1)}

# Reverse lookups: id -> character / token.
int_to_alpha = {i: a for i, a in enumerate(alpha_vocab, 1)}
int_to_or = {i: a for i, a in enumerate(onset_rime_vocab, 1)}

In [4]:
# Sequence lengths and vocabulary sizes used to shape the model.

# Unwrap the single word stored in each row of eng_list.
arr_of_words = [row[0] for row in eng_list]

# Length, in characters, of the longest input word.
max_encoder_len = max(map(len, arr_of_words))
# Each decoder row holds four tokens: '<', onset, rime, '>'.
max_decoder_len = 4
# One extra slot on top of the vocabulary, since token ids start at 1.
num_encoder_vocab = 1 + len(alpha_vocab)
num_decoder_vocab = 1 + len(onset_rime_vocab)

In [5]:
# Encoder training input: every word becomes the list of its character ids,
# then all lists are right-padded to max_encoder_len.
x_tr = [[alpha_to_int[ch] for ch in row[0]] for row in eng_list]
x_tr = keras.preprocessing.sequence.pad_sequences(
    x_tr,
    maxlen=max_encoder_len,
    padding='post',
)

In [6]:
# Decoder training sequences: every row of or_list becomes the list of its
# token ids, then all lists are right-padded to max_decoder_len.
y_tr = [[or_to_int[token] for token in row] for row in or_list]
y_tr = keras.preprocessing.sequence.pad_sequences(
    y_tr,
    maxlen=max_decoder_len,
    padding='post',
)

In [7]:
# Held-out split, currently commented out -- every example is used for training.
# split_index = int(len(word_list) * .9)
#
# y_test = y_tr[split_index:]
# y_test_in = y_test[:, :-1]
# y_test_out = y_test[:, 1:]
#
# y_tr = y_tr[:split_index]
#
# x_test = x_tr[split_index:]
# x_tr = x_tr[:split_index]

# The decoder input drops the last token of every row and the expected output
# drops the first, so input position t is paired with target token t + 1.
y_tr_in, y_tr_out = y_tr[:, :-1], y_tr[:, 1:]

In [8]:
# Sizes shared by the encoder and the decoder.
latent_dim = 256     # units per LSTM layer
embedding_dim = 200  # width of the embedding vectors

# Encoder: embedding followed by a stack of three LSTM layers.
encoder_inputs = keras.layers.Input(shape=(max_encoder_len, ))
encoder_embed = keras.layers.Embedding(
    num_encoder_vocab,
    embedding_dim,
    trainable=True,
)(encoder_inputs)

# The three encoder LSTMs share one configuration.
encoder_lstm_kwargs = dict(
    return_sequences=True,
    return_state=True,
    dropout=0.4,
    recurrent_dropout=0.4,
)

# Layer 1 reads the embedded characters.
encoder_LSTM1 = keras.layers.LSTM(latent_dim, **encoder_lstm_kwargs)
encoder_output1, state_h1, state_c1 = encoder_LSTM1(encoder_embed)

# Layer 2 reads the full output sequence of layer 1.
encoder_LSTM2 = keras.layers.LSTM(latent_dim, **encoder_lstm_kwargs)
encoder_output2, state_h2, state_c2 = encoder_LSTM2(encoder_output1)

# Layer 3 produces the outputs and states that are handed to the decoder.
encoder_LSTM3 = keras.layers.LSTM(latent_dim, **encoder_lstm_kwargs)
encoder_output, state_h, state_c = encoder_LSTM3(encoder_output2)

In [9]:
# Decoder: input -> embedding -> single LSTM -> per-timestep softmax.
# The layers are kept in their own variables because they are applied again
# when the inference decoder is assembled.

decoder_inputs = keras.layers.Input(shape=(None,))

decoder_embed_layer = keras.layers.Embedding(
    num_decoder_vocab,
    embedding_dim,
    trainable=True,
)
decoder_embed = decoder_embed_layer(decoder_inputs)

# The decoder LSTM starts from the final states of the third encoder LSTM.
decoder_LSTM = keras.layers.LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    dropout=0.4,
    recurrent_dropout=0.2,
)
# With return_state=True the LSTM returns (outputs, hidden state, cell state);
# the two state variables keep their original names for compatibility.
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_LSTM(
    decoder_embed,
    initial_state=[state_h, state_c],
)

# Output layer: TimeDistributed applies the same softmax Dense layer at
# every timestep (axis 1 of the input is treated as the time axis).
decoder_dense = keras.layers.TimeDistributed(
    keras.layers.Dense(num_decoder_vocab, activation='softmax')
)
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> token distributions.
model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [10]:
# Compile with integer-label cross-entropy; accuracy is reported as 'acc'.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Write a checkpoint whenever the monitored training accuracy reaches a new maximum.
best_acc_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='or_best_weights.h5',
    monitor='acc',
    mode='max',
    save_best_only=True,
    verbose=2,
)
Callbacks = [best_acc_checkpoint]

In [11]:
# Train on (encoder input, shifted decoder input) -> shifted decoder target.
model.fit(
    x=[x_tr, y_tr_in],
    y=y_tr_out,
    batch_size=50,
    epochs=50,
    callbacks=Callbacks,
)

Epoch 1/50
Epoch 1: acc improved from -inf to 0.31375, saving model to or_best_weights.h5
Epoch 2/50
Epoch 2: acc improved from 0.31375 to 0.33497, saving model to or_best_weights.h5
Epoch 3/50
Epoch 3: acc improved from 0.33497 to 0.34721, saving model to or_best_weights.h5
Epoch 4/50
Epoch 4: acc improved from 0.34721 to 0.35822, saving model to or_best_weights.h5
Epoch 5/50
Epoch 5: acc improved from 0.35822 to 0.36761, saving model to or_best_weights.h5
Epoch 6/50
Epoch 6: acc improved from 0.36761 to 0.37862, saving model to or_best_weights.h5
Epoch 7/50
Epoch 7: acc improved from 0.37862 to 0.40310, saving model to or_best_weights.h5
Epoch 8/50
Epoch 8: acc improved from 0.40310 to 0.42513, saving model to or_best_weights.h5
Epoch 9/50
Epoch 9: acc improved from 0.42513 to 0.45002, saving model to or_best_weights.h5
Epoch 10/50
Epoch 10: acc improved from 0.45002 to 0.47450, saving model to or_best_weights.h5
Epoch 11/50
Epoch 11: acc improved from 0.47450 to 0.49368, saving mode

Epoch 33: acc improved from 0.90779 to 0.92289, saving model to or_best_weights.h5
Epoch 34/50
Epoch 34: acc improved from 0.92289 to 0.92901, saving model to or_best_weights.h5
Epoch 35/50
Epoch 35: acc improved from 0.92901 to 0.93921, saving model to or_best_weights.h5
Epoch 36/50
Epoch 36: acc improved from 0.93921 to 0.94778, saving model to or_best_weights.h5
Epoch 37/50
Epoch 37: acc improved from 0.94778 to 0.95716, saving model to or_best_weights.h5
Epoch 38/50
Epoch 38: acc did not improve from 0.95716
Epoch 39/50
Epoch 39: acc improved from 0.95716 to 0.96818, saving model to or_best_weights.h5
Epoch 40/50
Epoch 40: acc did not improve from 0.96818
Epoch 41/50
Epoch 41: acc improved from 0.96818 to 0.97348, saving model to or_best_weights.h5
Epoch 42/50
Epoch 42: acc improved from 0.97348 to 0.97634, saving model to or_best_weights.h5
Epoch 43/50
Epoch 43: acc improved from 0.97634 to 0.97797, saving model to or_best_weights.h5
Epoch 44/50
Epoch 44: acc improved from 0.97797

<keras.callbacks.History at 0x18d97be40d0>

In [12]:
# creating the inference model
# Restore the best checkpoint INTO the existing model. The inference models
# built next reuse this model's own tensors and layer objects
# (encoder_inputs, decoder_embed_layer, decoder_LSTM, decoder_dense), so the
# weights have to be loaded in place. keras.models.load_model would build a
# separate model and only rebind the name `model`, leaving the inference
# graph on whatever weights the last training epoch produced.
model.load_weights("or_best_weights.h5")

In [13]:
# Encoder inference model: padded input word -> encoder output sequence plus
# the final hidden and cell states of the third encoder LSTM.
encoder_model_i = keras.models.Model(
    inputs=encoder_inputs,
    outputs=[encoder_output, state_h, state_c],
)

# Inputs through which the caller supplies the decoder state at each step.
decoder_state_input_h = keras.layers.Input(shape=(latent_dim,))
decoder_state_input_c = keras.layers.Input(shape=(latent_dim,))
# Declared as a model input below, but not consumed by any layer in this cell.
decoder_hidden_state_input = keras.layers.Input(shape=(max_encoder_len, latent_dim))

# Re-apply the trained decoder layers, starting from the supplied states.
decoder_embed_i = decoder_embed_layer(decoder_inputs)
decoder_states_in = [decoder_state_input_h, decoder_state_input_c]
decoder_output_i, state_h_i, state_c_i = decoder_LSTM(
    decoder_embed_i,
    initial_state=decoder_states_in,
)
decoder_output_i = decoder_dense(decoder_output_i)

# Decoder inference model: (previous token, encoder outputs, h, c)
# -> (token distribution, new h, new c).
decoder_model_i = keras.models.Model(
    inputs=[
        decoder_inputs,
        decoder_hidden_state_input,
        decoder_state_input_h,
        decoder_state_input_c,
    ],
    outputs=[decoder_output_i, state_h_i, state_c_i],
)

In [14]:
def decode_sequence(input_seq, verbose=True):
    """Greedily decode one encoded word into its onset / rime tokens.

    Args:
        input_seq: encoder input for a single word, in the form accepted by
            encoder_model_i.predict.
        verbose: when True (the default, matching the previous behaviour),
            print every token as soon as it is decoded.

    Returns:
        List of decoded tokens, without the '<' start and '>' stop markers.
    """
    # verbose=0 keeps Keras from printing a progress line per predict call.
    e_out, e_h, e_c = encoder_model_i.predict(input_seq, verbose=0)

    # Seed the decoder with the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = or_to_int['<']

    decoded_sentence = []

    while True:
        output_tokens, h, c = decoder_model_i.predict(
            [target_seq, e_out, e_h, e_c], verbose=0
        )

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Token ids start at 1, so id 0 (the padding value) has no entry in
        # int_to_or; .get avoids a KeyError and the id is treated as a stop.
        sampled_token = int_to_or.get(sampled_token_index)

        if sampled_token is None or sampled_token == '>':
            break

        if verbose:
            print(sampled_token)
        decoded_sentence.append(sampled_token)

        # Safety stop in case the model never emits the stop marker.
        if len(decoded_sentence) >= max_decoder_len:
            break

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return decoded_sentence


In [15]:
def word2seq(input_word):
    """Encode a word as a fixed-length sequence of character ids.

    Each character is looked up in alpha_to_int (a character missing from
    that dictionary raises KeyError), and the resulting ids are right-padded
    to max_encoder_len by pad_sequences.

    Returns:
        The single padded row produced by pad_sequences.
    """
    char_ids = [alpha_to_int[ch] for ch in input_word]
    padded_batch = keras.preprocessing.sequence.pad_sequences(
        [char_ids],
        maxlen=max_encoder_len,
        padding='post',
    )
    return padded_batch[0]

In [24]:
# Read one word, encode it as a batch of one, and print the decoded tokens.
word = input("Enter a single syllable word: ")
encoded_word = word2seq(word)
word_seq = encoded_word.reshape((1, max_encoder_len))
decoded_units = decode_sequence(word_seq)
print(decoded_units)

Enter a single syllable word: crink
cr
ink
['cr', 'ink']
