In [1]:
# Dependencies
import re, warnings
import numpy as np
import pandas as pd

import tensorflow as tf 
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# Silence all warnings so library chatter does not flood the cell outputs.
# NOTE(review): this also hides deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Globals
DATA_PATH = 'data/'
MODEL_PATH = 'models/'
SEED = 42  # seeds the row shuffle below so a Restart & Run All sees the same row order

# Read data from source and shuffle the rows.
# random_state makes the shuffle deterministic; previously the order (and with
# it whatever rows a later tail-based split holds out) changed on every run.
# Only the shuffle is seeded here.
d = pd.read_csv(DATA_PATH+'english_US_ipa.csv.gz', compression='gzip') \
    .sample(frac=1.0, random_state=SEED)
# d.head()

In [2]:
# Preprocessing
"""
Step 1: when a record has several comma-separated IPA transcriptions
(e.g. /first/, /second/), keep only the first one.
Step 2: remove the '/' characters that wrap the transcription.
"""

def preprocess_ipa_string(ipa):
    """Return the first IPA transcription in ``ipa`` without its '/' wrappers.

    Parameters
    ----------
    ipa : str or any
        Raw transcription field, e.g. ``'/first/, /second/'``. Several
        transcriptions may be separated by commas.

    Returns
    -------
    str or float
        The first transcription with surrounding whitespace and leading or
        trailing '/' characters removed, or ``np.nan`` when ``ipa`` is not a
        string.
    """
    # Non-string values (presumably NaN from missing cells; verify against the
    # data) map to NaN. The type is checked explicitly instead of relying on a
    # bare ``except`` with ``return`` inside ``finally``, which also swallowed
    # KeyboardInterrupt and genuine programming errors.
    if not isinstance(ipa, str):
        return np.nan

    # Split on the bare comma so '/a/,/b/' (no space after the comma) is
    # handled the same way as '/a/, /b/'; then trim the stray whitespace.
    first_transcription = ipa.split(',')[0].strip()
    return first_transcription.strip('/')

# Add the cleaned transcription column, then discard the raw 'ipa' column and
# every row that still contains a missing value.
d = (
    d.assign(formatted_ipa=d['ipa'].apply(preprocess_ipa_string))
     .drop(columns=['ipa'])
     .dropna()
)
# d.head()

In [3]:
# Tokenization and vocabulary
"""
Build character-level vocabularies: every distinct character that occurs in
the English strings becomes an input token, and every distinct character in
the formatted IPA strings becomes an output token.
"""

# Source and target strings, one pair per dataframe row
inputs = d['english'].values
outputs = d['formatted_ipa'].values

# Collect the distinct characters on each side.
# The vocabularies are derived from the same by-name columns selected above
# rather than from positional d.iloc[i, 0] / d.iloc[i, 1] lookups, so they no
# longer depend on the column order of the dataframe (and avoid one slow
# .iloc call per row).
input_tokens = sorted({token for sequence in inputs for token in sequence})
output_tokens = sorted({token for sequence in outputs for token in sequence})

# Dimensions: vocabulary sizes, i.e. the width of a one-hot vector on each side
encoder_token_dim = len(input_tokens)
decoder_token_dim = len(output_tokens)

In [4]:
# Token -> index lookups
input_features_dict = {token: i for i, token in enumerate(input_tokens)}
output_features_dict = {token: i for i, token in enumerate(output_tokens)}

# Index -> token lookups (inverse of the mappings above).
# enumerate() yields (index, token). The previous code unpacked the pair as
# (token, i) and then stored (i, token), which produced token -> index again,
# so an index lookup would have raised KeyError.
rev_input_features_dict = {i: token for i, token in enumerate(input_tokens)}
rev_output_features_dict = {i: token for i, token in enumerate(output_tokens)}

# Length of the longest sequence on each side
max_encoder_seq_len = max(len(sequence) for sequence in inputs)
max_decoder_seq_len = max(len(sequence) for sequence in outputs)

In [5]:
# Zero-initialised float32 buffers laid out as (sample, timestep, token index)
encoder_inputs = np.zeros(
    shape=(len(inputs), max_encoder_seq_len, encoder_token_dim),
    dtype=np.float32,
)
decoder_inputs = np.zeros(
    shape=(len(inputs), max_decoder_seq_len, decoder_token_dim),
    dtype=np.float32,
)
decoder_outputs = np.zeros(
    shape=(len(outputs), max_decoder_seq_len, decoder_token_dim),
    dtype=np.float32,
)

In [6]:
# One-hot encode every (input, output) string pair into the preallocated arrays.
# The loop variables are deliberately not called `inputs` / `outputs`: reusing
# those names rebound the full arrays to the last string pair, which broke any
# later use of them and any re-run of the cells that depend on them.
for i, (input_seq, output_seq) in enumerate(zip(inputs, outputs)):

    for timestep, token in enumerate(input_seq):
        encoder_inputs[i, timestep, input_features_dict[token]] = 1.

    for timestep, token in enumerate(output_seq):
        decoder_inputs[i, timestep, output_features_dict[token]] = 1.

        # The target is the decoder input shifted one timestep earlier, so the
        # first output token never appears as a target.
        # NOTE(review): no start/stop markers are added to the output strings
        # here; confirm that is intended.
        if timestep > 0:
            decoder_outputs[i, timestep-1, output_features_dict[token]] = 1.

In [9]:
# Model parameters
n_dims = 64       # number of units in each LSTM layer
batch_size = 32
n_epochs = 5

# Model config
# Encoder: reads a variable-length sequence of one-hot input tokens; only the
# two state tensors returned by the LSTM are kept for the decoder.
model_encoder_inputs = Input(shape=(None, encoder_token_dim))
model_encoder_lstm = LSTM(units=n_dims, return_state=True)
model_encoder_outputs, hidden_state, state_cell = model_encoder_lstm(
    model_encoder_inputs
)
model_encoder_states = [hidden_state, state_cell]

# Decoder: an LSTM initialised with the encoder states that emits an output at
# every timestep, followed by a softmax layer over the output vocabulary.
model_decoder_inputs = Input(shape=(None, decoder_token_dim))
model_decoder_lstm = LSTM(units=n_dims, return_sequences=True, return_state=True)
model_decoder_outputs, decoder_hidden_state, decoder_state_cell = model_decoder_lstm(
    model_decoder_inputs,
    initial_state=model_encoder_states,
)
model_decoder_dense = Dense(units=decoder_token_dim, activation='softmax')
model_decoder_outputs = model_decoder_dense(model_decoder_outputs)

# Model compilation: two inputs (encoder and decoder sequences), one output
model = Model(
    inputs=[model_encoder_inputs, model_decoder_inputs],
    outputs=model_decoder_outputs,
)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [10]:
# Fit the model: encoder and decoder one-hot arrays are the inputs, the shifted
# decoder array is the target; validation_split=0.2 reserves a fifth of the
# samples for validation.
history = model.fit(
    x=[encoder_inputs, decoder_inputs],
    y=decoder_outputs,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
