http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b

In [17]:
from sklearn.model_selection import train_test_split
import re
import numpy as np

In [5]:
lines = [l.strip().split("  ") for l in open("cmudict-0.7b", encoding='latin1') 
         if re.match('^[A-Z]', l)]
lines = [(w, ps.split()) for w, ps in lines]
lines[0], lines[-1]

(('A', ['AH0']), ('ZYWICKI', ['Z', 'IH0', 'W', 'IH1', 'K', 'IY0']))

In [6]:
phonemes = ["_"] + sorted(set(p for w, ps in lines for p in ps))
phonemes[:5]

['_', 'AA0', 'AA1', 'AA2', 'AE0']

In [7]:
len(phonemes)

70

In [9]:
p2i = dict((v, k) for k,v in enumerate(phonemes))
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = dict((v, k) for k,v in enumerate(letters))

In [10]:
maxlen=15
pronounce_dict = {w.lower(): [p2i[p] for p in ps] for w, ps in lines
                 if (5<=len(w)<=maxlen) and re.match("^[A-Z]+$", w)}
len(pronounce_dict)

108006

In [11]:
a=['xyz','abc']
[o.upper() for o in a if o[0]=='x'], [[p for p in o] for o in a], [p for o in a for p in o]

(['XYZ'], [['x', 'y', 'z'], ['a', 'b', 'c']], ['x', 'y', 'z', 'a', 'b', 'c'])

In [15]:
maxlen_p = max([len(v) for k,v in pronounce_dict.items()])

In [18]:
pairs = np.random.permutation(list(pronounce_dict.keys()))
n = len(pairs)
input_ = np.zeros((n, maxlen_p), np.int32)
labels_ = np.zeros((n, maxlen), np.int32)

for i, k in enumerate(pairs):
    for j, p in enumerate(pronounce_dict[k]): input_[i][j] = p
    for j, letter in enumerate(k): labels_[i][j] = l2i[letter]

In [19]:
go_token = l2i["*"]
dec_input_ = np.concatenate([np.ones((n,1)) * go_token, labels_[:,:-1]], axis=1)

In [20]:
(input_train, input_test, labels_train, labels_test, dec_input_train, dec_input_test
    ) = train_test_split(input_, labels_, dec_input_, test_size=0.1)

In [25]:
dec_input_

array([[ 27.,   2.,  18., ...,   0.,   0.,   0.],
       [ 27.,  16.,  18., ...,   0.,   0.,   0.],
       [ 27.,   5.,  12., ...,   5.,  20.,  19.],
       ..., 
       [ 27.,  23.,  15., ...,   0.,   0.,   0.],
       [ 27.,  22.,   5., ...,   0.,   0.,   0.],
       [ 27.,  14.,  21., ...,   0.,   0.,   0.]])

In [31]:
input_vocab_size, output_vocab_size = len(phonemes), len(letters)
input_vocab_size, output_vocab_size

(70, 28)

## phonemes to letters (without attention)

In [35]:
from keras.layers.recurrent import LSTM
from keras.layers import Input, Embedding, Dense, Bidirectional, RepeatVector, TimeDistributed
from keras.models import Model

In [26]:
dim = 240

In [33]:
def get_rnn(return_sequences= True): 
    return LSTM(dim, recurrent_dropout= 0.1, dropout= 0.1, 
                implementation= 2, return_sequences=return_sequences)

In [34]:
inp = Input((maxlen_p,))
x = Embedding(input_vocab_size, 120)(inp)

x = Bidirectional(get_rnn())(x)
x = get_rnn(False)(x)

x = RepeatVector(maxlen)(x)
x = get_rnn()(x)
x = get_rnn()(x)
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(x)

In [36]:
model = Model(inp, x)

In [38]:
model.compile('Adam', 'sparse_categorical_crossentropy', metrics=['acc'])

In [45]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 16)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 16, 120)           8400      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 16, 480)           693120    
_________________________________________________________________
lstm_6 (LSTM)                (None, 240)               692160    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 15, 240)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 15, 240)           461760    
_________________________________________________________________
lstm_8 (LSTM)                (None, 15, 240)           461760    
__________

In [47]:
hist=model.fit(input_train, np.expand_dims(labels_train,-1), 
          validation_data=[input_test, np.expand_dims(labels_test,-1)], 
          batch_size=64, verbose=1, epochs=3)

Train on 97205 samples, validate on 10801 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [40]:
hist.history['val_loss']

[1.2796057250011674, 0.84694888305690552, 0.58242830510228205]

In [41]:
def eval_keras(input):
    preds = model.predict(input, batch_size=128)
    predict = np.argmax(preds, axis = 2)
    return (np.mean([all(real==p) for real, p in zip(labels_test, predict)]), predict)

In [49]:
acc, preds = eval_keras(input_test); acc

0.332098879733358

In [43]:
def print_examples(preds):
    print("pronunciation".ljust(40), "real spelling".ljust(17), 
          "model spelling".ljust(17), "is correct")

    for index in range(20):
        ps = "-".join([phonemes[p] for p in input_test[index]]) 
        real = [letters[l] for l in labels_test[index]] 
        predict = [letters[l] for l in preds[index]]
        print (ps.split("-_")[0].ljust(40), "".join(real).split("_")[0].ljust(17),
            "".join(predict).split("_")[0].ljust(17), str(real == predict))

In [50]:
print_examples(preds)

pronunciation                            real spelling     model spelling    is correct
P-ER1-P-AH0-S                            purpose           perpuss           False
M-AH0-K-EH1-L-AH0-N                      mckellan          mckellan          True
IH2-N-F-AO1-R-M                          inform            inform            True
AH1-N-D-ER0-G-R-OW2-TH                   undergrowth       undergrothh       False
R-EY1-N-B-OW2                            rainbow           rainbo            False
W-IH1-P-ER0-W-IH2-L-Z                    whippoorwills     whpperwills       False
HH-EH1-N-T-S                             hentz             hentz             True
R-AA1-K-AH0-T-IH0-NG                     rocketing         rocketing         True
K-OW2-L-AH1-M-B-IY0-AH0-N-Z              columbians        columbians        True
W-IH1-N-B-EH2-R-IY0                      winberry          winbery           False
L-AE1-DH-ER0                             lather            lather            True
P-AE1

## with attention model
with encoding/decoding technique, larger input sequences result in less accuracy

this can be mitigated using an attentional model

In [51]:
from keras import backend as K
from keras.initializers import Zeros
from keras.engine import InputSpec
from keras.models import Sequential
from keras.layers import Layer

In [52]:
class Attention(Layer):
    def __init__(self, fn_rnn, nlayers=1, **kwargs):
        self.supports_masking = True
        self.fn_rnn = fn_rnn
        self.nlayers = nlayers
        self.input_spec = [InputSpec(ndim=3), InputSpec(ndim=3)]
        super(Attention, self).__init__(**kwargs)


    def all_attrs(self, name):
        return sum([getattr(layer, name, []) for layer in self.layers], [])


    def w(self, dims, init, name):
        return self.add_weight(dims, init, name.format(self.name))


    def build(self, input_shape):
        self.enc_shape, self.dec_shape = input_shape
        assert len(self.enc_shape) >= 3
        self.layers = [self.fn_rnn() for i in range(self.nlayers)]
        nb_samples, nb_time, nb_dims = self.dec_shape
        l0 = self.layers[0]

        out_shape = self.get_output_shape_for(input_shape)
        for layer in self.layers:
            if not layer.built: layer.build(out_shape)

        init = l0.init
        out_dim = l0.output_dim
        self.W1 = self.w((self.enc_shape[-1], nb_dims), init, '{}_W1')
        self.W2 = self.w((out_dim, nb_dims), init, '{}_W2')
        self.b2 = self.w((nb_dims,), Zeros, '{}_b2')
        self.V =  self.w((nb_dims,), init, '{}_V')
        self.W3 = self.w((nb_dims+out_dim, out_dim), init, '{}_W3')
        self.b3 = self.w((out_dim,), Zeros, '{}_b3')

        self.trainable_weights += self.all_attrs( 'trainable_weights')
        self.non_trainable_weights += self.all_attrs( 'non_trainable_weights')
        self.losses += self.all_attrs( 'losses')
        self.updates = self.all_attrs( 'updates')
        self.constraints = getattr(self.layers[0], 'constraints', {}) # FIXME
        super(Attention, self).build(input_shape)


    def get_output_shape_for(self, input_shape):
        return self.layers[0].get_output_shape_for(input_shape[1])


    def step(self, x, states):
        h = states[0]
        enc_output = states[-1]
        xW1 = states[-2]

        hW2 = K.expand_dims(K.dot(h,self.W2)+self.b2, 1)
        u = K.tanh(xW1+hW2)
        a = K.expand_dims(K.softmax(K.sum(self.V*u,2)), -1)
        Xa = K.sum(a*enc_output,1)
        h = K.dot(K.concatenate([x,Xa],1),self.W3)+self.b3

        for layer in self.layers: h, new_states = layer.step(h, states)
        return h, new_states


    def get_constants(self, enc_output, constants):
        constants.append(K.dot(enc_output,self.W1))
        constants.append(enc_output)
        return constants


    def compute_mask(self, input, mask):
        return self.layers[0].compute_mask(input, mask[1])


    def call(self, x, mask=None):
        l0 = self.layers[0]
        enc_output, dec_input = x

        if l0.stateful: initial_states = l0.states
        else: initial_states = l0.get_initial_states(dec_input)
        constants = l0.get_constants(dec_input)
        constants = self.get_constants(enc_output, constants)
        preprocessed_input = l0.preprocess_input(dec_input)

        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
             initial_states, go_backwards=l0.go_backwards, mask=mask[1],
             constants=constants, unroll=l0.unroll, input_length=self.dec_shape[1])
        if l0.stateful:
            self.updates = []
            for i in range(len(states)):
                self.updates.append((l0.states[i], states[i]))

        return outputs if l0.return_sequences else last_output

In [53]:
inp = Input((maxlen_p,))
inp_dec = Input((maxlen,))
emb_dec = Embedding(output_vocab_size, 120)(inp_dec)
emb_dec = Dense(dim)(emb_dec)

x = Embedding(input_vocab_size, 120)(inp)
x = Bidirectional(get_rnn())(x)
x = get_rnn()(x)
x = get_rnn()(x)
x = Attention(get_rnn, 3)([x, emb_dec])
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(x)

ValueError: Dimensions must be equal, but are 15 and 240 for 'dense_3/add' (op: 'Add') with input shapes: [?,15,240], [1,240,1].