<a href="https://colab.research.google.com/github/coryskeers/dl_phones/blob/master/g2p.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the CMU ARPABET dictionary files if they aren't already available in
# the working directory. The existence checks keep a re-run of this cell from
# fetching a file that is already present.
import os.path

# Symbol list: opened further down to build the phoneme <-> index lookups.
if not os.path.exists('cmudict-0.7b.symbols'):
  !wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols
# The .phones file is not read by any of the visible cells, so its download is
# left disabled:
#!wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
# Main dictionary: opened further down to build the word <-> pronunciation lookups.
if not os.path.exists('cmudict-0.7b'):
  !wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b

# Phoneme descriptions: the list of feature labels attached to each ARPABET phoneme.
# Based on 'Poetic Sound Similarity Vectors Using Phonetic Features' which uses
# X-SAMPA phoneme descriptions
# Allison Parrish; 2017
# https://aaai.org/ocs/index.php/AIIDE/AIIDE17/paper/download/15879/15227
#
# Each row of the table reads "<phoneme> <feature> <feature> ...". It is parsed
# into {phoneme: [feature, ...]}, keeping both the row order and the feature order.
_PHONE_FEATURE_ROWS = '''
AA bck low unr vwl
AE fnt low unr vwl
AH cnt mid unr vwl
AO bck lmd rnd vwl
AW bck cnt low rnd smh unr vwl
AY cnt fnt low smh unr vwl
B blb stp vcd
CH alv frc stp vls
D alv stp vcd
DH dnt frc vcd
EH fnt lmd unr vwl
ER cnt rzd umd vwl
EY fnt lmd smh unr vwl
F frc lbd vls
G stp vcd vel
HH apr glt
IH fnt smh unr vwl
IY fnt hgh unr vwl
JH alv frc stp vcd
K stp vel vls
L alv lat
M blb nas
N alv nas
NG nas vel
OW bck rnd smh umd vwl
OY bck fnt lmd rnd smh unr vwl
P blb stp vls
R alv apr
S alv frc vls
SH frc pla vls
T alv stp vls
TH dnt frc vls
UH bck rnd smh vwl
UW bck hgh rnd vwl
V frc lbd vcd
W apr lbv
Y apr pal
Z alv frc vcd
ZH frc pla vcd
'''

phone_defs = {
    row.split()[0]: row.split()[1:]
    for row in _PHONE_FEATURE_ROWS.strip().splitlines()
}

# Index every symbol in the CMU symbols file (one stripped line per symbol):
# phone_index maps symbol -> line position, index_phone is the reverse lookup.
with open('cmudict-0.7b.symbols') as symfile:
  symbols = [row.strip() for row in symfile]

phone_index = {}
index_phone = {}
for position, symbol in enumerate(symbols):
  phone_index[symbol] = position
  index_phone[position] = symbol

# Get our word-phoneme dict. First 69 lines are documentation and/or symbol
# pronunciations, so they are skipped before any entry is parsed.
CMUDICT_HEADER_LINES = 69

# word_pron: word -> list of phoneme tokens wrapped in '<beg>' / '<end>'.
# pron_word: raw pronunciation string -> word. When several words share a
# pronunciation, the one appearing last in the file wins.
word_pron = {}
pron_word = {}
with open('cmudict-0.7b', encoding = 'latin-1') as pronfile:
  for _ in range(CMUDICT_HEADER_LINES):
    # The default keeps a truncated file from raising a bare StopIteration here.
    next(pronfile, None)
  for line in pronfile:
    line = line.strip()
    if not line:
      # A blank line (e.g. at the end of the file) holds no entry; parsing it
      # would otherwise fail when looking up the pronunciation half.
      continue
    # Entries are "<WORD>  <PHONEMES>": two spaces separate word and pronunciation.
    # A non-blank line without that separator still fails loudly (ValueError).
    word, phones = line.split('  ', 1)
    # Add a 'break' token at the beginning and end of the word
    word_pron[word] = ['<beg>'] + phones.split() + ['<end>']
    pron_word[phones] = word

--2020-02-24 15:30:15--  http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols
Resolving svn.code.sf.net (svn.code.sf.net)... 216.105.38.17
Connecting to svn.code.sf.net (svn.code.sf.net)|216.105.38.17|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 281 [text/plain]
Saving to: ‘cmudict-0.7b.symbols’


2020-02-24 15:30:15 (48.3 MB/s) - ‘cmudict-0.7b.symbols’ saved [281/281]

--2020-02-24 15:30:16--  http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b
Resolving svn.code.sf.net (svn.code.sf.net)... 216.105.38.17
Connecting to svn.code.sf.net (svn.code.sf.net)|216.105.38.17|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3716714 (3.5M) [text/plain]
Saving to: ‘cmudict-0.7b’


2020-02-24 15:30:17 (4.99 MB/s) - ‘cmudict-0.7b’ saved [3716714/3716714]



In [0]:
# Vectorize our individual phonemes: each phoneme becomes a binary vector with
# one slot per distinct feature label plus one trailing 'pause' slot that only
# the break tokens set.
# NOTE: phone_defs is overwritten in place (feature names -> 0/1 vector), so the
# cell that defines phone_defs has to be re-run before this cell is run again.
attrs = sorted({attr for values in phone_defs.values() for attr in values})

# phone_vec maps feature label -> its slot in the vector (alphabetical order).
phone_vec = {attr: idx for idx, attr in enumerate(attrs)}

# Derive the vector layout from the feature count instead of hard-coding it, so
# adding or removing a feature label cannot desynchronise the sizes.
pause_index = len(attrs)
vector_length = pause_index + 1

# Only values are reassigned (no keys added), which is safe while iterating.
# A separate loop name keeps the sorted feature list in `attrs` intact.
for phone, phone_attrs in phone_defs.items():
  vec = [0] * vector_length
  for attr in phone_attrs:
    vec[phone_vec[attr]] = 1
  phone_defs[phone] = vec

# Include our break tokens, with a separate 'pause' feature
phone_defs['<beg>'] = [0] * pause_index + [1]
phone_defs['<end>'] = [0] * pause_index + [1]

In [0]:
def hammingDiff(vec1, vec2):
  ''' Hamming distance between two binary vectors:
  the count of positions whose entries differ.
  Works for single-phoneme vectors as well as for
  concatenated bigram/trigram vectors (which simply
  live in a larger vectorspace).
  Positions are paired with zip, so any surplus tail
  of a longer vector is not compared.'''
  mismatches = 0
  for left, right in zip(vec1, vec2):
    if left != right:
      mismatches += 1
  return mismatches

def closestPhone(vec):
  ''' Find the phoneme in phone_defs whose attribute vector is nearest to vec.
  Returns a (phoneme, Hamming distance) tuple; when several phonemes tie,
  the one encountered first in phone_defs is returned.'''
  scored = (
      (phone, hammingDiff(vec, attr))
      for phone, attr in phone_defs.items()
  )
  return min(scored, key = lambda pair: pair[1])

In [0]:
# Count the distinct phoneme bigrams and trigrams that occur in the cmu-dict
# pronunciations, twice over: once keeping the syllabic-emphasis digit on each
# phoneme (so emphasis variants count as different phonemes) and once with it removed.
bigrams = set()
trigrams = set()
bigrams_without_syllabic = set()
trigrams_without_syllabic = set()
syl_emph = '0123'
for pron in word_pron.values():
  # Same pronunciation with a trailing emphasis digit dropped from each token
  bare = [tok[:-1] if tok[-1] in syl_emph else tok for tok in pron]
  # zip over shifted copies yields every run of adjacent tokens as a tuple
  bigrams.update(zip(pron, pron[1:]))
  bigrams_without_syllabic.update(zip(bare, bare[1:]))
  trigrams.update(zip(pron, pron[1:], pron[2:]))
  trigrams_without_syllabic.update(zip(bare, bare[1:], bare[2:]))

for ngram_set in (bigrams, bigrams_without_syllabic,
                  trigrams, trigrams_without_syllabic):
  print(len(ngram_set))

3024
1346
38588
19559


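When syllabic emphasis is treated as part of the phoneme (so each emphasis variant counts as a separate phoneme), cmu-dict contains 3024 distinct phoneme bigrams and 38588 distinct phoneme trigrams.
These counts drop to 1346 and 19559 when syllabic emphasis is removed. All four counts include n-grams that contain the `<beg>`/`<end>` break tokens.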

The mean Hamming distance from an individual phoneme to every entry in `phone_defs` (itself and the break tokens included) varies from ~4.3 to ~7.8, with an overall average of ~5.5.

In [0]:
from statistics import mean
# For each phoneme vector, average its Hamming distance to every vector in
# phone_defs (itself included), then report the smallest, the largest and the
# mean of those per-phoneme averages.
mean_distances = [
    mean([hammingDiff(vec, other) for other in phone_defs.values()])
    for vec in phone_defs.values()
]
for summary in (min, max, mean):
  print(summary(mean_distances))

4.2682926829268295
7.780487804878049
5.539559785841761


In [0]:
%tensorflow_version 1.x
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Encoder vocabulary: the 26 uppercase letters.
input_chars = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
num_encoder_tokens = len(input_chars)

#### This should be updated to include syllabic emphasis
# Decoder vocabulary: every key of phone_defs (no emphasis digits).
output_chars = sorted(list(phone_defs.keys()))
num_decoder_tokens = len(output_chars)

max_input_seq_length = max([len(key) for key in word_pron.keys()])
max_output_seq_length = max([len(value) for value in word_pron.values()])

input_data_length = len(word_pron)

# One-hot tensors, all indexed (sample, timestep, token):
#   encoder_input_data  - the letters of each word
#   decoder_input_data  - the phoneme sequence of each word
#   decoder_output_data - the same phoneme sequence shifted one step earlier,
#                         i.e. the token the decoder should predict next
encoder_input_data = np.zeros(
    (input_data_length, max_input_seq_length, num_encoder_tokens),
    dtype = 'float32'
)

decoder_input_data = np.zeros(
    (input_data_length, max_output_seq_length, num_decoder_tokens),
    dtype = 'float32'
)

decoder_output_data = np.zeros(
    (input_data_length, max_output_seq_length, num_decoder_tokens),
    dtype = 'float32'
)

# Token -> one-hot position lookups
input_token_index = {char: idx for idx, char in enumerate(input_chars)}
output_token_index = {phone: idx for idx, phone in enumerate(output_chars)}

# Trailing digits removed from phoneme tokens before lookup, because the decoder
# vocabulary above carries no syllabic emphasis.
emphasis_digits = '0123'

for i, (word, pron) in enumerate(word_pron.items()):
  # Keep only characters in the encoder vocabulary; anything else in the
  # dictionary key is dropped so the letters stay contiguous.
  letters = [char for char in word if char in input_token_index]
  for j, char in enumerate(letters):
    encoder_input_data[i, j, input_token_index[char]] = 1.0
  for j, phone in enumerate(pron):
    if phone[-1] in emphasis_digits:
      phone = phone[:-1]
    # An unknown phoneme raises KeyError here rather than being silently skipped.
    phone_idx = output_token_index[phone]
    decoder_input_data[i, j, phone_idx] = 1.0
    if j > 0:
      # The target is one timestep ahead of the decoder input and therefore
      # never contains the first token of the sequence.
      decoder_output_data[i, j - 1, phone_idx] = 1.0

In [0]:
# Training constants:
#   batch_size - handed to model.fit as the batch size
#   epochs     - handed to model.fit as the number of epochs
#   latent_dim - number of units in the encoder and decoder GRU layers
batch_size, epochs, latent_dim = 64, 100, 256

In [0]:
# Encoder: a GRU over the one-hot input sequence. With return_state = True the
# layer returns its output together with its final state; only the state is
# used below.
encoder_inputs = keras.layers.Input(shape = (None, num_encoder_tokens))
encoder = keras.layers.GRU(latent_dim, return_state = True)
encoder_outputs, state_h = encoder(encoder_inputs)

# Decoder: a GRU started from the encoder's final state that returns its full
# output sequence (return_sequences = True), followed by a softmax layer over
# the decoder tokens.
decoder_inputs = keras.layers.Input(shape = (None, num_decoder_tokens))
decoder_gru = keras.layers.GRU(latent_dim, return_sequences = True)
decoder_outputs = decoder_gru(decoder_inputs, initial_state = state_h)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# The model takes [encoder input, decoder input] and produces the decoder's
# per-timestep token distribution.
model = keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [0]:
# Train on the encoder/decoder input tensors against the decoder target tensor,
# holding out the fraction given by validation_split for validation.
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')
# The target tensor is named decoder_output_data where the training data is allocated.
model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
          batch_size = batch_size,
          epochs = epochs,
          validation_split = 0.2)