<a href="https://colab.research.google.com/github/coryskeers/dl_phones/blob/master/g2p.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download updated CMU ARPABET dictionaries and definitions if they aren't available:
import os.path

# Each file is fetched only when it is not already present in the current
# working directory, so re-running this cell does not download it again.
# The leading '!' is an IPython/Jupyter shell escape, so this cell only runs
# inside a notebook runtime that has `wget` available.
if not os.path.exists('cmudict-0.7b.symbols'):
  !wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols
# The .phones file is not downloaded: its fetch is left disabled below.
#!wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
# Same existence check for the pronunciation dictionary itself.
if not os.path.exists('cmudict-0.7b'):
  !wget http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b

# Build our phoneme descriptions
# Based on 'Poetic Sound Similarity Vectors Using Phonetic Features' which uses X-SAMPA phoneme descriptions
# Allison Parrish; 2017
# https://aaai.org/ocs/index.php/AIIDE/AIIDE17/paper/download/15879/15227
#
# phone_defs maps each of the 39 stress-free phoneme symbols to a list of
# three-letter feature tags (29 distinct tags in total). Each list is written
# in alphabetical order.
# NOTE: the vectorization step further down in this notebook overwrites every
# value in this dict with a fixed-length 0/1 list, so the tag lists below are
# only available until that cell has run.
# NOTE(review): the tags look like abbreviations of articulatory features
# (e.g. 'vwl' = vowel, 'vcd'/'vls' = voiced/voiceless, 'stp' = stop) -- confirm
# the exact meaning of each tag against the paper cited above.
# NOTE(review): entries with many tags (e.g. 'AW', 'AY', 'OY') presumably merge
# the features of both halves of a diphthong -- confirm.
phone_defs = {
    'AA' : ['bck', 'low', 'unr', 'vwl'],
    'AE' : ['fnt', 'low', 'unr', 'vwl'],
    'AH' : ['cnt', 'mid', 'unr', 'vwl'],
    'AO' : ['bck', 'lmd', 'rnd', 'vwl'],
    'AW' : ['bck', 'cnt', 'low', 'rnd', 'smh', 'unr', 'vwl'],
    'AY' : ['cnt', 'fnt', 'low', 'smh', 'unr', 'vwl'],
    'B' : ['blb', 'stp', 'vcd'],
    'CH' : ['alv', 'frc', 'stp', 'vls'],
    'D' : ['alv', 'stp', 'vcd'],
    'DH' : ['dnt', 'frc', 'vcd'],
    'EH' : ['fnt', 'lmd', 'unr', 'vwl'],
    'ER' : ['cnt', 'rzd', 'umd', 'vwl'],
    'EY' : ['fnt', 'lmd', 'smh', 'unr', 'vwl'],
    'F' : ['frc', 'lbd', 'vls'],
    'G' : ['stp', 'vcd', 'vel'],
    'HH' : ['apr', 'glt'],
    'IH' : ['fnt', 'smh', 'unr', 'vwl'],
    'IY' : ['fnt', 'hgh', 'unr', 'vwl'],
    'JH' : ['alv', 'frc', 'stp', 'vcd'],
    'K' : ['stp', 'vel', 'vls'],
    'L' : ['alv', 'lat'],
    'M' : ['blb', 'nas'],
    'N' : ['alv', 'nas'],
    'NG' : ['nas', 'vel'],
    'OW' : ['bck', 'rnd', 'smh', 'umd', 'vwl'],
    'OY' : ['bck', 'fnt', 'lmd', 'rnd', 'smh', 'unr', 'vwl'],
    'P' : ['blb', 'stp', 'vls'],
    'R' : ['alv', 'apr'],
    'S' : ['alv', 'frc', 'vls'],
    'SH' : ['frc', 'pla', 'vls'],
    'T' : ['alv', 'stp', 'vls'],
    'TH' : ['dnt', 'frc', 'vls'],
    'UH' : ['bck', 'rnd', 'smh', 'vwl'],
    'UW' : ['bck', 'hgh', 'rnd', 'vwl'],
    'V' : ['frc', 'lbd', 'vcd'],
    'W' : ['apr', 'lbv'],
    'Y' : ['apr', 'pal'],
    'Z' : ['alv', 'frc', 'vcd'],
    'ZH' : ['frc', 'pla', 'vcd']
}

# Get all symbol combos and create an indexer.
# The symbols file holds one symbol per line. phone_index maps each symbol to
# a consecutive integer id (starting at 0, in file order) and index_phone is
# the reverse lookup from id back to symbol.
with open('cmudict-0.7b.symbols') as symfile:
  # Drop blank lines (e.g. a stray empty line at the end of the file) so the
  # empty string is never registered as a symbol and ids stay contiguous.
  symbols = [line.strip() for line in symfile if line.strip()]

index_phone = dict(enumerate(symbols))
# Built from index_phone in id order, so if a symbol were ever listed twice
# the later id wins.
phone_index = {symbol: idx for idx, symbol in index_phone.items()}

# Get our word-phoneme dict. First 69 lines are documentation and/or symbol pronunciations.
# word_pron maps a dictionary headword to its phoneme list, wrapped in '<b>'
# break tokens; pron_word maps the raw space-separated pronunciation string
# back to a headword (when several words share a pronunciation, the one that
# appears last in the file is kept).
header_lines = 69
word_pron = {}
pron_word = {}
with open('cmudict-0.7b', encoding='latin-1') as pronfile:
  for line_no, line in enumerate(pronfile):
    # Skip the header by line number rather than with next(), so a file
    # shorter than the header does not raise StopIteration.
    if line_no < header_lines:
      continue
    # Entries are '<word>  <phonemes>' separated by two spaces.
    word, sep, pron = line.strip().partition('  ')
    if not sep:
      # Blank or malformed line without the two-space separator: ignore it
      # instead of failing with an IndexError.
      continue
    # Add a 'break' token at the beginning and end of the word
    word_pron[word] = ['<b>'] + pron.split() + ['<b>']
    pron_word[pron] = word

In [0]:
# Vectorize our individual phonemes
# attrs is the alphabetically sorted list of every distinct feature tag used
# in phone_defs; phone_vec maps each tag to its position in that list.
attrs = sorted({attr for values in phone_defs.values() for attr in values})
phone_vec = {attr: idx for idx, attr in enumerate(attrs)}

# Replace each phoneme's tag list with a 0/1 vector that has one slot per tag.
# The vector length comes from attrs instead of a hard-coded count, and the
# loop variable is named phone_attrs so the sorted attrs list is not
# overwritten and stays usable after this cell.
# NOTE: this overwrites phone_defs in place, so the cell must not be re-run
# without first re-creating phone_defs from its original tag lists.
for phone, phone_attrs in phone_defs.items():
  vec = [0] * len(attrs)
  for attr in phone_attrs:
    vec[phone_vec[attr]] = 1
  # Re-assigning an existing key does not change the dict's size, so it is
  # safe while iterating over items().
  phone_defs[phone] = vec