Some resources for the project

 - http://www.speech.cs.cmu.edu/cgi-bin/cmudict
 - http://www.nltk.org/book/ch11.html
 - https://en.wikipedia.org/wiki/Mnemonic_major_system
 - https://github.com/rhdunn/cmudict-tools

In [158]:
# Load shared helpers into this namespace (Python 2 `execfile`).
# NOTE(review): presumably this is where `tz`, `read_lines`, `read_file`,
# `split_lines`, `strip_suffix` and `nvenn` used below come from -- confirm.
execfile('/home/carlos/projects/utils/utils.py')

In [159]:
def parse_mnemonic_line(line):
    """Split one line of the mnemonic table into a ``(phoneme, value)`` pair.

    The line must contain exactly two whitespace-separated fields.  Splitting
    on arbitrary whitespace (rather than a single space) means tabs, repeated
    spaces and a trailing newline do not leak into the returned fields.

    Raises:
        ValueError: if the line does not contain exactly two fields.
    """
    fields = line.split()
    if len(fields) != 2:
        raise ValueError("expected '<phoneme> <value>', got %r" % (line,))
    phoneme, value = fields
    return (phoneme, value)

# Phoneme -> mnemonic value table: every line of mnemonics.txt is parsed into
# a (phoneme, value) pair and the pairs are collected into a dict.
mnemonic_map = dict(map(parse_mnemonic_line, read_lines("./mnemonics.txt")))

mnemonic_phonemes = mnemonic_map.keys()

# Phonemes whose table value is "X" are collected as illegal; all others
# are collected as legal.
illegal_phonemes = []
legal_phonemes = []
for phoneme, value in mnemonic_map.items():
    if value == "X":
        illegal_phonemes.append(phoneme)
    else:
        legal_phonemes.append(phoneme)

In [166]:
def parse_cmu_line(line):
    """Parse one CMU dictionary line into a list of zero or one entries.

    The first whitespace-separated token is the word; the remaining tokens are
    its phonemes.  A list is returned (instead of an entry or None) so the
    function can be used with ``mapcat``: rejected lines simply contribute
    nothing.

    Returns:
        ``[]`` for a blank line or a rejected word, otherwise a one-element
        list holding a dict with the keys ``"word"``, ``"phonemes"``
        (phonemes present in the mnemonic table), ``"all_phonemes"`` (every
        phoneme, digits removed) and ``"numbers"`` (the mnemonic value of each
        kept phoneme).
    """
    chunks = line.split()
    if not chunks:
        return []  # blank line: nothing to parse

    # NOTE(review): "(1)".."(4)" look like alternate-pronunciation markers on
    # the word; strip_suffix is defined elsewhere -- confirm its semantics.
    word = strip_suffix(chunks[0], ["(1)", "(2)", "(3)", "(4)"])

    # Emphasis annotations (the digits attached to phonemes) aren't needed here.
    # Real lists are built on purpose: these sequences are traversed several
    # times below, which a one-shot map/filter iterator would not survive.
    all_phonemes = [
        "".join([char for char in chunk if char not in "0123456789"])
        for chunk in chunks[1:]
    ]
    # Vowels aren't part of the mnemonic: keep only phonemes in the table.
    phonemes = [phoneme for phoneme in all_phonemes if phoneme in mnemonic_phonemes]
    numbers = [mnemonic_map[phoneme] for phoneme in phonemes]

    # Ignore words with illegal phonemes, or which contain no allowed phonemes.
    # NOTE(review): assumes nvenn(a, b)[1] is the size of the overlap of a and
    # b -- confirm against the helper's definition.
    if (nvenn(phonemes, illegal_phonemes)[1] > 0) or (nvenn(phonemes, legal_phonemes)[1] < 1):
        return []

    return [{"word": word, "phonemes": phonemes, "all_phonemes": all_phonemes, "numbers": numbers}]
    
# Build the in-memory dictionary: read cmudict.txt, split it into lines, skip
# the lines starting with ";;;", parse each remaining line and concatenate
# the per-line results into one flat list of entries.
cmu = list(
    tz.mapcat(
        parse_cmu_line,
        filter(
            lambda line: not line.startswith(";;;"),
            split_lines(read_file('./cmudict.txt')),
        ),
    )
)

In [167]:
def search_words(f):
    """Return the entries of ``cmu`` whose ``'word'`` satisfies predicate ``f``."""
    matches = []
    for entry in cmu:
        if f(entry['word']):
            matches.append(entry)
    return matches

def search_numbers(f):
    """Return the entries of ``cmu`` whose ``'numbers'`` satisfies predicate ``f``."""
    matches = []
    for entry in cmu:
        if f(entry['numbers']):
            matches.append(entry)
    return matches

def search_phonemes(f):
    """Return the entries of ``cmu`` whose ``'phonemes'`` satisfies predicate ``f``."""
    matches = []
    for entry in cmu:
        if f(entry['phonemes']):
            matches.append(entry)
    return matches

In [168]:
def partition(n, seq):
    """Split ``seq`` into consecutive chunks of at most ``n`` items.

    Every chunk has exactly ``n`` items except possibly the last, which holds
    the remainder.  A sequence no longer than ``n`` (including an empty one)
    is returned unchanged as the single chunk ``[seq]``.

    Implemented iteratively so long sequences neither hit the recursion limit
    nor get re-sliced once per chunk.

    Args:
        n: maximum chunk length.
        seq: any sliceable sequence with a length (str, list, tuple, ...).

    Raises:
        ValueError: if ``n`` is not positive and ``seq`` is too long to be
            returned as a single chunk (no valid chunking exists).
    """
    if len(seq) <= n:
        return [seq]
    if n <= 0:
        raise ValueError("chunk size must be positive, got %r" % (n,))
    return [seq[start:start + n] for start in range(0, len(seq), n)]
    
def translate(numbers, chunk_size=3):
    """Find candidate words for each group of digits in ``numbers``.

    ``numbers`` is converted with ``str`` and split into a list of single
    characters, which is cut into consecutive chunks of at most ``chunk_size``
    characters.  For every chunk, the dictionary is searched for entries whose
    ``'numbers'`` list equals that chunk exactly.

    Args:
        numbers: the value to translate; anything ``str`` can render.
        chunk_size: maximum number of digits per word (defaults to 3).

    Returns:
        A list with one element per chunk, in order; each element is the list
        of matching dictionary entries (possibly empty).
    """
    digits = list(str(numbers))
    return [
        # `wanted` is bound as a default so each predicate keeps its own chunk.
        search_numbers(lambda entry_numbers, wanted=chunk: entry_numbers == wanted)
        for chunk in partition(chunk_size, digits)
    ]
# Example: look up candidate words for the single digit 1.
translate(1)

[[{'all_phonemes': ['AE', 'D'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'AD'},
  {'all_phonemes': ['EY', 'D', 'AH'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADA'},
  {'all_phonemes': ['AE', 'D', 'AA'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADAH'},
  {'all_phonemes': ['AH', 'D', 'EY'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADAY'},
  {'all_phonemes': ['AE', 'D'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADD'},
  {'all_phonemes': ['AA', 'D', 'IY', 'OW'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADDEO'},
  {'all_phonemes': ['AE', 'D', 'IY'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADDIE'},
  {'all_phonemes': ['AE', 'D', 'IY'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADDY'},
  {'all_phonemes': ['EY', 'D'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADE'},
  {'all_phonemes': ['AH', 'D', 'IY'],
   'numbers': ['1'],
   'phonemes': ['D'],
   'word': 'ADEE'},
  {'a

Some phonemes may not be separated by a space, or something similar may be going wrong. "ER0" seems to be present, and I'm treating it like a vowel, but it's not.

In [156]:


# A sequence shorter than the chunk size comes back as a single chunk.
partition(3, "1")

['1']

In [175]:
from nltk.corpus import wordnet as wn

In [182]:
# List the WordNet synsets found for the (uppercase) query 'HEAD'.
wn.synsets('HEAD')

[Synset('head.n.01'),
 Synset('head.n.02'),
 Synset('mind.n.01'),
 Synset('head.n.04'),
 Synset('head.n.05'),
 Synset('head.n.06'),
 Synset('head.n.07'),
 Synset('fountainhead.n.02'),
 Synset('head.n.09'),
 Synset('head.n.10'),
 Synset('head.n.11'),
 Synset('capitulum.n.01'),
 Synset('principal.n.02'),
 Synset('head.n.14'),
 Synset('head.n.15'),
 Synset('promontory.n.01'),
 Synset('head.n.17'),
 Synset('head.n.18'),
 Synset('forefront.n.01'),
 Synset('pass.n.09'),
 Synset('headway.n.02'),
 Synset('point.n.20'),
 Synset('question.n.02'),
 Synset('heading.n.01'),
 Synset('head.n.25'),
 Synset('head.n.26'),
 Synset('read/write_head.n.01'),
 Synset('head.n.28'),
 Synset('head.n.29'),
 Synset('head.n.30'),
 Synset('head.n.31'),
 Synset('drumhead.n.01'),
 Synset('oral_sex.n.01'),
 Synset('head.v.01'),
 Synset('head.v.02'),
 Synset('lead.v.04'),
 Synset('head.v.04'),
 Synset('steer.v.01'),
 Synset('head.v.06'),
 Synset('head.v.07'),
 Synset('head.v.08'),
 Synset('head.v.09')]

In [181]:
# Part-of-speech tag of each synset found for 'HEAD'.
[x.pos() for x in wn.synsets('HEAD')]

[u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'n',
 u'v',
 u'v',
 u'v',
 u'v',
 u'v',
 u'v',
 u'v',
 u'v',
 u'v']

In [179]:
import nltk

In [173]:
# Open the NLTK downloader.
# NOTE(review): presumably run to install the WordNet corpus data used by the
# `wn` cells in this notebook -- confirm.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [174]:
# Scratch cell: trivial arithmetic.
2 + 2

4