In [1]:
import toolz as tz
execfile("./utils.py")

Some resources for the project

 - http://www.speech.cs.cmu.edu/cgi-bin/cmudict
 - http://www.nltk.org/book/ch11.html
 - https://en.wikipedia.org/wiki/Mnemonic_major_system
 - https://github.com/rhdunn/cmudict-tools

In [2]:
def parse_mnemonic_line(line):
    phoneme, value = line.split(" ")
    return (phoneme, value)

mnemonic_map = tz.thread_last(
    "./mnemonics.txt",
    read_lines,
    (map, parse_mnemonic_line),
    dict
)

mnemonic_phonemes = mnemonic_map.keys()
illegal_phonemes = [phoneme for phoneme, value in mnemonic_map.items() if value == "X"]
legal_phonemes = [phoneme for phoneme, value in mnemonic_map.items() if value != "X"]

In [3]:
def parse_cmu_line(line):
    chunks = line.split()
    word = strip_suffix(chunks[0], ["(1)", "(2)", "(3)", "(4)"])
    drop_numbers = lambda string: "".join([char for char in string if char not in "0123456789"])
    is_consonant = lambda phoneme: phoneme in mnemonic_phonemes
    
    all_phonemes = map(drop_numbers, chunks[1:]) # emphasis annotations aren't needed here
    phonemes = filter(is_consonant, all_phonemes) # vowels aren't part of the mnemonic
    numbers = [mnemonic_map[phoneme] for phoneme in phonemes]
    
    if (nvenn(phonemes, illegal_phonemes)[1] > 0) or (nvenn(phonemes, legal_phonemes)[1] < 1):
        return [] # ignore words with illegal phonemes, or which contain no allowed phonemes
    else: 
        return [{"word": word, "phonemes": phonemes, "all_phonemes": all_phonemes, "numbers": numbers}]
    
cmu = tz.thread_last(
    './cmudict.txt',
    read_file,
    split_lines,
    (filter, lambda line: not line.startswith(";;;")),
    (tz.mapcat, parse_cmu_line),
    list
)

In [4]:
def search_words(f): 
    return [entry for entry in cmu if f(entry['word'])]

def search_numbers(f): 
    return [entry['word'] for entry in cmu if f(entry['numbers'])]

def search_phonemes(f): 
    return [entry for entry in cmu if f(entry['phonemes'])]

In [5]:
def partition(n, seq):
    if len(seq) <= n:
        return [seq]
    else: 
        return [seq[:n]] + partition(n, seq[n:])
    
def translate(numbers):
    return tz.thread_last(
        numbers, 
        str,
        list,
        (partition, 3),
        (map, lambda query_numbers: search_numbers(lambda numbers: numbers == query_numbers)),
        list
    )

# Test it out!
Try converting some numbers into words

In [6]:
translate(123)

[['AUTONOMY',
  'AUTONOMY',
  'DENHAM',
  'DENIM',
  'DENOMME',
  'DONHAM',
  'DOWNHAM',
  'DOWNUM',
  'DUNHAM',
  'DUNNAM',
  'DYNAMO',
  'IDEONOMY']]

In [7]:
translate(54323432)

[['ALARM', 'LARAMEE', 'LARAMIE', 'LERMA', 'LERUM'],
 ['ANAMARIA',
  'ANYMORE',
  'ENAMOR',
  'HANMER',
  'INAMURA',
  'NAMARA',
  'NAMER',
  'NEIMEYER',
  'NEMER',
  'NEMIR',
  'NEUMAIER',
  'NEUMAYER',
  'NEUMEIER',
  'NEUMEYER',
  'NEWMEYER',
  'NEWMYER',
  'NIEMEIER',
  'NIEMEYER',
  'NIMMER',
  'NOMURA',
  'NUMEIRI',
  'WEINHEIMER'],
 ['AHMANN',
  'AIMAN',
  'AIMONE',
  'AMAN',
  'AMANA',
  'AMANN',
  'AMANO',
  'AMEEN',
  'AMEN',
  'AMEN',
  'AMIN',
  'AMINO',
  'AMMAN',
  'AMMAN',
  'AMMANN',
  'AMMEEN',
  'AMMON',
  'AMMONIA',
  'AMNIO',
  'AMON',
  'AUMAN',
  'AUMANN',
  'EAMON',
  'EHMAN',
  'EHMANN',
  'EHMEN',
  'EMINA',
  'EYMAN',
  'HAMAN',
  'HAMANN',
  'HAMMAN',
  'HAMMANN',
  'HAMMEN',
  'HAMMON',
  'HAMON',
  'HAYMAN',
  'HAYMON',
  'HEHMAN',
  'HEIMAN',
  'HEIMANN',
  'HEMAN',
  'HEMANI',
  'HEMANN',
  'HEMENWAY',
  'HEMMEN',
  'HEUMAN',
  'HEUMANN',
  'HEYMAN',
  'HEYMANN',
  'HIGHMAN',
  'HOHMAN',
  'HOHMANN',
  'HOMAN',
  'HOMANN',
  'HOMEN',
  'HOMINY',
  'HOW-MAN

In [7]:
[x[0] for x in translate(54323432)]

['ALARM', 'ANAMARIA', 'AHMANN']

# Next steps


1. Find good metric for results. Should be easy to remember and visualize.
 - maybe use part of speech
 - manually select your favorite matches
 - best is probably to assess how common the word is (how? using other nltk corpus?)
 
 
If I only allow for 3-digit matches, then that 10^3 combinations to think about. So, there are around 1000 cases to handle. For each of these, it might be good to have several words to choose from so things are more interesting. Some manual work is conceivable.

In [8]:
# Try to get parts of speech
from nltk.corpus import wordnet as wn
wn.synsets('HEAD')[:5]
[x.pos() for x in wn.synsets('HEAD')[:5]]

[u'n', u'n', u'n', u'n', u'n']

## Word frequencies
This could help pick the best match from a set of options. 
 - http://subtlexus.lexique.org/
 - http://www.natcorp.ox.ac.uk/

In [8]:
from nltk.corpus import brown
import pandas as pd

In [16]:
x = brown.words()
len(x)

1161192

In [36]:
n_words = float(len(x))

In [37]:
word_counts = { word.lower():count/n_words for word, count in tz.frequencies(x).iteritems()}

# Assign scores to potential matches

In [34]:
result = translate(12341)
# result

In [32]:
sorted([{'a': 10}, {'a': 2}], key = lambda x: x['a'])

[{'a': 2}, {'a': 10}]

In [48]:
prepositions = ["with" ,"at" ,"from" ,"into" ,"during" ,"including" ,"until" ,"against" ,"among" ,"throughout" ,"despite" ,"towards" ,"upon" ,"concerning" ,"of" ,"to" ,"in" ,"for" ,"on" ,"by" ,"about" ,"like" ,"through" ,"over" ,"before" ,"between" ,"after" ,"since" ,"without" ,"under" ,"within" ,"along" ,"following" ,"across" ,"behind" ,"beyond" ,"plus" ,"except" ,"but" ,"up" ,"out" ,"around" ,"down" ,"off" ,"above" ,"near"]
def score_word(match):
    """ String -> Float
        
        Return word frequency in corpus as score. 
    """
    word = word.lower()
    
    # prepositions aren't memorable
    if word in prepositions:
        return 0
    
    if word in word_counts.keys():
        return word_counts[word]
    else: 
        return 0

def get_best_match(matches):
    scored_matches = [{"word": match, "score": score_match(match)} for match in matches]
    return sorted(scored_matches, key = lambda match: match['score'])[-1]

In [49]:
test = result[0]
test

['AUTONOMY',
 'AUTONOMY',
 'DENHAM',
 'DENIM',
 'DENOMME',
 'DONHAM',
 'DOWNHAM',
 'DOWNUM',
 'DUNHAM',
 'DUNNAM',
 'DYNAMO',
 'IDEONOMY']

In [53]:
map(get_best_match, translate(1))

[{'score': 0.022159987323371155, 'word': 'TO'}]