In [1]:
from collections import namedtuple
from random import random
from random import shuffle

import numpy as np

In [2]:
Word = namedtuple('Word', ['written', 'phonetic'])


def load_words():
    """Loads the words from the CMU phonetic dictionary and parses them into Word objects
    
    Returns:
        list, of Word objects 
    """
    words = []
    bad_lines = 0
    with open('data/cmudict-0.7b', 'rb') as dict_handle:
        for line in dict_handle:
            try:
                line = line.decode("utf-8") 
            except:
                bad_lines += 1
                continue
                
            if line[0] == ';' or len(line) < 2:
                continue # Header or comment line

            line_chunks = str(line[:-1]).split(' ')
            written = line_chunks[0].lower()
            phonetic = [phone for phone in line_chunks[1:] if len(phone) >= 1]
            words.append(Word(written, phonetic))
            
    print('Number of lines that could not be parsed: ', bad_lines)
    return words

# Grab the words and check they loaded correctly.
words = load_words()
n_words = len(words)
sample_word = words[int(random() * len(words))]
print("""
      Found {n_words} words
      Sample word:
          {sample_word}
      """.format(**locals()))

Number of lines that could not be parsed:  1

      Found 133851 words
      Sample word:
          Word(written='bricker', phonetic=['B', 'R', 'IH1', 'K', 'ER0'])
      


In [3]:
class NameRecommender:
    """Base Class for producing name recommendations based on a list of liked
    words from a user. There are possible ways to upgrade this base class and
    they are marked with TODO.
    
    To use:
        # Initialize
        namer = NameRecommender(words, featurizer)
        
        # Example for pulling L2 recommendations for User 1
        user1_features = namer.construct_user_features(['fox', 'box'], ['hello', 'abracadabra'])
        namer.recommend_on_L2_similarity(user1_features, 10)
        
        # Example for pulling L-Infinity recommendations for User 1 after the above lines
        namer.recommend_on_Linf_similarity(user1_features, 10)
        
        # Example for pulling L2 recommendations for User 2
        user2_features = namer.construct_user_features(['hello', 'abracadabra'], ['fox', 'box'])
        namer.recommend_on_L2_similarity(user2_features, 10)
    """
    
    def __init__(self, words, featurizer):
        """Initialize the Recommender language and features
        
        Args:
            words: (list of Word objects), all words in this recommender's language
            featurizer: (function) takes a Word object and returns a np.array featurizing the word
        """
        self.words = words
        self.featurizer = featurizer
        self._construct_word_features()
        
        # Of the possible words limit to candidates to consider
        self.candidates = [
            word.written  
            for word in words
            if True # TODO remove undesireable words with non-alphabetic characters
        ] 
        
    def _construct_word_features(self):
        """Featurizes the dictionary for quick and easy comparison to later.
        """
        self.features = dict((word.written, self.featurizer(word))
                             for word in self.words)
        # TODO add normalization to self.featurizer
        
    def construct_user_features(self, liked_words, disliked_words):
        """Constructs the User Features based on liked and disliked words
        
        Args:
            liked_words: (list of str) the written words a user likes ie ['fox', 'box']
            disliked_words: (list of str) the written words a user dislikes ie ['hello', 'fellow']
            
        Returns:
            (np.array) average features of the user's words.
        """
        token_liked_features = [
            self.features[word]
            for word in liked_words
            if word in self.features
        ]
        liked_features = sum(token_liked_features) / float(len(token_liked_features))
        
        # TODO substract features based on disliked_words
        
        # TODO normalize the feature vector
        
        return liked_features
        
    def _recommend(self, similarity_function, user_features, n, threshold=0.0):
        """Core functionality that provides the recommendations based on the initialized
        recommender and the described user.
        
        Args:
            similarity_function: (function) takes in two feature vectors and returns a float
            user_features: (np.array) a feature vector representing the user's tastes
            n: (int), number of recommendations to return
            
        Kwargs:
            threshold: (float) the tolerance of the best score
            
        Returns:
            (list of str), a randomly ordered list of the candidates with the highest score
        """
        Rank = namedtuple('Rank', ['word', 'score'])
        
        # This is the costly step, comparing a user feature to all other features
        ranked_candidates = [
            Rank(candidate, similarity_function(self.features[candidate], user_features))
            for candidate in self.candidates
        ]
        ranked_candidates.sort(key=lambda candidate: candidate.score, reverse=True)
        
        # Find the best score and limit results to those within 
        best_score = ranked_candidates[0].score
        top_candidates = [
            candidate
            for candidate in ranked_candidates
            if candidate.score >= best_score - threshold
        ]
        print('Found ', len(top_candidates), 'top candidates to use.')
        
        # Shuffle the results for flavor and return TODO enable to change up results
        #shuffle(top_candidates)
        return [candidate.word for candidate in top_candidates[:n]]
        
    def recommend_on_L2_similarity(self, user_features, n, threshold=0.0):
        """Users the L2 norm for detecting the similarity between two feature vectors and
        finds the best recommendations accordingly.
        
        Args:
            user_features: (np.array) a feature vector representing the user's tastes
            n: (int), number of recommendations to return
            
        Kwargs:
            threshold: (float), the tolerance with which return results scoring within the
                threhold of the best score
            
        Returns:
            (list of str), a randomly ordered list of the candidates with the highest score
        """
        def L2(features_1, features_2):
            # Note higher is better, which is the reverse of the customary L2 score, so flipping
            return -1 * sum((features_1 - features_2)**2)**0.5
        
        return self._recommend(L2, user_features, n, threshold)
                
    def recommend_on_Linf_similarity(self, user_features, n, threshold=0.0):
        """Users a modification of the L-inf norm for detecting the similarity between
        two feature vectors and finds the best recommendations accordingly.
        
        Args:
            user_features: (np.array) a feature vector representing the user's tastes
            n: (int), number of recommendations to return
            
        Kwargs:
            threshold: (float), the tolerance with which return results scoring within the
                threhold of the best score

        Returns:
            (list of str), a randomly ordered list of the candidates with the highest score
        """
        def Linf(features_1, features_2):
            return sum(1
                       for (f1, f2) in zip(features_1, features_2)
                       if f1 > 0 and f2 > 0
                      )
        
        return self._recommend(Linf, user_features, n, threshold)
    
    def recommend_on_TODO_similarity(self, user_features, n):
        """If you'd like try a custom similarity ranker. See the above L2 and Linf
        rankers for the implementation pattern.
        """
        raise Exception('Not Implemented')


"""
Handy Dandy local variable for creating featurizers.
Create a mapping from phones to a feature number.
"""
def get_all_phones(words):
    """Gets
    """
    phones = set([])
    for word in words:
        phones.update(word.phonetic)
        
    return list(phones)

all_phones = get_all_phones(words)
phone_to_id = dict((phone, i) for (phone, i) in zip(all_phones, range(len(all_phones))))

In [4]:
print(phone_to_id)

{'V': 0, 'UH2': 1, 'OY1': 2, 'AY0': 3, 'UH1': 4, 'UH0': 5, 'AY1': 6, 'UW2': 7, 'AW0': 8, 'AA0': 9, 'AH1': 10, 'AA2': 11, 'ER2': 12, 'G': 13, 'P': 14, 'ER0': 15, 'Y': 16, 'AA1': 17, 'SH': 18, 'AO1': 19, 'TH': 20, 'DH': 21, 'IH0': 22, 'UW1': 23, 'OW0': 24, 'IH1': 25, 'W': 26, 'ZH': 27, 'ER1': 28, 'IY1': 29, 'K': 30, 'IY0': 31, 'AO2': 32, 'JH': 33, 'HH': 34, 'IY2': 35, 'Z': 36, 'AW2': 37, 'AE0': 38, 'EH1': 39, 'OW1': 40, 'AY2': 41, 'CH': 42, 'T': 43, 'AW1': 44, 'OY2': 45, 'AE2': 46, 'AH0': 47, 'UW0': 48, 'AO0': 49, 'OY0': 50, 'L': 51, 'AH2': 52, 'D': 53, 'EY1': 54, 'M': 55, 'B': 56, 'IH2': 57, 'EY0': 58, 'EH0': 59, 'AE1': 60, 'S': 61, 'R': 62, 'EY2': 63, 'NG': 64, 'EH2': 65, 'OW2': 66, 'F': 67, 'N': 68}


In [5]:
def rhyming_featurizer(word):
    """Basic featurizer looking for words that end the same.
    
    Args:
        word: (Word object) 
        
    Returns:
        (np.array) with non-zero entries based on the ending phones of the word and its length
    """
    features = np.zeros(len(phone_to_id) + 1)
    
    # TODO use the last 2 phones for more exact rhymes
    if len(word.phonetic) >= 2:
        features[phone_to_id[word.phonetic[-2]]] += 1
        
    features[phone_to_id[word.phonetic[-1]]] += 1
    
    # Adding an extra feature to keep words to roughly the same length
    features[-1] = len(word.phonetic)
    
    return features

rhyming_namer = NameRecommender(words, rhyming_featurizer)

In [14]:
"""
Test out the Rhyming Namer!
"""
liked_words = [
    'fox',
    'box',
    'tax',
    'fax',
    'sacks',
    'pen',
    'fen',
    'den'
]
disliked_words = [
    'hello',
    'fellow',
]
fox_features = rhyming_namer.construct_user_features(liked_words, disliked_words)
fox_L2_rhymes = rhyming_namer.recommend_on_L2_similarity(fox_features, 20)
#fox_Linf_rhymes = rhyming_namer.recommend_on_Linf_similarity(fox_features, 20)
print("""
Rhyming Recommendations with only the highest scores based on seed words: {liked_words}.

Based on L2 Similarity:
    {fox_L2_rhymes}
    
Based on Linf Similarity:
    {fox_Linf_rhymes}
""".format(**locals()))

Found  384 top candidates to use.

Rhyming Recommendations with only the highest scores based on seed words: ['fox', 'box', 'tax', 'fax', 'sacks', 'pen', 'fen', 'den'].

Based on L2 Similarity:
    ['arcs', 'arx', 'asks', 'backes', 'backs', 'bakes', 'balks', 'bask', 'basque', 'bax', "beck's", 'becks', 'beeks', 'berkes', 'beske', 'bikes', 'birkes', 'birks', 'bisque', 'bix']
    
Based on Linf Similarity:
    ['aardvarks', 'abex', 'academics', 'aches', 'acoustics', 'acrobatics', 'acrylics', 'acts(1)', 'adaptaplex', "adaptec's", 'adcox', 'addicks', 'adisq', 'adisq(1)', 'aerobatics', 'aerobics', 'aerodynamics', 'aeronautics', 'aesthetics', 'affix']



In [7]:
def matching_featurizer(word):
    """Featurizer trying to match as many sounds as possible.
    
    Args:
        word: (Word object) 
        
    Returns:
        (np.array) with non-zero entries based on the ending phones of the word and its length
    """
    features = np.zeros(len(phone_to_id) + 1)
    for phone in word.phonetic:
        features[phone_to_id[phone]] += 1
        
    # TODO normalize features

    return features

matching_namer = NameRecommender(words, matching_featurizer)

In [8]:
# Try a matching featurizer to get as many sounds as possible in the recommendations
liked_words = [
    'glass',
    'pass',
    'class',
    'glaze'
]
disliked_words = [
    'hello',
    'there',
]
glass_features = matching_namer.construct_user_features(liked_words, disliked_words)
glass_L2_matches = matching_namer.recommend_on_L2_similarity(glass_features, 10)
print("""
Matching Recommendations based on seed words: {liked_words}.

Top Matches Based on L2 Similarity:
    {glass_L2_matches}
""".format(**locals()))

Found  5 top candidates to use.

Matching Recommendations based on seed words: ['glass', 'pass', 'class', 'glaze'].

Top Matches Based on L2 Similarity:
    ['glas', 'glass', 'lass', 'sal', 'slag']



In [9]:
# Let's relax the threshold and see if we get more results
glass_features = matching_namer.construct_user_features(liked_words, disliked_words)
glass_L2_matches = matching_namer.recommend_on_L2_similarity(glass_features, 30, 0.3)
print("""
Matching Recommendations based on seed words: {liked_words}.

Top Matches Based on L2 Similarity:
    {glass_L2_matches}
""".format(**locals()))

Found  30 top candidates to use.

Matching Recommendations based on seed words: ['glass', 'pass', 'class', 'glaze'].

Top Matches Based on L2 Similarity:
    ['glas', 'glass', 'lass', 'sal', 'slag', 'al', 'al.', 'alps', 'ass', 'class', 'gal', 'gas', 'gass', 'klas', 'klass', "lac's", 'lacks', 'lag', 'lapps', 'laps', 'lapse', 'lask', 'lax', 'plas', 'plass', 'plasse', 'sag', "sal's", 'slack', 'slap']



In [10]:
"""
Group Discovery Time!

What type of name recommender would you like to build?


What type of features should it use?


Build the featurizer and initalize the recommender as laid out below

"""

def pet_featurizer(word):
    """TODO what are good features for pet names?
    
    Args:
        word: (Word object) 
        
    Returns:
        (np.array) with non-zero entries based on the ending phones of the word and its length
    """
    features = np.zeros(len(phone_to_id) + 1)
    
    # TODO

    return features

pet_namer = NameRecommender(words, pet_featurizer)


"""
Now let's use the pet_namer
"""
good_pet_names = [
    'spot',
    'socks'
]
bad_pet_names = [
    'nothing',
    'seriously'
]
pet_features = pet_namer.construct_user_features(good_pet_names, bad_pet_names)
pet_L2_matches = pet_namer.recommend_on_L2_similarity(pet_features, 20)
print("""
Matching Recommendations based on seed words: {good_pet_names}.

Top Matches Based on L2 Similarity:
    {pet_L2_matches}
""".format(**locals()))

Found  133851 top candidates to use.

Matching Recommendations based on seed words: ['spot', 'socks'].

Top Matches Based on L2 Similarity:
    ['!exclamation-point', '"close-quote', '"double-quote', '"end-of-quote', '"end-quote', '"in-quotes', '"quote', '"unquote', '#hash-mark', '#pound-sign', '#sharp-sign', '%percent', '&ampersand', "'allo", "'apostrophe", "'bout", "'cause", "'course", "'cuse", "'em"]



In [11]:
"""
Group Discovery Time!

Test the recommender by setting the liked words and disliked words and running!
"""

def football_featurizer(word):
    """TODO what are good features for football names?
    
    Args:
        word: (Word object) 
        
    Returns:
        (np.array) with non-zero entries based on the ending phones of the word and its length
    """
    features = np.zeros(len(phone_to_id) + 1)
    
    # TODO

    return features


football_namer = NameRecommender(words, football_featurizer)
football_liked_words = ['steelers', 'bears', 'seahawks', 'rams', 'eagles', 'panthers']
football_disliked_words = []

football_features = football_namer.construct_user_features(football_liked_words, football_disliked_words)
football_L2_matches = football_namer.recommend_on_L2_similarity(football_features, 10, threshold=0.0)
print("""
Matching Recommendations based on seed words: {football_liked_words}.

Top Matches Based on L2 Similarity:
    {football_L2_matches}
""".format(**locals()))

Found  133851 top candidates to use.

Matching Recommendations based on seed words: ['steelers', 'bears', 'seahawks', 'rams', 'eagles', 'panthers'].

Top Matches Based on L2 Similarity:
    ['!exclamation-point', '"close-quote', '"double-quote', '"end-of-quote', '"end-quote', '"in-quotes', '"quote', '"unquote', '#hash-mark', '#pound-sign']



In [12]:
football_words = [word for word in words if word.written in football_liked_words]
for word in football_words:
    print(word)

Word(written='bears', phonetic=['B', 'EH1', 'R', 'Z'])
Word(written='eagles', phonetic=['IY1', 'G', 'AH0', 'L', 'Z'])
Word(written='panthers', phonetic=['P', 'AE1', 'N', 'TH', 'ER0', 'Z'])
Word(written='rams', phonetic=['R', 'AE1', 'M', 'Z'])
Word(written='seahawks', phonetic=['S', 'IY1', 'HH', 'AO2', 'K', 'S'])
Word(written='steelers', phonetic=['S', 'T', 'IY1', 'L', 'ER0', 'Z'])
