In [1]:
import nltk
from nltk.corpus import cmudict as cmu

# Lookup table: word -> list of pronunciations (each a list of phoneme strings).
arpabet = cmu.dict()
# The same data as a flat sequence of (word, phonemes) pairs.
entries = cmu.entries()

In [None]:
# Collects every word the phoneme lookup helpers fail to find in `arpabet`.
unknown_words = set()

In [None]:
def get_phonem(text):
    """Return the first pronunciation of ``text`` as a list of phonemes.

    The pronunciation is read from the notebook-level ``arpabet`` dict.
    A word that has no entry there is added to the notebook-level
    ``unknown_words`` set and returned unchanged, so for unknown words the
    caller receives the original string instead of a list.
    """
    try:
        return arpabet[text][0]
    except (KeyError, IndexError):
        # Only a failed lookup means "unknown word". Anything else (for
        # example a NameError because the setup cell was not run) must
        # surface instead of silently marking every word as unknown.
        unknown_words.add(text)
        return text

def get_phonem_string(text):
    """Return the first pronunciation of ``text`` with its phonemes fused.

    The phonemes are read from the notebook-level ``arpabet`` dict and
    concatenated without a separator. A word that has no entry is added to
    the notebook-level ``unknown_words`` set and an empty string is returned.
    """
    try:
        return "".join(arpabet[text][0])
    except (KeyError, IndexError):
        # Only a failed lookup means "unknown word"; other errors (e.g. a
        # NameError from a missing setup cell) should not be swallowed.
        unknown_words.add(text)
        return ""

def get_phonem_string_spaced(text):
    """Return the first pronunciation of ``text`` as space-separated phonemes.

    The phonemes are read from the notebook-level ``arpabet`` dict and joined
    with single spaces. A word that has no entry is added to the
    notebook-level ``unknown_words`` set and an empty string is returned.
    """
    try:
        return " ".join(arpabet[text][0])
    except (KeyError, IndexError):
        # Only a failed lookup means "unknown word"; other errors (e.g. a
        # NameError from a missing setup cell) should not be swallowed.
        unknown_words.add(text)
        return ""

In [None]:
print("entry #50:", entries[50])
print()
# List every entry whose word is "hello" (one entry per pronunciation).
for entry in filter(lambda pair: pair[0] == "hello", entries):
    print(entry)

In [None]:
text = "dream"
# Show the phoneme list first, then the fused phoneme string.
for lookup in (get_phonem, get_phonem_string):
    print(text, " --> ", lookup(text))

In [None]:
text2 = "scheme"
# Show the phoneme list first, then the fused phoneme string.
for lookup in (get_phonem, get_phonem_string):
    print(text2, " --> ", lookup(text2))

In [None]:
# Find rhymes for a word.
# `level` sets how strict the rhyme is: the last `level` phonemes must match.
def rhyme(inp, level):
    """Return the words in ``entries`` that rhyme with ``inp``.

    Two words rhyme when the last ``level`` phonemes of their pronunciations
    are identical. Every pronunciation of ``inp`` found in the notebook-level
    ``entries`` is considered, and ``inp`` itself is part of the result
    whenever it has an entry.

    Args:
        inp: word to look up, spelled as it appears in ``entries``.
        level: number of trailing phonemes to compare; must be at least 1.

    Returns:
        Alphabetically sorted list of unique rhyming words; empty when
        ``inp`` has no entry.

    Raises:
        ValueError: if ``level`` is smaller than 1.
    """
    if level < 1:
        # seq[-0:] is the whole sequence and a negative level slices from the
        # front, so such values would silently change what is compared.
        raise ValueError("level must be at least 1")

    # Tails of all pronunciations of `inp`, computed once (tuples so they can
    # live in a set).
    tails = {tuple(pron[-level:]) for word, pron in entries if word == inp}
    if not tails:
        return []

    # Single pass over the dictionary; sorted so the result is reproducible.
    return sorted({word for word, pron in entries if tuple(pron[-level:]) in tails})

In [None]:
"dream" in rhyme("scheme", 2)


In [None]:
# Cross-check with the third-party `pronouncing` package: show a slice
# (items 12-19) of the rhymes it returns for "scheme".
import pronouncing
pronouncing.rhymes("scheme")[12:20]
#pronouncing.phones_for_word("scheme")

# Process rap lyrics to phonemes

In [2]:
import tools.processing as pre
# Read the three cleaned lyric files; the path order matches the target names.
text_2pac, text_kidcudi, text_rakim = map(
    pre.get_text,
    (
        "data/cleaned-rap-lyrics/clean2_pac.txt",
        "data/cleaned-rap-lyrics/cleankid_cudi.txt",
        "data/cleaned-rap-lyrics/cleanrakim.txt",
    ),
)
# Preview the first 100 characters of the 2Pac text.
text_2pac[:100]

'as real as it seems the american dream ; is not nothing but another calculated schemes ; to get us l'

### Tokenize cleaned text

In [None]:
# Tokenize each artist's text with NLTK's default word tokenizer.
default_wt = nltk.word_tokenize
words_2pac, words_kidcudi, words_rakim = (
    default_wt(lyrics) for lyrics in (text_2pac, text_kidcudi, text_rakim)
)
# Number of tokens in the 2Pac text.
len(words_2pac)

In [None]:
# First nine tokens per artist, one list per line.
print(words_2pac[:9], words_kidcudi[:9], words_rakim[:9], sep="\n")

In [None]:
# For the first 17 tokens: the word, its fused phonemes and its spaced phonemes.
for word in words_2pac[:17]:
    print(word, get_phonem_string(word), get_phonem_string_spaced(word), sep=" \t ")

In [None]:
# Convert the tokenized lyrics into phoneme text.
# 2Pac is converted twice: phonemes fused per word, then space-separated.
phonem_text = " ".join(map(get_phonem_string, words_2pac))
phonem_2pac = " ".join(map(get_phonem_string_spaced, words_2pac))

phonem_kidcudi = " ".join(map(get_phonem_string_spaced, words_kidcudi))
phonem_rakim = " ".join(map(get_phonem_string_spaced, words_rakim))

# Re-tokenize both 2Pac variants and preview the first tokens of each.
phonem_words = default_wt(phonem_text)
print(phonem_words[:9])

phonem_words_spaced = default_wt(phonem_2pac)
print(phonem_words_spaced[:9])

In [None]:
# Number of distinct words the lookup helpers have recorded as unknown so far.
len(unknown_words)

In [None]:
# Dump the unknown words to a text file, one word per line.
# Sorted because set iteration order is not stable across runs; this keeps
# the printout and the written file reproducible.
if unknown_words:
    unknown_words_list = "\n".join(sorted(unknown_words))
    print(unknown_words_list)
    pre.write_text("data/cleaned-rap-lyrics/unknown_words.txt", unknown_words_list)

In [None]:
# Create a pronunciation dict for the unknown words from a text file with
# one "word:PH1 PH2 ..." entry per line.
unknown_dict = {}
unknown_list = pre.get_text("data/cleaned-rap-lyrics/unknown_words_dict.txt")
splits = unknown_list.split("\n")
for line in splits:
    if not line.strip():
        # Blank lines (e.g. from a trailing newline) carry no entry.
        continue
    try:
        word, phonem = line.split(":")
    except ValueError:
        # Not exactly one ":" on the line: report it and skip it, rather than
        # storing the previous line's values again (or failing with a
        # NameError when the very first line is malformed).
        print(line)
        continue
    # Same shape as the CMU dict values: a list of pronunciations, each a
    # list of phoneme strings.
    unknown_dict[word.lower()] = [phonem.split(" ")]

In [None]:
# Sizes of the CMU dictionary and of the hand-made unknown-word dictionary.
print(len(arpabet), len(unknown_dict), sep="\n")

In [None]:
# Merge both dictionaries into a fresh dict; on a clash the unknown-word
# entry replaces the existing one.
arpabet = dict(arpabet)
arpabet.update(unknown_dict)

In [None]:
# saving in file
#pre.write_text("data/phonem-rap-lyrics/phonem_2pac.txt", phonem_2pac)
#pre.write_text("data/phonem-rap-lyrics/phonem_kidcudi.txt", phonem_kidcudi)
#pre.write_text("data/phonem-rap-lyrics/phonem_rakim.txt", phonem_rakim)
#pre.write_text("data/phonem-rap-lyrics/phonem_all3.txt", phonem_2pac + phonem_kidcudi + phonem_rakim)

### Transfer findings into own library

In [3]:
import tools.phonetics as phon

In [4]:
# Same lookup as the notebook helper, now served by the `tools.phonetics` module.
phon.get_phonem("library")

['L', 'AY1', 'B', 'R', 'EH2', 'R', 'IY0']

In [5]:
# Rhymes for "library" via the module version of rhyme(), called with level 4.
phon.rhyme("library", 4)

['library', 'arbitrary']

In [None]:
# Create our own rhyme dictionary: for every dictionary word with more than
# one WordNet synset, print up to ten level-2 rhymes.
# (Iterate over sorted(arpabet) instead for alphabetical output.)
count = 0
for word in arpabet:
    if len(nltk.corpus.wordnet.synsets(word)) <= 1:
        continue
    print(word, phon.rhyme(word, 2)[:10])
    print()
    count += 1
# Number of words that passed the synset filter.
print(count)

## cleaned text combined

In [6]:
# Convert the combined cleaned lyrics to phoneme text with the `phon` module.
cleaned_rap_lyrics = pre.get_text("data/cleaned-rap-lyrics/ref_text3.txt")
phonem_all = phon.text_to_phonem(cleaned_rap_lyrics)

# NOTE: this rebinds the notebook-level `unknown_words` that the helper
# functions defined earlier in this notebook add to.
# Presumably these are the words text_to_phonem could not look up - confirm
# in tools.phonetics.
unknown_words = phon.get_unknown_words()
print("#unknown:", len(unknown_words))

if len(unknown_words) > 0:
    # Pronunciations for the unknown words are generated with the Logios
    # lextool and read back from unknown_words_dict.txt:
    # http://www.speech.cs.cmu.edu/tools/lextool.html
    unknown_list = pre.get_text("data/cleaned-rap-lyrics/unknown_words_dict.txt")
    unknown_dict = phon.create_unknown_dict_from_text(unknown_list)

    phon.update_arpabet(unknown_dict)

    # Second conversion pass after registering the extra pronunciations; the
    # unknown count is printed again for comparison with the first pass.
    phonem_all = phon.text_to_phonem(cleaned_rap_lyrics) 
    unknown_words = phon.get_unknown_words()
    print("#unknown:", len(unknown_words))

# Write the phoneme text of the latest conversion pass to disk.
pre.write_text("data/phonem-rap-lyrics/phonem_all.txt", phonem_all)

#unknown: 1410

#unknown: 0
