In [1]:
import nltk
from nltk.corpus import cmudict as cmu

# Flat list of (word, phonemes) pairs; a word with several pronunciation
# variants appears once per variant.
entries = cmu.entries()
# Lookup table keyed by word; the helpers below read arpabet[word][0] to get
# the first pronunciation of a word.
arpabet = cmu.dict()

In [23]:
def get_phonem(text):
    """Return the first pronunciation stored for a word.

    Args:
        text: Token to look up in the module-level ``arpabet`` mapping.

    Returns:
        ``arpabet[text][0]``, i.e. the first pronunciation listed for
        ``text``. If no pronunciation is available, ``text`` itself is
        returned unchanged, so punctuation and unknown words pass through.
    """
    try:
        return arpabet[text][0]
    except (KeyError, IndexError, TypeError):
        # Only "no pronunciation available" falls back to the raw token:
        # unknown word, empty pronunciation list, or an unhashable token.
        # Other errors (e.g. NameError because ``arpabet`` was never loaded
        # in this kernel) must surface instead of being silently swallowed.
        return text

def get_phonem_string(text):
    """Return the first pronunciation of a word as one unspaced string.

    Args:
        text: Token to look up in the module-level ``arpabet`` mapping.

    Returns:
        The phonemes of ``arpabet[text][0]`` concatenated without a
        separator. If no pronunciation is available, ``text`` itself is
        returned unchanged, so punctuation and unknown words pass through.
    """
    try:
        return "".join(arpabet[text][0])
    except (KeyError, IndexError, TypeError):
        # Only "no pronunciation available" falls back to the raw token:
        # unknown word, empty pronunciation list, or an unusable entry.
        # Other errors (e.g. NameError because ``arpabet`` was never loaded
        # in this kernel) must surface instead of being silently swallowed.
        return text

def get_phonem_string_spaced(text):
    """Return the first pronunciation of a word as space-separated phonemes.

    Args:
        text: Token to look up in the module-level ``arpabet`` mapping.

    Returns:
        The phonemes of ``arpabet[text][0]`` joined by single spaces. If no
        pronunciation is available, ``text`` itself is returned unchanged,
        so punctuation and unknown words pass through.
    """
    try:
        return " ".join(arpabet[text][0])
    except (KeyError, IndexError, TypeError):
        # Only "no pronunciation available" falls back to the raw token:
        # unknown word, empty pronunciation list, or an unusable entry.
        # Other errors (e.g. NameError because ``arpabet`` was never loaded
        # in this kernel) must surface instead of being silently swallowed.
        return text

In [28]:
# One raw dictionary entry, to show the (word, phonemes) shape.
print("entry #50:", entries[50])
print()
# Every pronunciation variant listed for "hello".
for entry in (pair for pair in entries if pair[0] == "hello"):
    print(entry)

entry #50: ('abash', ['AH0', 'B', 'AE1', 'SH'])

('hello', ['HH', 'AH0', 'L', 'OW1'])
('hello', ['HH', 'EH0', 'L', 'OW1'])


In [5]:
text = "dream"
# Show the lookup result as a phoneme list and as a joined string.
for lookup in (get_phonem, get_phonem_string):
    print(f"{text}  -->  {lookup(text)}")

dream  -->  ['D', 'R', 'IY1', 'M']
dream  -->  D R IY1 M


In [6]:
text2 = "scheme"
# Show the lookup result as a phoneme list and as a joined string.
for lookup in (get_phonem, get_phonem_string):
    print(f"{text2}  -->  {lookup(text2)}")

scheme  -->  ['S', 'K', 'IY1', 'M']
scheme  -->  S K IY1 M


In [7]:
def rhyme(inp, level):
    """Find words whose pronunciation ends like that of ``inp``.

    Args:
        inp: Word to find rhymes for; matched exactly against the words in
            the module-level ``entries`` list of (word, phonemes) pairs.
        level: Number of trailing phonemes that have to match. Larger
            values demand a closer rhyme. Must be at least 1.

    Returns:
        A list of unique words, in arbitrary order, that share their last
        ``level`` phonemes with at least one pronunciation of ``inp``.
        ``inp`` itself is part of the result whenever it has an entry; the
        list is empty when it has none.

    Raises:
        ValueError: If ``level`` is smaller than 1.
    """
    if level < 1:
        # pron[-0:] is the whole list and a negative level slices from the
        # front, so neither would mean "the last N phonemes".
        raise ValueError("level must be a positive number of phonemes")

    # Tails of every pronunciation variant of the input word; tuples make
    # them hashable, and the set drops variants that end identically.
    tails = {tuple(pron[-level:]) for word, pron in entries if word == inp}
    if not tails:
        return []

    # Single pass over the corpus regardless of how many variants exist.
    return list({word for word, pron in entries if tuple(pron[-level:]) in tails})

In [31]:
# Sanity check: "dream" and "scheme" share their last two phonemes (IY1 M).
"dream" in rhyme("scheme", 2)


True

In [40]:
import pronouncing
# Cross-check with the `pronouncing` package; the slice only picks a small
# window of its rhyme list for "scheme" to keep the output short.
pronouncing.rhymes("scheme")[12:20]
# Disabled alternative call, kept for reference:
#pronouncing.phones_for_word("scheme")

['deam', 'deem', 'diehm', 'diem', 'downstream', 'dream', 'esteem', 'extreme']

# Process rap lyrics into phonemes

In [10]:
import tools.processing as pre
# Load the cleaned lyrics through the project helper and preview the first
# 100 characters. NOTE: this rebinds `text`, which an earlier cell used for a
# single demo word.
text = pre.get_text("data/cleaned-rap-lyrics/clean2_pac.txt")
text[:100]

'as real as it seems the american dream ; is not nothing but another calculated schemes ; to get us l'

### Tokenize cleaned text

In [11]:
# Alias NLTK's word tokenizer so later cells can reuse it.
default_wt = nltk.word_tokenize
# Split the lyrics into tokens; separators such as ";" become their own token.
words = default_wt(text)
# Total number of tokens.
len(words)

118932

In [41]:
# Preview the first few tokens.
words[:9]

['as', 'real', 'as', 'it', 'seems', 'the', 'american', 'dream', ';']

In [43]:
# Show each of the first 17 tokens next to its unspaced and spaced phoneme form.
for word in words[:17]:
    print(word, get_phonem_string(word), get_phonem_string_spaced(word), sep=" \t ")

as 	 AE1Z 	 AE1 Z
real 	 RIY1L 	 R IY1 L
as 	 AE1Z 	 AE1 Z
it 	 IH1T 	 IH1 T
seems 	 SIY1MZ 	 S IY1 M Z
the 	 DHAH0 	 DH AH0
american 	 AH0MEH1RAH0KAH0N 	 AH0 M EH1 R AH0 K AH0 N
dream 	 DRIY1M 	 D R IY1 M
; 	 ; 	 ;
is 	 IH1Z 	 IH1 Z
not 	 NAA1T 	 N AA1 T
nothing 	 NAH1THIH0NG 	 N AH1 TH IH0 NG
but 	 BAH1T 	 B AH1 T
another 	 AH0NAH1DHER0 	 AH0 N AH1 DH ER0
calculated 	 KAE1LKYAH0LEY2TAH0D 	 K AE1 L K Y AH0 L EY2 T AH0 D
schemes 	 SKIY1MZ 	 S K IY1 M Z
; 	 ; 	 ;


In [24]:
# Convert the token list into phoneme text: once with each word's phonemes
# run together, once with the phonemes separated by spaces.
phonem_text = " ".join(map(get_phonem_string, words))
phonem_text_spaced = " ".join(map(get_phonem_string_spaced, words))

# Tokenize both variants again and preview the first tokens of each.
phonem_words = default_wt(phonem_text)
phonem_words_spaced = default_wt(phonem_text_spaced)
print(phonem_words[:9])
print(phonem_words_spaced[:9])

['AE1Z', 'RIY1L', 'AE1Z', 'IH1T', 'SIY1MZ', 'DHAH0', 'AH0MEH1RAH0KAH0N', 'DRIY1M', ';']
['AE1', 'Z', 'R', 'IY1', 'L', 'AE1', 'Z', 'IH1', 'T']


In [27]:
# Hand the unspaced phoneme text to the project helper `pre.write_text`
# (presumably writes it to the given path -- confirm in tools.processing).
pre.write_text("data/phonem-rap-lyrics/phonem2_pac.txt", phonem_text)

### Transfer findings into our own library

In [44]:
import tools.phonetics as phon

In [46]:
# Same kind of lookup as above, now through the extracted tools.phonetics module.
phon.get_phonem("library")

['L', 'AY1', 'B', 'R', 'EH2', 'R', 'IY0']

In [48]:
# Rhyme search through the extracted tools.phonetics module, level 4.
phon.rhyme("library", 4)

['library', 'arbitrary']