In [47]:
import pandas as pd
import pronouncing # we use this library to get rhymes
import random

In [48]:
df = pd.read_csv('https://raw.githubusercontent.com/estambolieva/asigmo_python/master/data/poem_line_data.csv')

In [49]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Poet,Lines,Last_Word
0,0,Song of Myself (1892 version),Walt Whitman,1,1
1,1,Song of Myself (1892 version),Walt Whitman,"I celebrate myself, and sing myself,",myself
2,2,Song of Myself (1892 version),Walt Whitman,"And what I assume you shall assume,",assume
3,3,Song of Myself (1892 version),Walt Whitman,For every atom belonging to me as good belongs...,you
4,4,Song of Myself (1892 version),Walt Whitman,"I loafe and invite my soul,",soul


In [50]:
df.shape

(40427, 5)

In [51]:
df.Lines = df.Lines.astype(str)
df.Title = df.Title.astype(str)
df.Poet = df.Poet.astype(str)
df.Last_Word = df.Last_Word.astype(str)
df.Last_Word = df.Last_Word.str.lower()

In [52]:
word = 'unmask'
rhy = pronouncing.rhymes(word)
rhy.append(word) # <- very very cheap operation for us to execute
rhy

['ask',
 'bask',
 'basque',
 'cask',
 'flask',
 'lask',
 'mask',
 'pasch',
 'pask',
 'pasque',
 'rask',
 'task',
 'trask',
 'unmask']

### Problem - create a new feature which groups the `Last_Word` into rhyming categories

#### 1. Experiment

In [5]:
last_word = df.loc[4].Last_Word
last_word # <- let's find all `Lat_word` which rhyme with `soul`

'soul'

In [6]:
%timeit rhymes = df.apply(lambda row: (last_word in pronouncing.rhymes(row['Last_Word'])), axis=1) 
# get a series which contains only True/False. e.g. whether the last word in each line rhymes with the one given

1.21 s ± 82.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
rhymes = df.apply(lambda row: (last_word in pronouncing.rhymes(row['Last_Word'])), axis=1) 

In [20]:
rhyme_indices = list(rhymes[rhymes == True].index) # gets all indices of last words which rhyme with map
ten_random_indices = random.sample(rhyme_indices, 10) # get only 10 randomly selected indices

for i in ten_random_indices: # prints the last words at these 10 indices for visual inspection
    print(df.loc[i].Last_Word)

roll
pole
whole
toll
Hole
Espanol
hole
control
whole
Soul


In [21]:
df[df.index.isin(ten_random_indices)]

Unnamed: 0.1,Unnamed: 0,Title,Poet,Lines,Last_Word
11780,13552,Falling Water,John Koethe,Adding to the integrity of the whole.,whole
14114,15901,The Snake Doctors,Frank Stanford,The moon shined through the chink hole,hole
16072,18134,The Untold Witch,Keith Waldrop,There are whole,whole
18150,20257,Cruising 99,Garrett Hongo,VII.Body & Fender/Body & Soul,Soul
21146,24098,The Ballad of Reading Gaol,Oscar Wilde,Pent up in Murderer's Hole?,Hole
21671,24703,Puerto Rican Obituary,Pedro Pietri,Se Habla Espanol,Espanol
24346,27837,Letters to America (An Abecedary),Fred D'Aguiar,I mean certain legends about flight that grow ...,control
31981,36673,from The Ambition of Ghosts: I. Remembering i...,Rosmarie Waldrop,"roll,",roll
37986,43993,Autobiography: New York,Charles Reznikoff,slapping the flag-pole,pole
38980,45093,Song of the Andoumboulou: 138,Nathaniel Mackey,...,toll


**Observation** - Option 1: Looks good. 👍

It seems like the 10 randomly inspected rhymes to `soul` indeed rhyme with the word.


**Problem** To do this on the full data frame it will take `1 second` x 40 427, or around `11 hours` to complete 👎

# So how do we make this better?

* iterate over each last word in the dictironary. Estimated ron time O(n) - and ~ `2.5 ms`
* return the list of all words which rhyme with a given one. Estimated run time O(n) - and ~ `6.2 ms`
* add each word to the word:rhyme_caterogy dictionary. Estimated run time O(n) - and ~ `34.2 ns`
* check if the word exists as a key in the dictionary first before adding it. Estimated run time O(n) and ~ `69.2 ns`

In [22]:
def find_rhyme_categories():
    for last_word in df.Last_Word:
        j = 0
        
%timeit find_rhyme_categories()

2.53 ms ± 19.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [45]:
%timeit pronouncing.rhymes(last_word)

6.2 µs ± 32 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [46]:
d = {}
%timeit d['key'] = 'value'

34.2 ns ± 0.587 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [47]:
%timeit 'word' in d.keys()

69.2 ns ± 0.614 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


## Combine all of these together and .... ⬇️

In [53]:
word_rhymecategory_dict = {}

def find_rhyme_categories():
    cnt = 0
    for last_word in df.Last_Word:
        if last_word not in word_rhymecategory_dict.keys():
            cnt += 1
            word_rhymecategory_dict[last_word] = last_word
            for rhyme in pronouncing.rhymes(last_word):
                word_rhymecategory_dict[rhyme] = last_word
    print('Added', cnt, 'unique rhyme words to the dictionary.')
        
find_rhyme_categories()

Added 6059 unique rhyme words to the dictionary.


In [55]:
df['Rhyme_Categories'] = df.Last_Word.map(word_rhymecategory_dict)

In [56]:
df

Unnamed: 0.1,Unnamed: 0,Title,Poet,Lines,Last_Word,Rhyme_Categories
0,0,Song of Myself (1892 version),Walt Whitman,1,1,1
1,1,Song of Myself (1892 version),Walt Whitman,"I celebrate myself, and sing myself,",myself,myself
2,2,Song of Myself (1892 version),Walt Whitman,"And what I assume you shall assume,",assume,assume
3,3,Song of Myself (1892 version),Walt Whitman,For every atom belonging to me as good belongs...,you,you
4,4,Song of Myself (1892 version),Walt Whitman,"I loafe and invite my soul,",soul,soul
...,...,...,...,...,...,...
40422,46779,Ordinary Time,Tim Dlugos,"other place, the undescribed",undescribed,undescribed
40423,46781,Ordinary Time,Tim Dlugos,"and indescribable, more various",various,various
40424,46783,Ordinary Time,Tim Dlugos,and cacophonous than voice,voice,voice
40425,46785,Ordinary Time,Tim Dlugos,"can tell or mind conceive,",conceive,receive


In [57]:
df['Rhyme_Categories'].value_counts().shape[0]

6014