#Code to extract perfect and imperfect rhymes from songs.
Please upload rhyming_dictionaries.pickle and billboard_lyrics.csv before preceeding. They can be found in the same github repository.

Imports necessary packages and libraries

In [1]:
# imports packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

!pip install nltk

import nltk

import pickle
import os
from nltk.tokenize import word_tokenize
nltk.download('punkt') # may be needed the first time
import difflib
import string



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The methods below were written by Andrei Popescu-Belis, Àlex R. Atrio, Bastien Bernath, Etienne Boisson, Teo Ferrari, Xavier Theimer-Lienhard, Giorgos Vernikos. They were not modified by me.

In [6]:
'''
gets a rhyming dictionary created by the authors listed above.
To run this method, upload rhyming_dictionaries from github.
'''
def get_rhyming_dictionary(path="/content/rhyming_dictionaries (1).pickle"):
    with open(path,"rb") as fd:
        word2rhymes, _, _ = pickle.load(fd)
    return word2rhymes


'''
    Returns an array with two elements: (1) the perfect rhyme and (2) the assonant
    rhyme of the verse (its last word, except punctuations and contractions).
    If the word is not found in the dictionary, it looks for the most similar one.
'''
def verse2rhyme(verse, rhyme_dict):
    punctuation = ['.', ',', ',', ':', ';', '!', '?', ' ', '-', '...', '_']
    verse = verse.replace('’', '\'')
    v = word_tokenize(verse)
    v = [w for w in v if w not in punctuation] # remove all punctuations
    if len(v) == 0:
        return []
    if v[-1] == 'n\'t':             # tokenizer's output on contraction: don't -> do, n't
        final_word = v[-2] + v[-1]  # restore full form (v[-2] is necessarily present)
    elif v[-1] == '\'d':            # for contraction of past participle (Shakespeare!)
        final_word = v[-2] + 'ed'
    else:
        final_word = v[-1]
    final_word = final_word.lower()
    if final_word not in rhyme_dict: # find a similar word that *is* in the dictionary
        similar_words = difflib.get_close_matches(final_word, rhyme_dict.keys(), n=1) # time consuming
        if similar_words == []:
            return ['', ''] # if it couldn't find anything
        else:
            return rhyme_dict[similar_words[0]]
    return rhyme_dict[final_word]

The methods below were created and modified by me.

In [4]:
'''
method to get information about a specific year
'''
def df_year(year):
  df_year = df.loc[df["Year"] == year]
  return df_year

'''
takes in a poem (or song), a rhyming dictionary, and two empty lists.
Returns full lists of all perfect and imperfect rhymes in the song.
Some lines of code taken from Popescu et. al.
'''
def print_all_rhymes(poem, rhyme_dict, types_perfect, types_imperfect):
    global stanzas_total
    global stanzas_examined
    stanzas_total = stanzas_total +1
    num_perfect = 0
    num_imperfect = 0
    num_not = 0
    real = True
    lines = poem.split('\n')
    # only counts imperfect rhmyes in stanzas that fit the requirements of length
    #and not too long-distance (based on prelimilary research from looking at the corpus)
    if len(lines) <2:
      return types_perfect, types_imperfect
    if len(lines) > 6:
      return types_perfect, types_imperfect
    # counts number of stanzas examined
    stanzas_examined = stanzas_examined +1
    #accounts for the case when the lines have unnecssary spaces:
    for index, p in enumerate(lines):
      if p == '':
        lines.remove(lines[index])
    for index, p in enumerate(lines):
      if p.isspace():
        lines.remove(lines[index])
    # gets the rhyming parts of each word
    rhymes = [verse2rhyme(line, rhyme_dict) for line in lines]
    # goes through and compares each line to the other
    for i in range(len(lines)):
        for j in range(len(lines)-1):
            if rhymes[i] == [] or rhymes[j] == []:
              real == True
            elif j >= i:
                real == True
            # Makes to not to count any rhymes that are caused simply by words that are the same
            elif (lines[i].split(" "))[-1] == (lines[j].split(" "))[-1]:
              real = True
            # if statement in case the words do make a perfect rhyme
            elif rhymes[i][0] == rhymes[j][0]:
                num_perfect = num_perfect+1
                if rhymes[i][0] in types_perfect.keys():
                  types_perfect[rhymes[i][0]] = types_perfect[rhymes[i][0]]+1
                else:
                  types_perfect[rhymes[i][0]] = 1
            # if statement in case the words do make an imperfect rhyme
            elif rhymes[i][1] == rhymes[j][1]:
                num_imperfect = num_imperfect+1
                if (rhymes[i][0], rhymes[j][0]) in types_imperfect.keys():
                  types_imperfect[(rhymes[i][0], rhymes[j][0])] = types_imperfect[(rhymes[i][0], rhymes[j][0])] +1
                else:
                  types_imperfect[(rhymes[i][0], rhymes[j][0])] = 1
            else:
                num_not = num_not+1
    return types_perfect, types_imperfect

'''
Method inputs include a song, two emtpy lists, and a couting variable. The method
outputs the imperfect and perfect rhymes and counts if the song was too short.
'''
def one_song_rhymes(Current_song, types_perfect, types_imperfect, not_enough_stanzas):
  punctuation = ['.', ',', ',', ':', ';', '!', '?', ' ', '-', '...', '_']
  stanzas = Current_song.split("\n\n")
  if len(stanzas) ==1:
    stanzas = Current_song.split("\n\r")
  if len(stanzas) <3:
    not_enough_stanzas = not_enough_stanzas+1
    return types_perfect, types_imperfect, not_enough_stanzas
  for stanza in stanzas:
    stanza = stanza.replace('[].,,:!? -..._]', '')
    stanza = stanza.replace('\r', '')
    stanza = stanza.replace('"', '')
    types_perfect, types_imperfect = print_all_rhymes(stanza, word2rhymes, types_perfect, types_imperfect)
  return types_perfect, types_imperfect, not_enough_stanzas

Code to call methods above

In [7]:
#creates the rhyming dictionary
word2rhymes = get_rhyming_dictionary()

#reads the csv
df = pd.read_csv("billboard_lyrics.csv")

#defines global variables
global stanzas_examined
global stanzas_total

# creates variables
df_year = df

types_perfect = {}
types_imperfect = {}
invalid_lyrics = 0
instrumental = 0
not_enough_stanzas = 0

stanzas_examined = 0
stanzas_total = 0

# cleans up a song and then calls the one song rhymes method.
for index, row in df_year.loc[:, ['Lyrics']].iterrows():
  Current_song = row['Lyrics']
  if Current_song.startswith("sorry, we have no") | Current_song.startswith("we do not have the lyrics for") :
    invalid_lyrics = invalid_lyrics+1
    continue
  # accounts for songs with no lyrics
  if (Current_song == "[instrumental]"):
    invalid_lyrics = invalid_lyrics + 1
    instrumental = instrumental +1
  cleaned_lyrics = re.sub(" him: | her: |both:", '', Current_song, flags=re.MULTILINE)
  cleaned_lyrics2 = re.sub("\[.*?\]", '', cleaned_lyrics)
  cleaned_twice = re.sub("[^0-9a-zA-Z\n\r ']+", '', cleaned_lyrics2)
  types_perfect, types_imperfect, not_enough_stanzas = one_song_rhymes(cleaned_twice, types_perfect, types_imperfect, not_enough_stanzas)
  print(index, row)

#prints the perfect and imperfect rhymes and other statistics about the data
print("perfect")
print(types_perfect)
print("imperfect")
print(types_imperfect)
print(index)
print("invalid" + str(invalid_lyrics))
print("instrumental")
print(instrumental)
print("not_enough_stanzas")
print(not_enough_stanzas)
print("total stanzas")
print(stanzas_total)
print("stanzas examined")
print(stanzas_examined)

0 Lyrics    uno, dos, one, two, tres, quatro\nmatty told h...
Name: 0, dtype: object
1 Lyrics    ooh, sugar pie, honey bunch\nyou know that i l...
Name: 1, dtype: object
2 Lyrics    hey look\ni can't get no satisfaction\ni can't...
Name: 2, dtype: object
3 Lyrics    verse 1\n\nwhen i woke up this morning\nyou we...
Name: 3, dtype: object
4 Lyrics    you never close your eyes anymore when i kiss ...
Name: 4, dtype: object
5 Lyrics    when you're alone and life is making you lonel...
Name: 5, dtype: object
6 Lyrics    help, i need somebody\nhelp, not just anybody\...
Name: 6, dtype: object
7 Lyrics    every time i see you lookin' my way\nbaby, bab...
Name: 7, dtype: object
8 Lyrics    you saw me crying in the chapel\nthe tears i s...
Name: 8, dtype: object
9 Lyrics    i've got sunshine\r\non a cloudy day\r\nwhen i...
Name: 9, dtype: object
10 Lyrics    well since she put me down i 've been out doin...
Name: 10, dtype: object
11 Lyrics    trailer for sale or rent, rooms to let, fifty ...


Once the data has been obtained, the following methods sort and organize the data.

In [8]:
'''
code taken from: https://www.geeksforgeeks.org/python-sort-python-dictionaries-by-key-or-value/
'''
def sort_dict(my_dict):
  keys = list(my_dict.keys())
  values = list(my_dict.values())
  sorted_value_index = np.argsort(values)
  my_dict2 = {keys[i]: values[i] for i in sorted_value_index}
  my_dict3 = dict(reversed(list(my_dict2.items())))
  return my_dict3

'''
Sorts a dictionary.
For this method, I ignore which came first in the rhyme, but if you want that,
 just delete the frozenset word. Rhyme counts account for only the last consonant
'''
def consonant_only(my_dict):
  no_vowels = dict()
  for key in my_dict:
    new_key0 = re.sub("[aeiouyh]", "", key[0])
    new_key1 = re.sub("[aeiouyh]", "", key[1])
    if len(new_key0) > 1:
      new_key0 = new_key0[-1]
    if len(new_key1) > 1:
      new_key1 = new_key1[-1]
    new_key = (new_key0, new_key1)
    new_key = frozenset(new_key)
    if new_key in no_vowels.keys():
      no_vowels[new_key] = no_vowels[new_key] + my_dict[key]
    else:
      no_vowels[new_key] = my_dict[key]
  no_vowels3 = sort_dict(no_vowels)
  sorted_dict = {}
  for s in no_vowels3:
    sorted_dict[tuple(s)] = no_vowels3.get(s)

  return sorted_dict

'''
Sorts a dictionary and keeps consonants.
'''
def consonants_only(my_dict):
  no_vowels = dict()
  for key in my_dict:
    new_key0 = re.sub("[aeiouyh]", "", key[0])
    new_key1 = re.sub("[aeiouyh]", "", key[1])
    new_key = (new_key0, new_key1)
    new_key = frozenset(new_key)
    if new_key in no_vowels.keys():
      no_vowels[new_key] = no_vowels[new_key] + my_dict[key]
    else:
      no_vowels[new_key] = my_dict[key]
  no_vowels3 = sort_dict(no_vowels)
  sorted_dict = {}
  for s in no_vowels3:
    sorted_dict[tuple(s)] = no_vowels3.get(s)

  return sorted_dict

'''
sorts the dictionary and keeps consonants unless they match each other
'''
def some_consonants_only(my_dict):
  no_vowels = dict()
  for key in my_dict:
    new_key0 = re.sub("[aeiouy]", "", key[0])
    new_key1 = re.sub("[aeiouy]", "", key[1])
    # for loop repeats three times because matching clusters can be up to three units long
    for p in range(3):
      if new_key0 == 'n' and new_key1 == 'ng':
        break
      if new_key0 == 'ng' and new_key1 == 'n':
        break
      if len(new_key0) > 0 and len(new_key1) > 0:
        if new_key0[0] == new_key1[0]:
          l0 = list(new_key0)
          l1 = list(new_key1)
          l0[0] = ''
          l1[0] = ''
          new_key0 = ''.join(l0)
          new_key1 = ''.join(l1)
    new_key = (new_key0, new_key1)
    new_key = frozenset(new_key)
    if new_key in no_vowels.keys():
      no_vowels[new_key] = no_vowels[new_key] + my_dict[key]
    else:
      no_vowels[new_key] = my_dict[key]
  no_vowels3 = sort_dict(no_vowels)
  sorted_dict = {}
  for s in no_vowels3:
    sorted_dict[tuple(s)] = no_vowels3.get(s)

  return sorted_dict


'''
code to remove the vowels of the perfect rhymes to see which consonants are used
most.
'''
def consonants_only_perfect(my_dict):
  no_vowels = dict()
  for key in my_dict:
    new_key0 = re.sub("[aeiouyh]", "", key)
    new_key = new_key0
    if new_key in no_vowels.keys():
      no_vowels[new_key] = no_vowels[new_key] + my_dict[key]
    else:
      no_vowels[new_key] = my_dict[key]
  no_vowels3 = sort_dict(no_vowels)
  sorted_dict = {}
  for s in no_vowels3:
    sorted_dict[s] = no_vowels3.get(s)
  return sorted_dict

The code below can be used to obtain sorted dictionaries

In [10]:
sorted_perfect = sort_dict(types_perfect)
sorted_imperfect = sort_dict(types_imperfect)
print(sorted_perfect)
print(sorted_imperfect)

{'uw': 56, 'iy': 52, 'ayt': 30, 'ay': 30, 'ey': 25, 'ow': 24, 'aon': 17, 'ahn': 13, 'ehr': 12, 'aynd': 11, 'own': 10, 'aor': 9, 'iyz': 8, 'eys': 8, 'ayd': 8, 'ahv': 8, 'er': 7, 'ihng': 7, 'ihn': 5, 'ayz': 5, 'iyt': 5, 'aym': 4, 'owld': 4, 'eyz': 4, 'owl': 4, 'uwz': 4, 'aed': 3, 'aend': 3, '': 3, 'aet': 3, 'awnd': 3, 'awn': 3, 'aw': 2, 'oyz': 2, 'ays': 2, 'aart': 2, 'uwm': 2, 'aen': 2, 'ehd': 2, 'aost': 2, 'aong': 2, 'ehnt': 1, 'owz': 1, 'aord': 1, 'aat': 1, 'ihps': 1, 'ayn': 1, 'eyd': 1, 'ert': 1, 'erv': 1, 'iyd': 1, 'eyv': 1, 'ahm': 1, 'ihp': 1, 'uwp': 1, 'awt': 1, 'uhd': 1, 'ehnd': 1, 'ehl': 1, 'aek': 1, 'ihl': 1, 'awd': 1, 'uwk': 1, 'aens': 1, 'ayl': 1}
{('ahn', 'ahv'): 9, ('ay', 'ayd'): 9, ('awnd', 'awn'): 9, ('iy', 'iyz'): 8, ('ahv', 'ahn'): 8, ('own', 'owm'): 8, ('iyk', 'iy'): 6, ('aym', 'ay'): 5, ('ayts', 'ays'): 4, ('aon', 'aong'): 4, ('iy', 'iyt'): 4, ('iyt', 'iy'): 4, ('aaz', 'aaks'): 4, ('ahn', 'ahz'): 4, ('owm', 'own'): 4, ('ahn', 'ahm'): 4, ('uw', 'uwl'): 4, ('aos', 'aost'