<a href="https://colab.research.google.com/github/daniel-youn/499_Project/blob/main/499_project_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Download corpus
from collections import defaultdict
import requests
url = "https://raw.githubusercontent.com/sravanareddy/rhymedata/master/english_raw/-.txt"
authors = [
    "more","wyatt","constable","daniel","drayton","fletcher","griffin","jonson","lodge",
    "lovelace","milton","shakespeare","sidney","spenser","smith","dryden","finch","pope",
    "prior","swift","byron","coleridge","goldsmith","shelley","turner","wordsworth",
    "brooke","chesterton","crosland","housman","kipling","thomas",
    ]
text = ""
for author in authors:
  newurl = url.replace('-',author)
  response = requests.get(newurl)
  response.raise_for_status() # Raise an exception for invalid HTTP status codes
  text += "\n" + response.text
  # line_pairs.extend(parse_file(text_data))

# parse corpus and remove puncuation
text = text.replace(',','').replace('.','').replace('?','').replace('!','').replace(';','').replace(':','').replace('"','')
text = text.replace('  ', ' ')
text = text.split("\n")
for i in range(len(text)):
  text[i] = text[i].strip()

# initialize vocab set
# and make line pairs of rhyming lines
vocab = set()
grapheme_pairs = []
for i in range(len(text)):
  line = text[i]
  if (line[:5] != "RHYME") or (text[i+1] != ""): # if not start of new poem
    continue
  line = line.split(" ")
  map = defaultdict(list)
  for j in range(1,len(line)): # get indices for each kind of rhyming lines
    map[line[j]].append(j)
    for w in text[i+j+1].split(' '): # add all words to vocab
      vocab.add(w)

  for vec in map.values(): # for each rhyming lines
    # print(vec)
    for k in range(len(vec)-1): # each pair of lines that rhyme
      # TODO: optional: can remove for more data
      if k%2 == 1: # no duplicates
        continue
      # grapheme_pairs.append([text[i+vec[k]+1].split(' '),text[i+vec[k]+2].split(' ')])
      grapheme_pairs.append([text[i+vec[k]+1],text[i+vec[k+1]+1]])
      # grapheme_pairs.append([text[i+vec[k]+1].split(' ')[-1],text[i+vec[k]+2].split(' ')[-1]])

print("size of vocab:", len(vocab))
print("size of text:", len(text))
print("size of grapheme_pairs:", len(grapheme_pairs))
print(grapheme_pairs[:25])


size of vocab: 42917
size of text: 130741
size of grapheme_pairs: 28870
[['Wise men alway', 'Affirm and say'], ["That best 'tis for a man", 'The business that he can'], ['Diligently', 'For to apply'], ['And in no wise', 'To enterprise'], ['Another faculty', 'Is never like to thrive'], ['For he that will', 'And can no skill'], ['He that hath left', "The hosier's craft"], ['And falleth to making shone', 'His thrift is well-nigh done'], ['The smith that shall', 'To painting fall'], ['A black draper', 'With white paper'], ['To go to writing school', 'I ween shall prove a fool'], ['An old butler', 'Become a cutler'], ['And an old trot', 'That can God wot'], ['Nothing but kiss the cup', 'Till she have soused him up'], ['With her physic', 'Will keep one sick'], ['A man of law', 'That never saw'], ['The ways to buy and sell', 'I pray God speed him well'], ['Weening to rise', 'By merchandise'], ['A merchant eke', 'That will go seek'], ['By all the means he may', 'His money clean away'], ['To fa

In [3]:
!pip install python-Levenshtein
!pip install g2p_en
import itertools
import Levenshtein
import heapq
from g2p_en import G2p
import nltk
from nltk.corpus import cmudict
import numpy as np

nltk.download('cmudict')
phoneme_dict = cmudict.dict()

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.23.0 (from python-Levenshtein)
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-Levenshtein)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.23.0 python-Levenshtein-0.23.0 rapidfuzz-3.5.2
Collecting g2p_en
  Downloading g2p_en-2.1.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [4]:
# G2P WORD TO PHONEME FUNCTIONS
def gp_word_to_phonemes(text):
    g2p = G2p()
    phonemes = g2p(text)
    return ' '.join(phonemes)

def gp_list_to_phonemes(words_list):
    phonemes_list = []
    for word in words_list:
        phonemes = gp_word_to_phonemes(word)
        phonemes_list.append(phonemes)

    return phonemes_list

In [5]:
# CMU DICT WORD TO PHONEME FUNCTIONS
def cmu_word_to_phonemes(word):
    word = word.lower()
    phonemes = phoneme_dict.get(word)

    if phonemes:
        # Choosing the first pronunciation variant
        return ' '.join(phonemes[0])
    else:
        return ""

def cmu_list_to_phonemes(words_list):
    phonemes_list = []

    for word in words_list:
        phonemes = cmu_word_to_phonemes(word)
        phonemes_list.append(phonemes)

    return phonemes_list

In [6]:
# see how many words are in actual dictionary

count = 0
phoneme_dict_keys = phoneme_dict.keys()
for word in vocab:
  if word in phoneme_dict_keys:
    count += 1

print("number of words in dictionary:", count)

number of words in dictionary: 14800


In [22]:
# create grapheme to phoneme dict for our vocab
for word in vocab:
  if word in phoneme_dict_keys:
    print(word)
    print(cmu_word_to_phonemes(word))
    break

for word in vocab:
  if word != "" and word not in phoneme_dict_keys:
    print(word)
    print(gp_word_to_phonemes(word))
    break

sweetly
S W IY1 T L IY0
faire-built
F AY1 R B W IH2 L T


In [None]:
from tqdm import tqdm

g_to_p_dict = {}

# map each word in vocab to its phoneme representation
for word in tqdm(vocab):
    if word in phoneme_dict_keys:
        g_to_p_dict[word] = cmu_word_to_phonemes(word)
    else:
        g_to_p_dict[word] = gp_word_to_phonemes(word)


  1%|          | 433/42917 [04:46<9:19:25,  1.27it/s]

In [None]:
import copy
phoneme_pairs = copy.deepcopy(grapheme_pairs)
for pair in phoneme_pairs:
  for line in pair:
    for word in line
  pass