This is a compilation of two scripts, fixUnder and Rhyme_Markup. I put them together to see how long it would take to process the whole book (52 poems, about 30 seconds) and because checking a single file for errors is much easier than checking 52 files. I want to maintain the two different scripts for now because it will be easier to adapt them individually to other books and poets and to debug them separately. Dependencies are discussed in those scripts as well.  

In [6]:
import re, subprocess, string
import syllabify_ipa as sipa
from bs4 import BeautifulSoup

# Where the plain-text book is read from, and where the TEI result is written.
file = "texts/under.txt"
outfile = "texts/under-tei.txt"

# Counters used while reading the source text:
#   hashcount  - hash-mark lines seen since the last title (1 => next line is a title)
#   titlecount - number given to the next poem, in book order
#   linecount  - number given to the next line of the current poem
hashcount, titlecount, linecount = 0, 1, 1

# The poem currently being collected (None until the first title is read).
poemlist = None

# booklist holds every poem as a list of strings; booklist_xml holds the
# finished, marked-up version of each poem.
booklist, booklist_xml = [], []

def rhyme(wordA, wordB) :
    """Compare the endings of two transcribed words and describe their rhyme.

    Both arguments are sequences of phone strings that have already been
    transcribed. Stress marks are stripped from every phone, then the two
    sequences are walked from the end towards the start for as long as the
    phones are identical. Only exact matches count; near/slant rhyme and eye
    rhyme are not handled.

    Returns a three-item list ``[rhymingphones, rhymetype, group]``:
      * rhymingphones - the matching phones joined into one string
      * rhymetype     - the same stretch written as a V/C pattern
      * group         - 'none', 'strong', 'weak' or 'unknown', looked up from
                        the V/C pattern
    """

    # Phone inventories are only used for membership tests, so keep them as
    # sets. The vowel inventory comes from syllabify_ipa.
    vowel_phones = set(sipa.VOWELS)
    consonant_phones = {
        'p', 'b', 't', 'd', 'k', 'ɡ',
        'tʃ', 'dʒ',
        'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h',
        'm', 'n', 'ŋ',
        'l', 'ɹ', 'j', 'w',
    }

    def without_stress(phones) :
        # Stress is ignored for now so that stressed and unstressed versions
        # of the same phone compare as equal.
        return [phone.replace('ˈ', '').replace('ˌ', '') for phone in phones]

    tail_a = without_stress(wordA)
    tail_b = without_stress(wordB)

    matched = []   # phones shared by both endings, collected last-to-first
    shape = []     # 'V' / 'C' for each shared phone, in the same order

    # zip() stops at the end of the shorter word, so the shorter word sets
    # the limit of the comparison.
    for phone_a, phone_b in zip(reversed(tail_a), reversed(tail_b)) :
        if phone_a != phone_b :
            break  # the first mismatch ends the rhyme
        matched.append(phone_a)
        if phone_a in vowel_phones :
            shape.append('V')
        elif phone_a in consonant_phones :
            shape.append('C')
        else :
            print (f'Undocumented letter /{phone_a}/.')

    # Both lists were built back-to-front; flip them for human readability.
    rhymingphones = "".join(reversed(matched))
    rhymetype = "".join(reversed(shape))

    # V/C pattern -> rhyme group; any pattern not listed is 'unknown'.
    pattern_groups = {
        '': 'none', 'C': 'none', 'CC': 'none',
        'VC': 'strong', 'VCC': 'strong', 'VCCC': 'strong', 'CVC': 'strong',
        'CVCC': 'strong', 'CCVC': 'strong', 'CCVCC': 'strong',
        'V': 'weak', 'CV': 'weak', 'CCV': 'weak', 'CVCV': 'weak',
    }
    group = pattern_groups.get(rhymetype, 'unknown')

    return [rhymingphones, rhymetype, group]


# Read the source text and split it into poems. Each poem becomes a list of
# strings in booklist: the opening <lg type="poem"> tag, the title, then one
# entry per source line (numbered <l> elements, or "" for a blank line).
#
# Source layout: a line containing '#' announces that the next line is a
# title; a line consisting of '***' marks the end of the file.
with open (file, "r", encoding='utf-8-sig') as f:  # utf-8-sig discards a leading BOM
    for line in f:
        stripped = line.strip()  # remove the newline and surrounding spaces

        if stripped == '***' :   # end-of-source marker: store the last poem and stop
            if poemlist :
                booklist.append(poemlist)
            break

        if '#' in stripped :     # hash-mark line: the previous poem is complete
            if poemlist :
                booklist.append(poemlist)
                poemlist = None  # already stored; never append the same poem twice
            hashcount += 1       # hashcount 1 means the next line is a title
            continue

        if hashcount == 1 :      # the line right after a hash mark is the title
            title = stripped
            # the first entry of each poem carries the name-space declaration
            poemlist = [f'<lg xmlns="http://www.tei-c.org/ns/1.0" type="poem" n="{titlecount}">',
                        title]
            hashcount = 0
            titlecount += 1
            linecount = 1
            continue

        # Ordinary text only belongs to a poem once a title has been read.
        # Text before the first hash mark is ignored instead of crashing.
        if hashcount != 0 or poemlist is None :
            continue

        if stripped == "" :
            poemlist.append(stripped) # keep the empty line; it separates stanzas
        else :
            # Every non-blank line is kept, even one whose text equals the title.
            poemlist.append(f'<l n="{linecount}">{stripped}</l>')
            linecount += 1
    else :
        # The file ended without the '***' marker: still keep the last poem.
        if poemlist :
            booklist.append(poemlist)
                
# after making each poem into a list, we will detect and add the rhyme information and xml

# Letters that can label a rhyme. 'X' is left out because it marks unrhymed lines.
ab_list = [letter for letter in string.ascii_uppercase if letter != "X"]

# For every poem: wrap it in TEI stanza markup, transcribe each line with
# eSpeak, detect rhymes between line-final syllables within each stanza, and
# record the results as attributes on the <l> elements. The finished XML
# string of each poem is appended to booklist_xml.
for poem in booklist :
    print (f'Working on {poem[1]}')

    # first insert a basic TEI structure into each poem
    poem[1] = f'<head>{poem[1]}</head>' # alter title line by adding tags

    # stanzas have a blank line between them, so find the blank lines followed by text lines
    blanks = [index for index in range(len(poem)-1) if poem[index] == "" and poem[index+1] != ""]

    # The first such blank line opens the first stanza; every later one closes
    # the previous stanza and opens the next. Testing the position in `blanks`
    # (rather than a fixed list index) keeps the XML well formed however many
    # blank lines follow the title.
    for n, blank in enumerate(blanks) :
        if n == 0 :
            poem[blank] = f'<lg type="stanza" n="{n+1}">'
        else :
            poem[blank] = f'</lg><lg type="stanza" n="{n+1}">'

    # Close the last stanza and the poem. Trailing blank entries are removed
    # and the end tags appended, so no real line is overwritten when a poem
    # ends with fewer than three blank lines.
    while poem and poem[-1] == "" :
        poem.pop()
    if blanks :
        poem.append('</lg>')   # end of the last stanza (only if one was opened)
    poem.append('</lg>')       # end of the poem

    poem_str = "".join(poem)
    #print (poem_str)

    # Rhyme letters restart at 'A' for every poem but keep counting across
    # the stanzas of one poem.
    labels_used = 0

    # Have BeautifulSoup parse the xml string and create a tree. Alternately,
    # could save the poems to list and make the whole book into a single tree.

    soup = BeautifulSoup(poem_str, 'xml')  #parsing as lxml loses the <head> tag
    stanzas = soup.find_all(attrs={"type" : "stanza"}) #Get all the tags with type=stanza

    for stanza in stanzas :
        lastsyllables = []            # list of last syllables in each line of the stanza
        lines = stanza.find_all('l')  # get all the lines in this stanza

        for line in lines :
            target = line.text # get the text value using BS and run it through eSpeak
            cp = subprocess.run(['espeak', '-v', 'en-us', '-xq', '--ipa=3', target],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            phones = cp.stdout.decode("utf-8").strip() # get eSpeak results
            phones = re.sub("\r\n", "", phones) # remove any newlines in Windows
            words = phones.split(' ') # split the line into words on spaces

            # Syllabify the last word split into phones
            sylword = sipa.syllabify(words[-1].split('_'))

            # syllabify returns a list of lists [[onset],[nucleus],[coda]]. Codas but not onsets must fully
            # match in order to rhyme. I get the last syllable and push the sounds back together for now.
            lastsyllable = ' '.join(' '.join(''.join(p) for p in syl) for syl in sylword[-1])
            lastsyllable = lastsyllable.strip() # remove leading and trailing spaces
            lastsyllables.append(lastsyllable.split(' ')) # put the last syl on the list of last syls.

            phones = re.sub("_", "", phones) # remove underscores to make pretty print
            line.attrs['phon'] = phones #assigns a new attribute to the <l> for the transcription

        # each stanza will have its own set of rhymes and rhyme data. This decision can be changed
        # by removing the stanza loop and doing the whole poem at once. Right now the same rhyme in
        # different stanzas will get different letters.

        rime_dict = {}   # rime -> [cv, grp, line index, line index, ...]
        skip = []        # line indexes already matched as the second half of a rhyme
        size = len(lastsyllables) # should match num of lines in the stanza
        for i in range(size-1) :
            for j in range(i+1, size) :
                if j in skip : # skip any rhymes already found
                    continue
                [rime, cv, grp] = rhyme(lastsyllables[i], lastsyllables[j]) # invoke the subroutine
                if grp != 'none' :
                    skip.append(j) # when a rhyme is found, put it on the skip list
                    if rime not in rime_dict :
                        rime_dict[rime] = [cv,grp,i,j]
                    else:
                        rime_dict[rime].extend([i,j]) # this way all the rhymed lines are on one list
        #print(rime_dict)
        #print("Next stanza")

        # We have located all the rhymes. Now time to assign them a letter and put
        # them into the TEI markup

        completed_lines = set()  # lines that are marked up already, so we don't repeat them
        for l in range(size) :   # loop through the indexes of line numbers
            if l in completed_lines :
                continue

            # If a rhyme has been discovered, the line number will be in the list of values
            # associated with the rime. A list (not a set) keeps the keys in the order the
            # rimes were found, so the rime chosen below is the same on every run.
            rkeylist = [key for key, value in rime_dict.items() if l in value]

            if not rkeylist :    # If no rhyme is found, assign it the label X
                lines[l].attrs['rhyme'] = 'X'
                continue

            cv, grp, *found = rime_dict[rkeylist[0]] # everything after cv and grp is a line index
            found = sorted(set(found))               # a set eliminates duplicate indexes

            # Take the next unused letter. After the last letter the labels
            # continue as doubled letters (AA, BB, ...) instead of raising
            # IndexError on an exhausted list.
            repeat, position = divmod(labels_used, len(ab_list))
            next_let = ab_list[position] * (repeat + 1)
            labels_used += 1

            for fi in found :
                lines[fi].attrs['type'] = grp
                lines[fi].attrs['rhyme'] = next_let
                lines[fi].attrs['rime'] = rkeylist[0]
                lines[fi].attrs['vc_structure'] = cv
                completed_lines.add(fi)

    # Add this poem to the list of completed poems

    booklist_xml.append(str(soup))
    
# Write every finished poem to the output file, one pretty-printed XML
# tree after another.
with open(outfile, mode='w', encoding='utf-8') as sink :
    print (f'Now printing all the poems to {outfile}')
    for finished_poem in booklist_xml :
        # Re-parse the stored string so BeautifulSoup can indent it.
        pretty = BeautifulSoup(finished_poem, 'xml').prettify()
        sink.write(pretty + '\n')

#print (cp.stderr) # just in case there are espeak errors

Working on THE SKY
Working on THE CORNFIELD
Working on MILKING TIME
Working on IN MY PILLOW
Working on MISS KATE-MARIE
Working on THE WOODPECKER
Working on THE STAR--A Song
Working on THE BUTTERBEAN TENT
Working on BIG BROTHER
Working on MR. WELLS
Working on DICK AND WILL
Working on THE PILASTER
Working on FIREFLY--A Song
Working on LITTLE RAIN
Working on THE PULPIT
Working on ON THE HILL
Working on AUTUMN
Working on THE RABBIT
Working on CRESCENT MOON
Working on FATHER'S STORY
Working on CHRISTMAS MORNING
Working on PEOPLE GOING BY
Working on BABES IN THE WOODS
Working on THE PICNIC
Working on MUMPS
Working on THE CIRCUS
Working on STRANGE TREE
Working on THE BRANCH
Working on THE WORM
Working on A CHILD ASLEEP
Working on LITTLE BUSH--A Song
Working on AT THE WATER
Working on WATER NOISES
Working on AMONG THE RUSHES
Working on NUMBERS
Working on IN THE NIGHT
Working on THE PEOPLE
Working on THE GRANDMOTHER
Working on IN MARYLAND
Working on THE SUNDAY BONNET
Working on THE SUN AND A BI