In [2]:
#!/usr/bin/env python3

import os
import lxml.etree as etree
import pandas as pd
import string
from itertools import tee
import unicodedata

In [4]:
def XML_dialogue_concatenater(input_XML_file, output_XML_file, play_counter):

    """Append the dialogues of one Molière play to the concatenated XML file.

    The <p> (prose) and <l> (verse) elements found in the body of the play
    (TEI edition, XML format) are moved under a new <playN> element created
    inside the <prose> and <verse> sections of the output document, N being
    'play_counter'. Every attribute of the output document is then removed
    and the document is written back to 'output_XML_file'.
    Input XML files come from http://dramacode.github.io/moliere/

    Parameters: 'input_XML_file' path of a Molière play in XML format, TEI edition.
                'output_XML_file' path of an existing XML file holding a <prose>
                    and a <verse> element; it is overwritten with the updated
                    document.
                'play_counter' number used to name the <playN> elements.

    Return: None ('output_XML_file' is modified on disk).

    Raises: ValueError when the input has no <text>/<body> element or the
            output has no <prose> or <verse> element.
    """

    # Binary mode lets the XML parser decode each file according to its own
    # encoding declaration instead of the platform's default text encoding.
    # Both files are fully parsed and closed before the output is rewritten.
    with open(input_XML_file, 'rb') as f1, open(output_XML_file, 'rb') as f2:

        print('Opening file: ' + input_XML_file)

        input_tree = etree.parse(f1, etree.XMLParser(ns_clean=True, collect_ids=False))
        output_tree = etree.parse(f2, etree.XMLParser(ns_clean=True, collect_ids=False))

    # Locating the dialogues of the play
    input_root = input_tree.getroot()
    text = input_root.find('.//text')
    body = text.find('.//body') if text is not None else None
    if body is None:
        raise ValueError('No <text>/<body> element found in ' + input_XML_file)

    # Locating the two destination sections of the output document
    output_root = output_tree.getroot()
    prose = output_root.find('.//prose')
    verse = output_root.find('.//verse')
    if prose is None or verse is None:
        raise ValueError('No <prose> or <verse> element found in ' + output_XML_file)

    play_tag = 'play' + str(play_counter)
    current_prose_play = etree.SubElement(prose, play_tag)
    current_verse_play = etree.SubElement(verse, play_tag)

    # Moving <p> tags under root > prose, then <l> tags under root > verse.
    # Appending an lxml element to another tree moves it out of the input tree.
    for tag, destination in (('p', current_prose_play), ('l', current_verse_play)):
        empty_marker = ('<' + tag + '/>').encode('ascii')
        for element in body.findall('.//' + tag):
            # Skipping elements whose serialization contains an empty <p/> or <l/>
            if empty_marker not in etree.tostring(element):
                destination.append(element)

    # Deleting all tag attributes of the output tree. Restricting the walk to
    # real elements skips comments and processing instructions, which carry
    # no attributes.
    for element in output_root.iter(tag=etree.Element):
        element.attrib.clear()

    # Writing to the path given by the caller (it used to be hard-coded here)
    output_tree.write(output_XML_file, pretty_print=True, xml_declaration=True, encoding="utf-8")

In [None]:
# Concatenating dialogues of all plays

CONCATENATED_XML = './XML_format/Precleaned/concatenated_dialogues.xml'

# (source directory, filename fragments of the plays to skip)
PLAY_SOURCES = [
    ('./XML_format/Raw/Mixed/', []),
    ('./XML_format/Raw/Prose/', ['medecinvolant', 'fourberiesdescapin', 'avare']),  # Plays that have a specific format (<s> tags)
    ('./XML_format/Raw/Verse/', ['sganarelle']),  # Corrupted?
]

play_counter = 0

for source_dir, skipped_fragments in PLAY_SOURCES:
    # os.listdir() returns names in arbitrary order; sorting them keeps the
    # <playN> numbering identical from one run to the next.
    for filename in sorted(os.listdir(source_dir)):
        if any(fragment in filename for fragment in skipped_fragments):
            continue
        XML_dialogue_concatenater(source_dir + filename, CONCATENATED_XML, play_counter)
        play_counter += 1

In [29]:
# Creating a rhyme DataFrame


def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs.

    s -> (s0, s1), (s1, s2), (s2, s3), ...
    An iterable with fewer than two items yields no pair.
    """
    current_items, following_items = tee(iterable, 2)
    # Moving the second copy one step ahead of the first one
    next(following_items, None)
    return zip(current_items, following_items)


# Number of trailing letters compared, and how many must match to call it a rhyme
RHYME_SUFFIX_LENGTH = 3
MIN_MATCHING_LETTERS = 2

# ASCII punctuation to strip. '-' is deliberately absent so hyphenated words
# stay whole. Raw string: the backslash is one of the characters to remove.
PUNCTUATION_TABLE = str.maketrans('', '', r"""!"#$%&'()*+,./:;<=>?@[\]^_`{|}~""")


def _last_word(line_text):
    """Return the cleaned last word of a verse line, or None if it has none.

    The text is NFKC-normalized first, so that '\xa0' becomes a plain space
    and '…' becomes '...' (then stripped as punctuation) while accented
    letters stay single characters. It is then lowercased, stripped of
    punctuation and split on any whitespace.
    """
    if line_text is None:
        return None
    normalized = unicodedata.normalize("NFKC", line_text)
    words = normalized.lower().translate(PUNCTUATION_TABLE).split()
    return words[-1] if words else None


def _is_rhyme(word1, word2):
    """Tell whether enough of the last letters of both words are the same.

    Letters are compared from the end of each word, so words shorter than
    RHYME_SUFFIX_LENGTH are still aligned on their endings.
    """
    suffix1 = word1[-RHYME_SUFFIX_LENGTH:]
    suffix2 = word2[-RHYME_SUFFIX_LENGTH:]
    matching = sum(
        letter1 == letter2
        for letter1, letter2 in zip(reversed(suffix1), reversed(suffix2))
    )
    return matching >= MIN_MATCHING_LETTERS


# Binary mode lets the XML parser decode the file from its own declaration
with open('./XML_format/Precleaned/concatenated_dialogues.xml', 'rb') as f:
    tree = etree.parse(f, etree.XMLParser(ns_clean=True, collect_ids=False))

root = tree.getroot()
verse = root.find('.//verse')

rhymes = []
seen_pairs = set()

for e1, e2 in pairwise(verse.findall('.//l')):

    word1 = _last_word(e1.text)
    word2 = _last_word(e2.text)

    # Skipping lines without any text or made only of punctuation
    if word1 is None or word2 is None:
        continue

    # Deleting self-rhymes (tout, tout), (debout, debout), ...
    if word1 == word2:
        continue

    if not _is_rhyme(word1, word2):
        continue

    # Deleting duplications (tout, debout), (tout, debout), (debout, tout).
    # The pair is stored unordered so both orders share the same key.
    pair_key = frozenset((word1, word2))
    if pair_key in seen_pairs:
        continue
    seen_pairs.add(pair_key)
    rhymes.append([word1, word2])

# Converting into a CSV file. Naming the columns on the DataFrame itself keeps
# the header row valid even when no rhyme was found.
pd.DataFrame(rhymes, columns=['Word 1', 'Word 2']).to_csv('./Rhymes/rhymes_set_v2.csv', index=False)