In [1]:
#!/usr/bin/env python3

import sys
import os
import lxml.etree as etree
import pandas as pd

In [None]:
def XML_dialogue_concatenater(input_XML_file, output_XML_file):

    """Concatenate the dialogues of a Molière play into an XML file.

    Every non-empty <p> (prose) and <l> (verse) element found under
    <text>/<body> of the input play is appended to the <prose> and <verse>
    elements of the output document, respectively. An empty <end/> tag is
    then appended to both sections so that consecutive plays stay delimited,
    and the updated document is written back to 'output_XML_file'.
    Input XML files come from http://dramacode.github.io/moliere/

    Parameters:
        input_XML_file: path of a Molière play in XML format, TEI edition.
        output_XML_file: path of an existing XML file containing a <prose>
            and a <verse> element; it is overwritten with the result.

    Returns:
        output_XML_file, the path of the updated file.

    Raises:
        ValueError: if the input has no <text>/<body> element, or the
            output file has no <prose> or <verse> element.
    """

    # Parse straight from the paths: the parser then honours the encoding
    # given in each XML declaration instead of the platform's text default.
    input_root = etree.parse(input_XML_file).getroot()
    text = input_root.find('.//text')
    body = text.find('.//body') if text is not None else None
    if body is None:
        raise ValueError(
            "no <text>/<body> element found in %r" % (input_XML_file,))

    output_tree = etree.parse(output_XML_file)
    output_root = output_tree.getroot()

    # (tag collected from the play, section of the output receiving it)
    targets = (
        ('p', output_root.find('.//prose')),
        ('l', output_root.find('.//verse')),
    )
    for tag, section in targets:
        if section is None:
            raise ValueError(
                "output file %r lacks the section receiving <%s> elements"
                % (output_XML_file, tag))

    for tag, section in targets:
        # findall() returns a list, so re-parenting elements while looping
        # over it is safe.
        for element in body.findall('.//' + tag):
            # Skip empty elements: no child element and no visible text.
            if len(element) or (element.text or '').strip():
                section.append(element)
        # Empty tag indicating the end of the play; appended (not inserted
        # at index 0) so that it really follows the play's dialogues.
        section.append(etree.Element('end'))

    # Persist the modified tree; without this the output file never changes.
    output_tree.write(output_XML_file, encoding='utf-8', xml_declaration=True)

    return output_XML_file

In [None]:
# Concatenating dialogues of all plays
RAW_DIR = './XML_format/Raw'  # was '.XML_format/...': aligned with the output path below
OUTPUT_FILE = './XML_format/Precleaned/concatenated_dialogues.xml'
# These prose plays have a specific format (<s> tags) and are left out
EXCLUDED_PROSE_PLAYS = ('medecinvolant', 'fourberiesdescapin', 'avare')

for subfolder in ('Mixed', 'Prose', 'Verse'):
    folder = os.path.join(RAW_DIR, subfolder)
    # sorted() makes the concatenation order reproducible across runs
    for filename in sorted(os.listdir(folder)):
        if subfolder == 'Prose' and any(play in filename for play in EXCLUDED_PROSE_PLAYS):
            continue
        # os.listdir() yields bare names: prepend the folder to get a usable path
        XML_dialogue_concatenater(os.path.join(folder, filename), OUTPUT_FILE)