In [63]:
#!/usr/bin/env python3

import os
import lxml.etree as etree

In [64]:
def XML_dialogue_concatenater(input_XML_file, output_XML_file, play_counter):

    """Append the dialogues of one Molière play to the concatenation file.

    The play (TEI edition, XML format) is parsed and its <p> (prose) and
    <l> (verse) elements are moved under a new <play{play_counter}> element
    placed, respectively, inside the <prose> and <verse> elements of the
    output document. Elements whose serialization contains an empty tag of
    the same name (<p/> or <l/>) are skipped.
    Input XML files come from http://dramacode.github.io/moliere/

    Parameters:
        input_XML_file: path of the Molière play in XML format, TEI edition.
        output_XML_file: path of the file containing the concatenated
            dialogues. It must already exist and contain a <prose> and a
            <verse> element; it is rewritten in place.
        play_counter: value used to name the per-play container element
            ('play' + str(play_counter)).

    Returns:
        None. The updated document is written to output_XML_file.

    Raises:
        ValueError: if the input has no <text>/<body> element, or the output
            has no <prose> or <verse> element.
    """

    # Binary mode lets lxml decode each document according to its own XML
    # declaration instead of the locale's default text encoding. Both handles
    # are closed before the output file is rewritten below.
    with open(input_XML_file, 'rb') as f1, open(output_XML_file, 'rb') as f2:

        print('Opening file: ' + input_XML_file)

        # Parsing input and output files into Element trees
        input_tree = etree.parse(f1, etree.XMLParser(ns_clean=True, collect_ids=False))
        output_tree = etree.parse(f2, etree.XMLParser(ns_clean=True, collect_ids=False))

    # Finding useful tags, failing with an explicit message when one is missing
    text = input_tree.getroot().find('.//text')
    body = text.find('.//body') if text is not None else None
    if body is None:
        raise ValueError('No <text>/<body> element found in ' + input_XML_file)

    output_root = output_tree.getroot()
    prose = output_root.find('.//prose')
    verse = output_root.find('.//verse')
    if prose is None or verse is None:
        raise ValueError('No <prose> and/or <verse> element found in ' + output_XML_file)

    # One container per play in each section (root > prose / root > verse)
    current_prose_play = etree.SubElement(prose, 'play' + str(play_counter))
    current_verse_play = etree.SubElement(verse, 'play' + str(play_counter))

    # Adding <p> tags under prose, then <l> tags under verse.
    # Appending an element to the output tree moves it out of the input tree.
    for tag, destination in (('p', current_prose_play), ('l', current_verse_play)):
        empty_marker = ('<' + tag + '/>').encode('ascii')
        for element in body.findall('.//' + tag):
            # Skip elements that are (or contain) an empty <tag/>
            if empty_marker not in etree.tostring(element):
                destination.append(element)

    # Write to the file that was given as parameter, not to a fixed path
    output_tree.write(output_XML_file, pretty_print=True, xml_declaration=True, encoding="utf-8")

In [None]:
# Concatenating dialogues of all plays.
# XML_dialogue_concatenater reads the output file before rewriting it, so the
# file must already exist when this cell is run.
CONCATENATED_FILE = './XML_format/Precleaned/concatenated_dialogues.xml'

# (input directory, filename fragments identifying the plays to skip)
PLAY_SOURCES = [
    ('./XML_format/Raw/Mixed/', []),
    ('./XML_format/Raw/Prose/', ['medecinvolant', 'fourberiesdescapin', 'avare']),  # These plays have a specific format (<s> tags)
    ('./XML_format/Raw/Verse/', ['sganarelle']),  # Corrupted?
]

play_counter = 0

for directory, skipped in PLAY_SOURCES:
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # play numbering identical from one run to the next.
    for filename in sorted(os.listdir(directory)):
        # Ignore stray entries (checkpoint folders, OS metadata files, ...)
        if not filename.lower().endswith('.xml'):
            continue
        if any(x in filename for x in skipped):
            continue
        XML_dialogue_concatenater(directory + filename, CONCATENATED_FILE, play_counter)
        play_counter += 1

In [None]:
# Cleaning concatenated_dialogues.xml: every attribute of every element is removed.

concatenated_path = './XML_format/Precleaned/concatenated_dialogues.xml'

# Binary mode lets lxml pick the encoding from the XML declaration; the handle
# is closed before the same file is overwritten below.
with open(concatenated_path, 'rb') as f:
    tree = etree.parse(f, etree.XMLParser(ns_clean=True, collect_ids=False))

root = tree.getroot()

# iter(etree.Element) replaces the deprecated getiterator() and visits real
# elements only, skipping comment / processing-instruction nodes.
for element in root.iter(etree.Element):
    element.attrib.clear()

tree.write(concatenated_path, pretty_print=True, xml_declaration=True, encoding="utf-8")