In [36]:
#!/usr/bin/env python3

import os
import lxml.etree as etree

In [37]:
def XML_dialogue_concatenater(input_XML_file, output_XML_file):

    """Concatenate dialogues a Moliere's play
    
    This function concatenate dialogues of a Molière's play (TEI edition,
    XML format) into an XML file.
    The output XML separates prose from verse dialogues.
    Input XML files come from http://dramacode.github.io/moliere/
    
    Parameter: 'input_XML_file' Molière's play in XML format, TEI edition.
               'output_XML_file' file containing the concatenated dialogues.
    
    Retrun: output_XML_file modified.
    """

    with open(input_XML_file) as f1, open(output_XML_file) as f2:

            print('Opening file: ' + input_XML_file)

            # Parsing input and output files into Element trees and finding useful tags
            input_tree = etree.parse(f1, etree.XMLParser(ns_clean=True, collect_ids=False))
            input_root = input_tree.getroot()
            body = input_root.find('.//text').find('.//body')

            output_tree = etree.parse(f2, etree.XMLParser(ns_clean=True, collect_ids=False))
            output_root = output_tree.getroot()
            prose = output_root.find('.//prose')
            verse = output_root.find('.//verse')

            # Adding <p> tags into the output file (root > prose)
            for element in body.findall('.//p'):
                    if ('<p/>') not in str(etree.tostring(element)):
                            prose.append(element)
            prose.insert(0, etree.Element('end')) # Empty tag indicating the end of the play

            # Adding <l> tags into the output file (root > verse)
            for element in body.findall('.//l'):
                    if ('<l/>') not in str(etree.tostring(element)):
                            verse.append(element)
            verse.insert(0, etree.Element('end')) # Empty tag indicating the end of the play
            
            output_tree.write('./XML_format/Precleaned/concatenated_dialogues.xml', pretty_print=True, xml_declaration=True, encoding="utf-8")

In [41]:
count = 0

# Concatenating dialogues of all plays
for filename in os.listdir('./XML_format/Raw/Mixed'):
    XML_dialogue_concatenater('./XML_format/Raw/Mixed/' + filename, './XML_format/Precleaned/concatenated_dialogues.xml')
    count += 1
    print(count)

for filename in os.listdir('./XML_format/Raw/Prose'):
    if not any(x in filename for x in ['medecinvolant', 'fourberiesdescapin', 'avare']): # These plays have a specific format (<s> tags)
        XML_dialogue_concatenater('./XML_format/Raw/Prose/' + filename, './XML_format/Precleaned/concatenated_dialogues.xml')
        count += 1
        print(count)

for filename in os.listdir('./XML_format/Raw/Verse'):
    if not any(x in filename for x in ['sganarelle']): # Corrupted?
        XML_dialogue_concatenater('./XML_format/Raw/Verse/' + filename, './XML_format/Precleaned/concatenated_dialogues.xml')
        count += 1
        print(count)

Opening file: ./XML_format/Raw/Mixed/moliere_princessedelide.xml
1
Opening file: ./XML_format/Raw/Prose/moliere_amourmedecin.xml
2
Opening file: ./XML_format/Raw/Prose/moliere_amantsmagnifiques.xml
3
Opening file: ./XML_format/Raw/Prose/moliere_medecinmalgrelui.xml
4
Opening file: ./XML_format/Raw/Prose/moliere_precieusesridicules.xml
5
Opening file: ./XML_format/Raw/Prose/moliere_monsieurpourceaugnac.xml
6
Opening file: ./XML_format/Raw/Prose/moliere_impromptuversailles.xml
7
Opening file: ./XML_format/Raw/Prose/moliere_critiqueecoledesfemmes.xml
8
Opening file: ./XML_format/Raw/Prose/moliere_mariageforce.xml
9
Opening file: ./XML_format/Raw/Prose/moliere_jalousiedubarbouille.xml
10
Opening file: ./XML_format/Raw/Prose/moliere_sicilien.xml
11
Opening file: ./XML_format/Raw/Prose/moliere_maladeimaginaire.xml
12
Opening file: ./XML_format/Raw/Prose/moliere_comtesseescarbagnas.xml
13
Opening file: ./XML_format/Raw/Prose/moliere_bourgeoisgentilhomme.xml
14
Opening file: ./XML_format/Raw/P