In [1]:
import os
import re
import glob

from lxml import etree
from tqdm.auto import tqdm

from xml_ops import remove_namespaces
from xml_ops import extract_text_from_sisutext

## I. Extract text from XML files

We simply ignore the higher-level structure of a document and extract text from all `<sisuTekst>` elements.
This should be enough for a start.


In [2]:
# Convert every state-law XML source file into a plain-text file holding the
# concatenated text of its <sisuTekst> elements.
INPUT_DIR = 'results/xml_sources/state_laws/'
OUTPUT_DIR = 'results/txt_sources/state_laws/'

# Create the target directory up front so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for input_file in tqdm(glob.glob(f'{INPUT_DIR}*.xml')):

    xml_doc = remove_namespaces(etree.parse(input_file))
    # basename() copes with whichever path separator glob produced on this platform.
    output_file = OUTPUT_DIR + re.sub('xml$', 'txt', os.path.basename(input_file))

    content = xml_doc.xpath('/oigusakt/sisu')
    assert len(content) == 1, 'Expecting a single <sisu> node'
    content = content[0]

    # Some documents are empty and reference another document
    if len(content) == 0:
        continue

    # NOTE(review): '//sisuTekst' is an absolute XPath, so lxml evaluates it against
    # the whole document rather than only below <sisu>. Kept as-is to preserve the
    # current output; switch to './/sisuTekst' if only <sisu> descendants are wanted.
    st_blocks = content.xpath('//sisuTekst')

    document_text = '\n\n\n\n'.join(extract_text_from_sisutext(st_block) for st_block in st_blocks)
    # Text directly following a <sisuTekst> element would be silently lost, so insist
    # that every tail is empty or whitespace-only.
    assert all(re.match(r'^\s*$', st_block.tail if st_block.tail else '') is not None for st_block in st_blocks), 'Unexpected mixed content'

    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(output_file, 'wt', encoding='utf-8') as output:
        output.write(document_text)

  0%|          | 0/22 [00:00<?, ?it/s]

In [3]:
# Convert every local-government-act XML source file into a plain-text file holding
# the concatenated text of its <sisuTekst> elements.
INPUT_DIR = 'results/xml_sources/local_government_acts/'
OUTPUT_DIR = 'results/txt_sources/local_government_acts/'

# Create the target directory up front so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for input_file in tqdm(glob.glob(f'{INPUT_DIR}*.xml')):

    xml_doc = remove_namespaces(etree.parse(input_file))
    # basename() copes with whichever path separator glob produced on this platform.
    output_file = OUTPUT_DIR + re.sub('xml$', 'txt', os.path.basename(input_file))

    content = xml_doc.xpath('/oigusakt/sisu')
    assert len(content) == 1, 'Expecting a single <sisu> node'
    content = content[0]

    # Some documents are empty and reference another document
    if len(content) == 0:
        continue

    # NOTE(review): '//sisuTekst' is an absolute XPath, so lxml evaluates it against
    # the whole document rather than only below <sisu>. Kept as-is to preserve the
    # current output; switch to './/sisuTekst' if only <sisu> descendants are wanted.
    st_blocks = content.xpath('//sisuTekst')

    document_text = '\n\n\n\n'.join(extract_text_from_sisutext(st_block) for st_block in st_blocks)
    # Text directly following a <sisuTekst> element would be silently lost, so insist
    # that every tail is empty or whitespace-only.
    assert all(re.match(r'^\s*$', st_block.tail if st_block.tail else '') is not None for st_block in st_blocks), 'Unexpected mixed content'

    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(output_file, 'wt', encoding='utf-8') as output:
        output.write(document_text)

  0%|          | 0/22 [00:00<?, ?it/s]

In [4]:
# Convert every government-regulation XML source file into a plain-text file holding
# the concatenated text of its <sisuTekst> elements.
INPUT_DIR = 'results/xml_sources/government_regulations/'
OUTPUT_DIR = 'results/txt_sources/government_regulations/'

# Create the target directory up front so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for input_file in tqdm(glob.glob(f'{INPUT_DIR}*.xml')):

    xml_doc = remove_namespaces(etree.parse(input_file))
    # basename() copes with whichever path separator glob produced on this platform.
    output_file = OUTPUT_DIR + re.sub('xml$', 'txt', os.path.basename(input_file))

    content = xml_doc.xpath('/oigusakt/sisu')
    assert len(content) == 1, 'Expecting a single <sisu> node'
    content = content[0]

    # Some documents are empty and reference another document
    if len(content) == 0:
        continue

    # NOTE(review): '//sisuTekst' is an absolute XPath, so lxml evaluates it against
    # the whole document rather than only below <sisu>. Kept as-is to preserve the
    # current output; switch to './/sisuTekst' if only <sisu> descendants are wanted.
    st_blocks = content.xpath('//sisuTekst')

    document_text = '\n\n\n\n'.join(extract_text_from_sisutext(st_block) for st_block in st_blocks)
    # Text directly following a <sisuTekst> element would be silently lost, so insist
    # that every tail is empty or whitespace-only.
    assert all(re.match(r'^\s*$', st_block.tail if st_block.tail else '') is not None for st_block in st_blocks), 'Unexpected mixed content'

    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(output_file, 'wt', encoding='utf-8') as output:
        output.write(document_text)

  0%|          | 0/22 [00:00<?, ?it/s]

In [5]:
# Convert every government-order XML source file into a plain-text file holding the
# concatenated text of its <sisuTekst> elements.
INPUT_DIR = 'results/xml_sources/government_orders/'
OUTPUT_DIR = 'results/txt_sources/government_orders/'

# Create the target directory up front so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for input_file in tqdm(glob.glob(f'{INPUT_DIR}*.xml')):

    xml_doc = remove_namespaces(etree.parse(input_file))
    # basename() copes with whichever path separator glob produced on this platform.
    output_file = OUTPUT_DIR + re.sub('xml$', 'txt', os.path.basename(input_file))

    content = xml_doc.xpath('/oigusakt/sisu')
    assert len(content) == 1, 'Expecting a single <sisu> node'
    content = content[0]

    # Some documents are empty and reference another document
    if len(content) == 0:
        continue

    # NOTE(review): '//sisuTekst' is an absolute XPath, so lxml evaluates it against
    # the whole document rather than only below <sisu>. Kept as-is to preserve the
    # current output; switch to './/sisuTekst' if only <sisu> descendants are wanted.
    st_blocks = content.xpath('//sisuTekst')

    document_text = '\n\n\n\n'.join(extract_text_from_sisutext(st_block) for st_block in st_blocks)
    # Text directly following a <sisuTekst> element would be silently lost, so insist
    # that every tail is empty or whitespace-only.
    assert all(re.match(r'^\s*$', st_block.tail if st_block.tail else '') is not None for st_block in st_blocks), 'Unexpected mixed content'

    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open(output_file, 'wt', encoding='utf-8') as output:
        output.write(document_text)

  0%|          | 0/22 [00:00<?, ?it/s]