In [5]:
# Requirements

import os
import re
import shutil
import stat
import urllib
import urllib.request
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime as date

import requests
from bs4 import BeautifulSoup as bs

In [6]:
# shared settings: remote endpoints, switches and data folders

# root domain of German parliament website
root = 'https://www.bundestag.de'
# path and query string of the ajax request, appended to the root domain
asin = (
    '/ajax/filterlist/de/services/opendata/488214-488214'
    '?limit=10&noFilterSet=true'
)

# truthy value: empty the xml/txt folders and download the raw files again
REFRESH_RAW_DATA = 1

# data directory paths, one folder per file format
XML_PATH = './data/xml/'
TXT_PATH = './data/txt/'
JSON_PATH = './data/json/'

In [None]:
# utility functions to clean a directory

def _remove_readonly(fn, path_, excinfo):
    """Error callback for ``shutil.rmtree(..., onerror=...)``.

    Makes the offending entry writable and retries the removal that failed.

    Parameters:
        fn: the function that raised inside ``rmtree``.
        path_: the path that was passed to ``fn``.
        excinfo: the ``sys.exc_info()`` tuple handed over by ``rmtree``
            (a bare exception instance is accepted as well).

    Raises:
        The original exception when ``fn`` is not a removal function this
        handler knows how to retry, so failures are not silently swallowed.
    """
    if fn is os.rmdir:
        os.chmod(path_, stat.S_IWRITE)
        os.rmdir(path_)
    elif fn in (os.remove, os.unlink):
        # Python 3's rmtree reports file failures with os.unlink, which is a
        # different object than os.remove, so both have to be accepted.
        if hasattr(os, 'lchmod'):
            # lchmod never follows symlinks, but is missing on some platforms
            os.lchmod(path_, stat.S_IWRITE)
        elif not os.path.islink(path_):
            # without lchmod, leave symlinks alone so the link target's
            # permissions are not changed by accident
            os.chmod(path_, stat.S_IWRITE)
        os.remove(path_)
    else:
        # unknown failure (listing, stat, ...): surface it to the caller
        exc = excinfo[1] if isinstance(excinfo, tuple) else excinfo
        raise exc

def force_remove_file_or_symlink(path_):
    """Delete a file or symlink, retrying once after making it writable.

    Parameters:
        path_: path of the file or symbolic link to delete.

    Raises:
        FileNotFoundError: when ``path_`` does not exist (no retry is made).
        OSError: when the removal still fails after the permission change.
    """
    try:
        os.remove(path_)
    except FileNotFoundError:
        # nothing to retry: changing permissions cannot help here
        raise
    except OSError:
        if hasattr(os, 'lchmod'):
            # lchmod never follows symlinks, but is missing on some platforms
            os.lchmod(path_, stat.S_IWRITE)
        elif not os.path.islink(path_):
            # without lchmod, leave symlinks alone so the link target's
            # permissions are not changed by accident
            os.chmod(path_, stat.S_IWRITE)
        os.remove(path_)

def is_regular_dir(path_):
    """Tell whether ``path_`` itself is a directory.

    The check uses ``os.lstat``, so a symbolic link is inspected as a link
    and not as the thing it points to. A path that cannot be stat-ed counts
    as "not a directory".
    """
    try:
        info = os.lstat(path_)
    except os.error:
        return False
    return stat.S_ISDIR(info.st_mode)

def clear_dir(path_):
    """Delete everything inside ``path_`` while keeping the folder itself.

    Sub-directories are removed recursively with ``shutil.rmtree``; every
    other entry is removed with ``force_remove_file_or_symlink``.

    Raises:
        OSError: when ``path_`` is not a directory according to
            ``is_regular_dir``.
    """
    if not is_regular_dir(path_):
        # Refuse anything that is not a real directory, so the content of a
        # symbolic linked directory is never cleared by accident.
        raise OSError("Cannot call clear_dir() on a symbolic link")

    for entry in os.listdir(path_):
        entry_path = os.path.join(path_, entry)
        if not is_regular_dir(entry_path):
            force_remove_file_or_symlink(entry_path)
            continue
        shutil.rmtree(entry_path, onerror=_remove_readonly)


In [None]:
# Optionally rebuild the raw data: empty the xml/txt folders, then download
# and unpack every plenary zip archive linked from the open-data listing.
if REFRESH_RAW_DATA:
    # `data_dir` instead of `dir`: a notebook loop variable stays in the
    # kernel namespace and would shadow the builtin dir() afterwards
    for data_dir in [XML_PATH, TXT_PATH]:
        # make sure the folder exists, so a first run does not fail on listdir
        os.makedirs(data_dir, exist_ok=True)
        # clear directories first
        if os.listdir(data_dir):
            clear_dir(data_dir)
            print('%s cleared. Continue.' % data_dir)
        else:
            print('%s is already empty. Continue.' % data_dir)

    # HTTP request; the timeout (seconds) keeps the cell from hanging forever
    page = requests.get(root + asin, timeout=60)
    # stop here on an HTTP error instead of parsing an error page
    page.raise_for_status()
    # assign BeautifulSoup object on response content; the parser is named
    # explicitly so the result does not depend on which parsers are installed
    soup = bs(page.content, 'html.parser')

    # loop all anchor with this class
    for a in soup.find_all(attrs={'class': 'bt-link-dokument'}):
        # retrieve plenary zip file from hrefs (stored as a temporary file)
        zipcall, _ = urllib.request.urlretrieve(root + a['href'])
        try:
            # extract all xml files of this legislature into raw folder
            with zipfile.ZipFile(zipcall, "r") as f:
                f.extractall(XML_PATH)
        finally:
            # the downloaded archive is no longer needed once unpacked
            os.remove(zipcall)


In [9]:
# Convert every plenary .xml in the raw folder into a cleaned .txt file that
# is named after the plenary date.

def _clean_plenary_text(text):
    """Strip header lines from a plenary body and re-flow its paragraphs.

    Steps, in order: blank out header lines, hide paragraph breaks (exactly
    two new-lines) behind a placeholder, join the remaining wrapped lines
    (dropping a single trailing hyphen), restore the paragraph breaks, and
    replace runs of three or more new-lines with one space.

    Parameters:
        text: raw content of a TEXT element.

    Returns:
        The cleaned text as a string.
    """
    # pre-processing document headers
    # NOTE(review): the alternation `[^-$]|[^\u2014$]` accepts any character
    # except '$' -- confirm whether `[^-\u2014]` was intended.
    cleaned = re.sub(r'^(.*Deutscher Bundestag )([^-$]|[^\u2014$]).*$|^\([A-Z]{1}\).*$', '', text, flags=re.M)
    # insert a pattern for double new-lines
    cleaned = re.sub(r'((?<!\n)\n{2}(?!\n))', 'LLLLL', cleaned, flags=re.M)
    # de-hyphen and wrap paragraphs
    cleaned = re.sub(r'(^.*[^-]{1})([-]{1}$|)(\n)', r'\1', cleaned, flags=re.M)
    # substitute pattern with double new-lines
    cleaned = re.sub(r'(LLLLL)', r'\n\n', cleaned, flags=re.M)
    # substitute more than three new lines with whitespace
    # probably it is the page header removed before
    cleaned = re.sub(r'(\n{3,})', ' ', cleaned, flags=re.M)
    return cleaned


# iterate over all the entries in the folder
for filename in os.listdir(XML_PATH):
    xml_file = os.path.join(XML_PATH, filename)

    # work on just .xmls and remove everything else
    if not filename.endswith('.xml'):
        os.remove(xml_file)
        continue

    # element tree XML parsing : get root
    # (named `xml_root` so the `root` url string used by the download cell
    # is not overwritten)
    xml_root = ET.parse(xml_file).getroot()

    # TEXT tag first to acquire the plenary body; reset per file so a document
    # without TEXT never re-uses the body of the previous one
    body = None
    for child in xml_root.iter('TEXT'):
        if child.text is not None:
            body = _clean_plenary_text(child.text)

    if body is None:
        print('%s has no TEXT content. Skipped.' % filename)
        continue

    # DATUM as the date of the plenary for file renaming
    for child in xml_root.iter('DATUM'):
        plenaryDate = date.strptime(child.text, '%d.%m.%Y').strftime('%Y-%m-%d')
        # explicit encoding: the output must not depend on the platform default
        with open(os.path.join(TXT_PATH, plenaryDate + '.txt'), 'w', encoding='utf-8') as textfile:
            textfile.write(body)


In [None]:
# Done with the .txt files.
# Now we start with a single file and sort out the JSON.

