In [7]:
# Requirements

import json
import os
import re
import shutil
import stat
import urllib
import urllib.request
import xml.etree.ElementTree as ET
import zipfile
from datetime import datetime as date

import requests
from bs4 import BeautifulSoup as bs
from normality import normalize

In [8]:
# utility strings and flags

# data directories, written with a trailing slash because the cells below
# build file paths by plain string concatenation (e.g. XML_PATH + filename)
XML_PATH  = './data/xml/'
TXT_PATH  = './data/txt/'
JSON_PATH = './data/json/'

# flag to clear folders: when truthy, the refresh cell empties XML_PATH and
# TXT_PATH and downloads the raw data again
REFRESH_RAW_DATA = 1

# root domain of German parliament website
ROOT = 'https://www.bundestag.de'
# ajax request url: path and query string that are appended to ROOT
# NOTE(review): 'limit=10' presumably caps the number of returned list
# entries - confirm against the endpoint
ASIN = '/ajax/filterlist/de/services/opendata/488214-488214?limit=10&noFilterSet=true'

In [None]:
# utility functions to clean a directory

def _remove_readonly(fn, path_, excinfo):
    # Handle read-only files and directories
    if fn is os.rmdir:
        os.chmod(path_, stat.S_IWRITE)
        os.rmdir(path_)
    elif fn is os.remove:
        os.lchmod(path_, stat.S_IWRITE)
        os.remove(path_)

def force_remove_file_or_symlink(path_):
    """Delete a file or symlink, retrying once after making it writable.

    Args:
        path_: path of the file or symbolic link to delete.

    Raises:
        OSError: if the entry cannot be removed even after the permission
            change (this includes a path that does not exist).
    """
    try:
        os.remove(path_)
    except OSError:
        # os.lchmod is missing on several platforms (e.g. Linux); fall back to
        # os.chmod, but never through a symlink, to avoid touching its target.
        lchmod = getattr(os, 'lchmod', None)
        if lchmod is not None:
            lchmod(path_, stat.S_IWRITE)
        elif not os.path.islink(path_):
            os.chmod(path_, stat.S_IWRITE)
        os.remove(path_)

def is_regular_dir(path_):
    """Return True when ``path_`` itself is a directory.

    ``os.lstat`` does not follow symbolic links, so a link pointing at a
    directory is reported as False. A path that cannot be inspected is
    reported as False as well.
    """
    try:
        return stat.S_ISDIR(os.lstat(path_).st_mode)
    except OSError:
        return False

def clear_dir(path_):
    """Remove everything inside the directory ``path_``, keeping the directory.

    Raises:
        OSError: if ``path_`` is not a real directory (a file, a symlink or a
            missing path), to avoid accidentally clearing the content of a
            symbolic linked directory.
    """
    if not is_regular_dir(path_):
        raise OSError("Cannot call clear_dir() on a symbolic link")

    for entry in os.listdir(path_):
        entry_path = os.path.join(path_, entry)
        if not is_regular_dir(entry_path):
            # plain file or symlink
            force_remove_file_or_symlink(entry_path)
            continue
        # real sub-directory: delete it recursively, retrying read-only entries
        shutil.rmtree(entry_path, onerror=_remove_readonly)


In [None]:
if REFRESH_RAW_DATA:
    for data_dir in [XML_PATH, TXT_PATH]:
        # create the directory on a fresh checkout so that listing it cannot fail
        os.makedirs(data_dir, exist_ok=True)
        # clear directories first
        if os.listdir(data_dir):
            clear_dir(data_dir)
            print('%s cleared. Continue.' % data_dir)
        else:
            print('%s is already empty. Continue.' % data_dir)

    # HTTP request; stop here on an HTTP error instead of parsing an error page
    page = requests.get(ROOT + ASIN)
    page.raise_for_status()
    # assign BeautifulSoup object on response content; the parser is named
    # explicitly so the result does not depend on which parsers are installed
    soup = bs(page.content, 'html.parser')

    # loop all anchor with this class
    for a in soup.find_all(attrs={'class': 'bt-link-dokument'}):
        # retrieve plenary zip file from hrefs; ROOT is the site root constant
        # (the lower-case name 'root' is only bound later, to an XML element)
        zipcall, _ = urllib.request.urlretrieve(ROOT + a['href'])
        try:
            # extract all xml files of this legislature into raw folder
            with zipfile.ZipFile(zipcall, "r") as f:
                f.extractall(XML_PATH)
        finally:
            # urlretrieve stores the download in a temporary file; drop it
            os.remove(zipcall)


In [9]:
# iterate over all the entries of the XML folder
for filename in os.listdir(XML_PATH):

    # work on just .xmls and remove everything else
    if not filename.endswith('.xml'):
        os.remove(XML_PATH + filename)
        continue

    # element tree XML parsing : get root
    root = ET.parse(XML_PATH + filename).getroot()

    # reset per file, so a protocol without TEXT can never be written with the
    # body of the previously processed file
    body = None

    # TEXT tag first to acquire the plenary body and split into lines
    for child in root.iter('TEXT'):
        # an empty element has text None, which re.sub cannot process
        raw_text = child.text or ''
        # pre-processing document headers
        body = re.sub(r'^(.*Deutscher Bundestag )([^-$]|[^\u2014$]).*$|^\([A-Z]{1}\).*$', '\n\n', raw_text, count=0, flags=re.M)
        # insert a pattern for double new-lines
        body = re.sub(r'((?<!\n)\n{2}(?!\n))', 'LLLLL', body, count=0, flags=re.M)
        # de-hyphen and wrap paragraphs
        body = re.sub(r'(^.*[^-]{1})([-]{1}$|)(\n)', r'\1', body, count=0, flags=re.M)
        # substitute pattern with double new-lines
        body = re.sub(r'(LLLLL)', r'\n\n', body, count=0, flags=re.M)
        # substitute more than three new lines with whitespace
        # probably it is the page header removed before
        body = re.sub(r'(\n{3,})', ' ', body, count=0, flags=re.M)

    if body is None:
        print('%s has no TEXT element. Skipped.' % filename)
        continue

    # DATUM as the date of the plenary for file renaming
    for child in root.iter('DATUM'):
        plenaryDate = date.strptime(child.text, '%d.%m.%Y').strftime('%Y-%m-%d')
        # write UTF-8 explicitly: the transcripts are decoded as UTF-8 when
        # they are read back, whatever the locale default encoding is
        with open(TXT_PATH + plenaryDate + '.txt', 'w', encoding='utf-8') as textfile:
            textfile.write(body)


In [None]:
# done with the .txt files
# now we start with a single file and sort out the JSON

In [9]:
# regular expressions and word lists used by the transcript parser
# (patterns are raw strings so that escapes such as \d reach the regex engine
# unchanged instead of being invalid Python string escapes)

# titles that mark the speaker as chairing the session
CHAIRS = [u'Vizepräsidentin', u'Vizepräsident', u'Präsident',
          u'Präsidentin', u'Alterspräsident', u'Alterspräsidentin']

# a line containing one of these is never treated as a speaker line
SPEAKER_STOPWORDS = ['ich zitiere', 'zitieren', 'Zitat', 'zitiert',
                     'ich rufe den', 'ich rufe die',
                     'wir kommen zur Frage', 'kommen wir zu Frage', 'bei Frage',
                     'fordert', 'fordern', u'Ich möchte',
                     'Darin steht', ' Aspekte ', ' Punkte ', 'Berichtszeitraum']

BEGIN_MARK = re.compile(r'Beginn: [X\d]{1,2}.\d{1,2} Uhr')
END_MARK = re.compile(
    r'(\(Schluss:.\d{1,2}.\d{1,2}.Uhr\).*|Schluss der Sitzung)')
SPEAKER_MARK = re.compile(r'  (.{5,140}):\s*$')
TOP_MARK = re.compile(
    '.*(rufe.*die Frage|zur Frage|der Tagesordnung|Tagesordnungspunkt|Zusatzpunkt).*')
POI_MARK = re.compile(r'\((.*)\)\s*$', re.M)
WRITING_BEGIN = re.compile('.*werden die Reden zu Protokoll genommen.*')
# The printed-matter number is written with \d (the former 'd{2}' only matched
# the literal letters 'dd'). Both dash forms are accepted because clean_text()
# turns en dashes into '-' before the lines reach the parser.
WRITING_END = re.compile(
    r'(^Tagesordnungspunkt .*:\s*$|[–-] Drucksache \d{2}/\d{2,6} [–-].*|^Ich schließe die Aussprache.$)')

FP_REMOVE = re.compile(r'(^.*Dr.?( h.? ?c.?)?| (von( der)?)| [A-Z]\. )')

# Alternatives are tried left to right, so every longer title must come before
# its own prefix ('Präsidentin' before 'Präsident'); otherwise a stray 'in' is
# left at the start of the cleaned name.
_NAME_REMOVE_PARTS = [r'\[.*\]|\(.*\)', u'( de[sr])? Abg.? ',
                      u'Vizepräsidentin', u'Vizepräsident',
                      u'Alterspräsidentin', u'Alterspräsident',
                      u'Präsidentin', u'Präsident',
                      u'Liedvortrag', u'Bundeskanzler(in)?', r', Parl\. .*',
                      u', Staatsmin.*', u', Bundesmin.*', u', Ministe.*',
                      u'Staatsministers', 'Bundesministers',
                      u'Parl. Staatssekretärin',
                      u'Ge ?genruf', 'Weiterer Zuruf', 'Zuruf', 'Weiterer',
                      u', zur.*', u', auf die', u' an die', u', an .*', u'gewandt']
NAME_REMOVE = re.compile(u'(%s)' % '|'.join(_NAME_REMOVE_PARTS), re.U)

DE_HYPHEN = re.compile(r'([a-z])-([a-z])', re.U)

PARTIES_SPLIT = re.compile(r'(, (auf|an|zur|zum)( die| den )?(.* gewandt)?)')
# searched in normalised text, hence the lower-case, punctuation-free patterns
PARTIES_REGEX = {
    'cducsu': re.compile(' cdu ?(csu)?'),
    'spd': re.compile(' spd'),
    'linke': re.compile(' (die|der|den) linken?'),
    'fdp': re.compile(' fdp'),
    'gruene': re.compile(r' bund ?nis\-?(ses)? ?90 die gru ?nen'),
}


In [11]:
# iterable class for parsing a plenary
class SpeechParser(object):
    """Iterable that splits the lines of a plenary transcript into contributions.

    Iterating over an instance yields one dictionary per contribution with the
    keys ``speaker``, ``in_writing``, ``type`` (``'chair'``, ``'speech'`` or
    ``'poi'``) and ``text``.
    """

    def __init__(self, lines):
        """Keep the transcript ``lines`` (an iterable of strings) for iteration."""
        self.lines = lines
        self.missing_recon = False

    def parse_pois(self, group):
        """Split the content of a parenthesised remark into its single remarks.

        ``group`` is cut at every ' - '. Each piece is yielded as a
        ``(speaker_name, text)`` tuple; ``speaker_name`` is the part before
        the first ': ' or None when the piece contains no ': '.
        """
        for fragment in group.split(' - '):
            name, separator, remark = fragment.partition(': ')
            if separator:
                yield (name, remark)
            else:
                yield (None, fragment)

    def __iter__(self):
        """Yield the contributions found between the begin and the end mark."""
        self.in_session = False
        speaker = None
        in_writing = False
        is_chair = False
        paragraphs = []

        def emit(reset_chair=True):
            """Build the record for the collected text and empty the buffer."""
            nonlocal is_chair
            joined_text = "\n\n".join(paragraphs).strip()
            record = {
                'speaker': speaker,
                'in_writing': in_writing,
                'type': 'chair' if is_chair else 'speech',
                'text': joined_text
            }
            if reset_chair:
                is_chair = False
            del paragraphs[:]
            return record

        for line in self.lines:
            # the stripped line is used for most checks and for the stored text
            rline = line.strip()

            # everything up to and including the begin mark is skipped; this
            # drops the front matter of the protocol
            if not self.in_session:
                if BEGIN_MARK.match(line):
                    self.in_session = True
                continue

            # nothing after the end-of-session mark is parsed
            # NOTE(review): text collected since the last emit is not yielded
            # on this path - confirm that this is intended
            if END_MARK.match(rline):
                return

            # track whether speeches are currently only given in writing;
            # these two checks never skip the line themselves
            if WRITING_BEGIN.match(rline):
                in_writing = True
            if WRITING_END.match(rline):
                in_writing = False

            # empty lines carry no content
            if not len(rline):
                continue

            # agenda announcements and lines holding a stopword must not be
            # mistaken for a speaker line
            is_top = TOP_MARK.match(line) is not None
            lowered = line.lower()
            has_stopword = any(sw.lower() in lowered for sw in SPEAKER_STOPWORDS)

            # a speaker line closes the running contribution and opens a new one
            speaker_match = SPEAKER_MARK.match(line)
            if speaker_match is not None and not (is_top or has_stopword):
                if speaker is not None:
                    yield emit()
                speaker = speaker_match.group(1)
                # the first word of the line decides whether the chair speaks
                is_chair = rline.split(' ')[0] in CHAIRS
                continue

            # a parenthesised line is a remark from the floor, unless it is a
            # 'siehe ...' reference, which stays part of the running text
            poi_match = POI_MARK.match(rline)
            if poi_match is not None:
                remark = poi_match.group(1)
                if not remark.lower().strip().startswith('siehe'):
                    yield emit(reset_chair=False)
                    in_writing = False
                    for poi_speaker, poi_text in self.parse_pois(remark):
                        yield {
                            'speaker': poi_speaker,
                            'in_writing': False,
                            'type': 'poi',
                            'text': poi_text
                        }
                    continue

            # ordinary line: part of the running contribution
            paragraphs.append(rline)

        yield emit()


In [12]:
# Other functions for translating the body

# Disabled helper kept as a bare string literal, so it is never defined;
# it would read two integers from fixed positions of the file's base name.
'''
def file_metadata(filename):
    fname = os.path.basename(filename)
    return int(fname[:2]), int(fname[2:5])
'''

# NOTE(review): not referenced by any other cell visible in this notebook
names = set()

def clean_text(text):
    """Decode ``text`` if necessary and normalise special characters.

    Args:
        text: a ``str``, or a bytes-like object that is decoded as UTF-8 with
            Latin-1 as fallback.

    Returns:
        The text with carriage returns turned into newlines, non-breaking
        spaces into plain spaces and the dash characters U+0096, U+2013 and
        U+2014 into '-'.
    """
    if not isinstance(text, str):
        try:
            text = text.decode('utf-8')
        except UnicodeDecodeError:
            # only a failed UTF-8 decode triggers the fallback; any other
            # error (or a keyboard interrupt) is no longer swallowed
            text = text.decode('latin-1')
    replacements = (
        ('\r', '\n'),
        (u'\xa0', ' '),
        (u'\x96', '-'),
        (u'\u2014', '-'),
        (u'\u2013', '-'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

def clean_name(name):
    """Strip function titles and remark markers from a speaker name.

    Everything matched by NAME_REMOVE is deleted, a hyphen between two
    lower-case letters is dropped, and surrounding hyphens and whitespace are
    trimmed. None is passed through unchanged.
    """
    if name is None:
        return name
    without_titles = NAME_REMOVE.sub('', name)
    dehyphenated = DE_HYPHEN.sub(r'\1\2', without_titles)
    return dehyphenated.strip('-').strip()

def fingerprint(name):
    """Build a hyphen-separated fingerprint of a speaker name.

    The parts matched by FP_REMOVE are replaced by a space, the result is
    passed through ``normalize`` and its spaces are turned into hyphens.

    Returns:
        The fingerprint, or None when ``name`` is None or nothing is left
        after normalisation.
    """
    if name is None:
        return None
    normalized = normalize(FP_REMOVE.sub(' ', name.strip()))
    # normality's normalize() is expected to return None for text that
    # normalises to nothing (e.g. a name consisting only of a title);
    # guard it so that a single odd speaker does not abort the whole parse
    if not normalized:
        return None
    return normalized.replace(' ', '-')

def search_party_names(text):
    """Find the political parties mentioned in a speaker string.

    Only the part of ``text`` before the first PARTIES_SPLIT match is
    searched, after it has been passed through ``normalize``.

    Returns:
        The matching PARTIES_REGEX keys, sorted and joined with ':', or None
        when ``text`` is None or no party is found.
    """
    if text is None:
        return None
    head = PARTIES_SPLIT.split(text)[0]
    normalized = normalize(head)
    # normality's normalize() is expected to return None for text that
    # normalises to nothing; the regex search below cannot handle None
    if not normalized:
        return None
    parties = sorted(
        party for party, rex in PARTIES_REGEX.items() if rex.search(normalized)
    )
    if not parties:
        return None
    return ':'.join(parties)


def parse_transcript(filename):
    """Parse one transcript .txt file and store the result as JSON.

    Args:
        filename: path of a transcript file; its base name without extension
            is used as the date of the plenary (e.g. '2011-09-08.txt').

    The contributions found by SpeechParser are stored under their sequence
    number, starting at 1, next to a 'date' entry. The result is written to
    JSON_PATH as '<date>.json'; the directory is created if it is missing.
    """
    with open(filename, 'rb') as fh:
        text = clean_text(fh.read())

    # base name without directory and extension
    plenary_date = os.path.splitext(os.path.basename(filename))[0]

    # initialize JSON mask
    base_data = {
        'date': plenary_date,
    }

    # print the date of protokolle that extracts
    print("Loading plenary protocol of date %s" % (plenary_date))

    # declare SpeechParser iterable object
    parser = SpeechParser(text.split('\n'))

    for seq, contrib in enumerate(parser, start=1):
        # add cleaned speaker and party to contribution
        contrib['speaker_cleaned'] = clean_name(contrib['speaker'])
        contrib['speaker_fp'] = fingerprint(contrib['speaker_cleaned'])
        contrib['speaker_party'] = search_party_names(contrib['speaker'])
        # add full contribution to JSON as dictionary
        base_data[seq] = contrib

    # the output path used to be an undefined name; derive it from the date
    os.makedirs(JSON_PATH, exist_ok=True)
    json_file = os.path.join(JSON_PATH, plenary_date + '.json')
    with open(json_file, 'w', encoding='utf-8') as fh:
        json.dump(base_data, fh, ensure_ascii=False, indent=4)



In [None]:

# we do not need to fetch all the protocols as we did it before
def fetch_protokolle():
    """Download plenary transcripts as .txt files into TXT_DIR.

    NOTE(review): TXT_DIR, OUT_DIR, INDEX_URL, ARCHIVE_URL, html and urljoin
    are not defined or imported in the visible part of this notebook, so the
    function presumably cannot run here as it stands - confirm before use.
    """
    # create the target directories; any error (e.g. the directory already
    # exists) is silently ignored
    for d in TXT_DIR, OUT_DIR:
        try:
            os.makedirs(d)
        except:
            pass

    # collect every link of the index page that points to a .txt file
    urls = set()
    res = requests.get(INDEX_URL)
    doc = html.fromstring(res.content)
    for a in doc.findall('.//a'):
        url = urljoin(INDEX_URL, a.get('href'))
        if url.endswith('.txt'):
            urls.add(url)

    # add the archive URLs built from the numbers 30 to 259
    for i in range(30, 260):
        url = ARCHIVE_URL % i
        urls.add(url)

    for url in urls:
        # local file name: base name of the URL without '-data'
        txt_file = os.path.join(TXT_DIR, os.path.basename(url))
        txt_file = txt_file.replace('-data', '')
        # files that are already on disk are not downloaded again
        if os.path.exists(txt_file):
            continue

        r = requests.get(url)
        # only responses with a status code below 300 are stored
        if r.status_code < 300:
            with open(txt_file, 'wb') as fh:
                fh.write(r.content)

            print(url, txt_file)


In [None]:
# convert every transcript found in TXT_PATH
for filename in os.listdir(TXT_PATH):
    transcript_path = os.path.join(TXT_PATH, filename)
    parse_transcript(transcript_path)


In [28]:
# manual run of the parsing steps on a single plenary

filename = '2011-09-08'

# read the transcript as bytes and normalise its special characters
with open('%s/%s.txt' % (TXT_PATH, filename), 'rb') as fh:
    text = clean_text(fh.read())

# JSON mask: the date plus one entry per contribution
json_data = {'date': filename}

# report which protocol is being processed
print("Loading plenary protocol of date %s" % (filename))

# sequence number of the next contribution
seq = 1
# the parser works on the single lines of the transcript
parser = SpeechParser(text.split('\n'))

for contrib in parser:
    speaker_cleaned = clean_name(contrib['speaker'])
    # extend the contribution with its position and the derived speaker fields
    contrib.update({
        'sequence': seq,
        'speaker_cleaned': speaker_cleaned,
        'speaker_fp': fingerprint(speaker_cleaned),
        'speaker_party': search_party_names(contrib['speaker']),
    })
    json_data[seq] = contrib
    seq += 1
    # table.insert(contrib)



Loading plenary protocol of date 2011-09-08


In [29]:
# show the parsed result: the 'date' entry plus one entry per sequence number
json_data

{'date': '2011-09-08',
 1: {'speaker': None,
  'in_writing': False,
  'type': 'speech',
  'text': 'Guten Morgen, liebe Kolleginnen und Kollegen! Ichhabe zu Beginn einige amtliche Mitteilungen zu machen.\n\nZuerst geht es um Nachwahlen zu Gremien, und zwarzunächst zum Stiftungsrat der Stiftung zur Aufarbeitung der SED-Diktatur. Die Fraktion der SPD schlägtals neues ordentliches Mitglied aus dem Kreis der Fraktionen den Kollegen Siegmund Ehrmann vor. Neuesordentliches Mitglied aus dem Kreis der Personen, die inFragen der Aufarbeitung besonders engagiert sind, sollanstelle von Professor Hermann Weber der frühere Abgeordnete Markus Meckel werden. Sind Sie mit diesenVorschlägen einverstanden? - Das ist offensichtlich derFall. Dann sind der Kollege Siegmund Ehrmann undHerr Markus Meckel hiermit in den Stiftungsrat gewählt.\n\nDie Fraktion Bündnis 90/Die Grünen hat mitgeteilt,dass die Kollegin Dr. Valerie Wilms für den aus demDeutschen Bundestag ausgeschiedenen KollegenWinfried Hermann neues 

In [18]:
# seq starts at 1 and grows by one per contribution, so this shows the number
# of parsed contributions plus one
seq

3067