In [None]:
from glob import glob
import textract
import requests
import os
import gensim
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
import logging
import pandas as pd 
from IPython.display import display
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Print log records of level INFO and above, prefixed with their severity.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore the level explicitly.
# setLevel() is used instead of assigning to `.level` so that the logging
# module's cached isEnabledFor() decisions are invalidated as well.
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Local directories: downloaded PDFs go to PDF_FOLDER, rendered figures to IMAGES_FOLDER.
PDF_FOLDER = 'data/pdfs'
IMAGES_FOLDER = 'data/images'

# One (party key, PDF url) pair per party. The key becomes the local PDF file
# name and is the label printed and plotted for that party.
# NOTE(review): 'die-link' looks like a truncated 'die-linke' - confirm before
# renaming, because the key is also the name of the already-downloaded file.
partei_pdfs = [
    (
        'cdu',
        'https://www.cdu.de/system/tdf/media/dokumente/170703regierungsprogramm2017.pdf?file=1',
    ),
    (
        'spd',
        'https://www.spd.de/fileadmin/Dokumente/Bundesparteitag_2017/Es_ist_Zeit_fuer_mehr_Gerechtigkeit-Unser_Regierungsprogramm.pdf',
    ),
    (
        'die-link',
        'https://www.die-linke.de/fileadmin/download/wahlen2017/wahlprogramm2017/die_linke_wahlprogramm_2017.pdf',
    ),
    (
        'afd',
        'https://www.afd.de/wp-content/uploads/sites/111/2017/06/2017-06-01_AfD-Bundestagswahlprogramm_Onlinefassung.pdf',
    ),
    (
        'fdp',
        'https://www.fdp.de/sites/default/files/uploads/2017/08/07/20170807-wahlprogramm-wp-2017-v16.pdf',
    ),
    (
        'gruene',
        'https://www.gruene.de/fileadmin/user_upload/Dokumente/BUENDNIS_90_DIE_GRUENEN_Bundestagswahlprogramm_2017.pdf',
    ),
    (
        'piratenpartei',
        'https://www.piratenpartei.de/files/2017/06/Wahlprogramm-BTW2017.pdf',
    ),
    (
        'wv',
        'http://www.wv-leipzig.de/wp-content/uploads/2017/04/FW-Wahlprogramm-2017-Die-anstaendige-Alternative.pdf',
    ),
]

def download_file(url, filename, timeout = 60):
    """Download url and store the response body in filename.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Path of the file to write; it is opened in binary write mode.
        timeout: Seconds passed to requests as the request timeout.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    res = requests.get(url, timeout = timeout)
    # Fail before writing anything: otherwise an error page would be stored
    # under the target name and look like a valid download afterwards.
    res.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(res.content)
# Create both output directories up front: the PDFs are written below and the
# figures are saved into IMAGES_FOLDER by later cells, which do not create it.
for folder in (PDF_FOLDER, IMAGES_FOLDER):
    os.makedirs(folder, exist_ok = True)

for partei, pdf_url in partei_pdfs:
    filename = '{}/{}.pdf'.format(PDF_FOLDER, partei)
    # Files that already exist locally are not downloaded again.
    if os.path.exists(filename):
        continue
    print('Downloading: {}'.format(partei))
    download_file(pdf_url, filename)

In [None]:
# Party key (PDF file name without extension) -> text extracted by textract.
# The values are kept undecoded here; they are decoded as UTF-8 further down.
data = {}
# sorted() makes the processing order independent of the directory listing order.
for pdf_file in sorted(glob('{}/*.pdf'.format(PDF_FOLDER))):
    # basename/splitext instead of splitting on '/': glob may return paths
    # with a different separator (e.g. on Windows).
    partei = os.path.splitext(os.path.basename(pdf_file))[0]
    print('Extracting pdf text: {}'.format(partei))
    text = textract.process(pdf_file)
    data[partei] = text

In [None]:
def get_stopwords(file = 'data/de-stopwords.txt'):
    """Read a stop-word list from a text file with one entry per line.

    Args:
        file: Path of the stop-word file. It is read as UTF-8.

    Returns:
        list of str: The whitespace-stripped, non-empty lines in file order.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere and
    # would garble non-ASCII words (e.g. umlauts).
    with open(file, encoding = 'utf-8') as f:
        return [word for word in (line.strip() for line in f) if word != '']

def tokenize(text, stopwords = get_stopwords()):
    """Tokenize text with gensim's simple_preprocess and drop stop words.

    Args:
        text: The string to tokenize.
        stopwords: Collection of tokens to exclude. The default list is loaded
            once, at the time this function is defined.

    Returns:
        list of str: The tokens that are not stop words, in their original order.
    """
    # NOTE(review): simple_preprocess runs with its default token-length
    # bounds, so very long words may be discarded - confirm this is intended.
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every single token.
    excluded = set(stopwords)
    return [token for token in simple_preprocess(text) if token not in excluded]

def clean_text(text):
    """Lower-case text, collapse its whitespace and tokenize the result.

    Args:
        text: The document text as a str.

    Returns:
        The token list returned by tokenize() for the normalised text.
    """
    # split() without arguments breaks on every run of whitespace, newlines
    # included, so the pieces carry no surrounding whitespace.
    lowered_words = (word.lower() for word in text.split())
    return tokenize(' '.join(lowered_words))

# Party key -> token list returned by clean_text for that party's text.
cleaned_data = {}
for partei, raw_text in data.items():
    print('Partei: {}'.format(partei))
    # The extracted text is still bytes at this point; decode before cleaning.
    tokens = clean_text(raw_text.decode('utf-8'))
    cleaned_data[partei] = tokens
    # Show the most frequent tokens of this party (head() keeps the top five).
    df_words = pd.DataFrame(tokens, columns = ['word'])
    word_counts = df_words.word.value_counts()
    display(word_counts.head().to_frame())

In [None]:
def wc_plot(words, wc_size = (800, 800), max_words = 50):
    """Draw a word cloud of words on a new matplotlib figure.

    Args:
        words: Iterable of word strings; they are joined with single spaces
            and handed to WordCloud.generate.
        wc_size: (width, height) passed on to WordCloud.
        max_words: Passed on to WordCloud as max_words.

    Returns:
        tuple: (fig, ax) of the newly created figure. The figure is not
        closed here; that is left to the caller.
    """
    # Generate the cloud before creating the figure: if generation raises,
    # no orphaned figure stays registered with pyplot.
    wordcloud = WordCloud(
        background_color='white',
        width = wc_size[0],
        height = wc_size[1],
        max_words=max_words,
    ).generate(' '.join(words))

    fig, ax = plt.subplots(figsize = (20, 20))
    # Draw on the axes created above instead of pyplot's implicit current axes.
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    return fig, ax

# Write one word-cloud image per party into IMAGES_FOLDER.
for partei, words in cleaned_data.items():
    title = 'Partei: {}'.format(partei)
    target = '{}/wc_{}.png'.format(IMAGES_FOLDER, partei)
    fig, ax = wc_plot(words)
    fig.suptitle(title, fontsize = 40)
    fig.savefig(target, dpi = 120)
    # Close each figure once it is saved so they do not pile up in pyplot.
    plt.close(fig)


In [None]:
# Long-format table: one (partei, word) row per token occurrence.
d = [
    (partei, word)
    for partei, words in cleaned_data.items()
    for word in words
]

df_words = pd.DataFrame(d, columns = ['partei', 'word'])
# Length of every programme in characters, shortest first. The values in
# `data` are undecoded bytes, so they are decoded before measuring: len() of
# the raw bytes counts multi-byte characters (e.g. umlauts) more than once,
# which would contradict the '# characters' axis label.
df_partei_lengths = (
    pd.DataFrame({partei: [len(text.decode('utf-8'))] for partei, text in data.items()})
    .T
    .rename(columns={0: 'text_length'})
    .sort_values(by = 'text_length')
)
fig, ax = plt.subplots(figsize = (12, 6))
df_partei_lengths.plot(kind = 'barh', legend = False, ax = ax)
ax.set_xlabel('# characters')
fig.suptitle('Wahlprogramm text length per partei', fontsize = 16)
fig.savefig('{}/text_lengths.png'.format(IMAGES_FOLDER), dpi = 100)
plt.close(fig)