In [1]:
"""
This script extracts tables from the website 'http://www.wahlrecht.de/umfragen/' 
for each polling firm individually.

Calling the function get_tables() returns a dictionary with the firm names
as keys and the corresponding Pandas dataframes as values.
"""

"\nThis script extracts tables from the website 'http://www.wahlrecht.de/umfragen/' \nfor each polling firm individually.\n\nCalling the function get_tables() returns a dictionary with the firm names\nas keys and the corresponding Pandas dataframes as values.\n"

In [2]:
import numpy as np
import pandas as pd
import io
import requests
from bs4 import BeautifulSoup
import urllib.request

# Base URL of the polling overview page; the per-firm page names found on it
# are appended to this string to build the full firm URLs.
wahlrecht = 'http://www.wahlrecht.de/umfragen/'

In [3]:
def get_table_from_polling_firm(url):
    """
    Download the page of one polling firm and return its table as a dataframe.

    The first <thead> and <tbody> of the page are used. Every cell is kept as
    stripped text; empty cells are discarded, so the result contains strings
    only and still has to be type-converted by the caller.

    url:    str, the full url of the website,
            e.g. 'http://www.wahlrecht.de/umfragen/emnid.htm'
    Return: Pandas dataframe whose columns are the header texts, with 'Datum'
            prepended when the header does not contain it.
    """
    # The context manager closes the HTTP response once the page is parsed;
    # previously the connection was left open.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')

    head = soup.find('thead')
    body = soup.find('tbody')

    # One list of non-empty, stripped cell texts per table row.
    table = []
    for row in body.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all('td')]
        table.append([text for text in cells if text])

    # Header cells consisting of a single non-breaking space are spacers.
    header = [th.get_text() for th in head.find_all('th')
              if th.get_text() != '\xa0']
    if 'Datum' not in header:
        header.insert(0, 'Datum')

    return pd.DataFrame(table, columns=header)

In [179]:
def _sum_sonstige(column):
    """Return `column` as floats; entries listing several numbers become their sum."""
    parsed = pd.to_numeric(column, errors='coerce').astype(float)
    # Entries that are present but not a plain number, e.g. 'PIR 3 Sonst. 3'.
    unparsed = parsed.isna() & column.notna()
    if unparsed.any():
        found = column[unparsed].astype(str).str.findall(r'\d+(?:\.\d+)?')
        parsed.loc[unparsed] = found.map(
            lambda numbers: sum(map(float, numbers)) if numbers else np.nan
        )
    return parsed


def preprocess(table):
    """
    Convert the scraped table of strings into a table with proper types.

    Steps: drop the column 'Zeitraum'; drop the rows holding election results
    ('Bundestagswahl' in 'Befragte') and the rows whose 'CDU/CSU' cell
    contains 'Umfrage' or is missing; strip '%', '–', '?' and turn decimal
    commas into points; convert every column between the first and the last
    one to float; convert 'Datum' to datetime.date.

    Entries of 'Sonstige' that list several numbers are replaced by the sum
    of those numbers. (The previous implementation summed the distinct digit
    characters instead, so e.g. '3 ... 3' gave 3 and '10' gave 1.)

    NOTE(review): dates are parsed day-first, assuming the site publishes
    them as DD.MM.YYYY -- confirm against the raw 'Datum' column.

    table:  pandas dataframe of strings with the columns 'Datum' (first),
            'Befragte' (last after dropping 'Zeitraum'), 'CDU/CSU',
            'Sonstige' and 'Zeitraum'
    return: new pandas dataframe, the input is not modified
    """
    # drop the column Zeitraum
    table = table.drop('Zeitraum', axis=1)

    # Drop election-result and annotation rows with a boolean mask, so the
    # selection does not depend on the index being 0..n-1. na=True keeps the
    # earlier behaviour of also dropping rows without a CDU/CSU value.
    is_election = table['Befragte'] == 'Bundestagswahl'
    is_note = table['CDU/CSU'].str.contains('Umfrage', na=True).astype(bool)
    table = table[~(is_election | is_note)].reset_index(drop=True)

    # remove '%', '–', '?' and use '.' as decimal separator
    table = table.replace('%', '', regex=True)
    table = table.replace(',', '.', regex=True)
    table = table.replace('[–?]', '', regex=True)
    # strip the marker characters from Befragte; the column stays a string
    table['Befragte'] = table['Befragte'].replace('[T • ?≈O • ]', '', regex=True)
    # mark all empty entries as 'NaN' so that they convert to float NaN
    table = table.replace('', 'NaN', regex=True)

    table['Sonstige'] = _sum_sonstige(table['Sonstige'])

    # every column except the first (Datum) and the last (Befragte) is numeric
    numeric_columns = table.columns[1:-1]
    table[numeric_columns] = table[numeric_columns].astype(float)
    # convert the date to type date
    table['Datum'] = pd.to_datetime(table['Datum'], dayfirst=True).dt.date
    return table

In [183]:
def get_tables():
    """
    Collect the preprocessed poll tables of all polling firms.

    Reads the overview page 'http://www.wahlrecht.de/umfragen/', takes the
    link of every element with class 'in', downloads each linked page with
    get_table_from_polling_firm(arg) and cleans it with preprocess(arg).

    Return: a dictionary containing the names of polling firms (the part of
            the link before the first '.') as keys and the pd dataframes as
            values.
    """
    # Close the HTTP response as soon as the overview page is parsed.
    with urllib.request.urlopen(wahlrecht) as page:
        soup = BeautifulSoup(page, 'html.parser')

    firms_url = []
    for row in soup.find_all(class_='in'):
        link = row.find('a')
        # Skip elements without a usable link instead of failing on None.
        href = link.get('href') if link is not None else None
        if href:
            firms_url.append(href)

    tables = {}
    for url in firms_url:
        key = url.split('.')[0]
        tables[key] = preprocess(get_table_from_polling_firm(wahlrecht + url))

    return tables

In [184]:
# Scrape and preprocess the tables of all polling firms (performs HTTP requests).
tables = get_tables()

In [182]:
# Display the dictionary of dataframes returned by get_tables().
tables

{'allensbach':          Datum  CDU/CSU   SPD  GRÜNE  FDP  LINKE   AfD  Sonstige Befragte
 0   2017-05-26     37.0  26.0    8.0  9.0    8.0   8.0       4.0    1.457
 1   2017-04-25     36.0  31.0    7.0  6.0    9.0   7.0       4.0    1.407
 2   2017-03-28     34.0  33.0    7.5  6.5    8.0   7.0       4.0    1.397
 3   2017-02-22     33.0  30.5    8.0  7.0    8.0   8.5       5.0    1.542
 4   2017-01-26     36.0  23.0    9.0  7.0    9.5  11.5       4.0    1.441
 5   2016-12-22     35.5  22.0   10.0  7.5    9.5  10.5       5.0    1.459
 6   2016-11-16     34.0  23.0   11.0  7.5    9.0  10.5       5.0    1.436
 7   2016-10-20     33.0  22.0   12.0  7.5    9.0  12.5       4.0    1.458
 8   2016-09-22     33.5  24.0   11.0  7.0    7.0  12.5       5.0    1.407
 9   2016-08-24     34.5  23.0   11.5  7.5    9.0  10.0       4.5    1.496
 10  2016-07-21     35.5  22.5   12.0  7.0    9.5   9.5       4.0    1.466
 11  2016-06-15     33.5  21.0   12.0  8.0    9.0  11.5       5.0    1.396
 12  2016-0