In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

In [2]:
def get_states_tables():
    """
    Scrape the per-state poll tables from
    'http://www.wahlrecht.de/umfragen/laender.htm'.

    The page is walked row by row (every <tr> in document order):

    * a row containing a <th colspan="10" id=...> starts a new state; the
      value of that id attribute becomes the dictionary key,
    * the stripped, non-empty texts of the row's <td> cells are collected
      as one table row,
    * a row containing a <th colspan="10" class="trenner"> ends the current
      state and stores what has been collected so far.

    The column names are the texts of the first nine <th> elements on the
    page that carry a class attribute.

    Return: a dictionary containing the id names of the states as keywords and the
            pd dataframes as values. The 'hb' table is stored without column
            names when it is closed by a trenner row.

    Raises: whatever urllib raises when the page cannot be fetched, and
            whatever pandas raises when a collected row does not fit the
            nine header names.
    """
    tables = {}  # {'state': df}

    # Use the response as a context manager so the connection is closed as
    # soon as the document has been parsed (BeautifulSoup reads it eagerly).
    with urllib.request.urlopen('http://www.wahlrecht.de/umfragen/laender.htm') as page:
        soup = BeautifulSoup(page, 'html.parser')

    rows = soup.find_all('tr')
    header = [col.get_text() for col in soup.find_all('th', class_=True, limit=9)]

    # Rows seen before the first state header are collected under a
    # placeholder name and never stored.
    table = []
    name = "ignore"
    for row in rows:
        # Start point of a new state: take the id from the matched <th>
        # itself instead of relying on its position among the row's children.
        state_cell = row.find('th', colspan='10', id=True)
        if state_cell is not None:
            table = []
            name = state_cell.get('id')

        # Read the data of the subtable. Empty cells are dropped, and rows
        # without any <td> (e.g. header rows) are appended as empty lists.
        # NOTE(review): those empty lists end up as rows in the DataFrame;
        # kept as-is because callers may rely on the row positions.
        cells = [ele.text.strip() for ele in row.find_all('td')]
        table.append([ele for ele in cells if ele])

        # End point for each state
        if row.find('th', colspan='10', class_="trenner") is not None:
            # Don't use the information outside the states.
            if name != "ignore" and name != 'hb':
                tables[name] = pd.DataFrame(table, columns=header)
            # Handle the information from hb = Bremen differently: per the
            # original author its table is empty, so no header is applied.
            elif name == 'hb':
                tables[name] = pd.DataFrame(table)

    # Add the last table, which doesn't have a trenner at the end. Skip this
    # when no state header was found at all, so that the placeholder name
    # never shows up as a key in the result.
    if name != "ignore":
        tables[name] = pd.DataFrame(table, columns=header)

    return tables

In [3]:
# Fetch the poll page and build the {state id: DataFrame} dictionary.
states = get_states_tables()