In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urllib.request

In [2]:
def get_states_tables(url='http://www.wahlrecht.de/umfragen/laender.htm'):
    """
    Download the state-level polling overview page and split it into one
    table per state.

    The page is walked row by row (``<tr>``). A row containing a
    ``<th colspan="10" id=...>`` cell starts a new state; a row containing a
    ``<th colspan="10" class="trenner">`` cell ends the current one. The text
    of every ``<td>`` without a ``rowspan`` attribute is collected in between,
    with empty cells stored as ``None``.

    Parameters:
        url: address of the page to parse. Defaults to the wahlrecht.de
             overview of state polls.

    Return: a dictionary containing the id names of the states as keywords and
            the pd dataframes as values. Rows seen before the first state
            header are discarded.

    Raises:
        urllib.error.URLError: if the page cannot be downloaded.
        ValueError: if a collected row is wider than the column header
                    (raised by the pandas DataFrame constructor).
    """
    tables = {}  # {'state': df}

    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')

    rows = soup.find_all('tr')
    # Column titles: the text of the first nine <th> cells carrying a class.
    header = [col.get_text() for col in soup.find_all('th', class_=True, limit=9)]

    def _store(state, data):
        """Save the rows collected for `state`, skipping the pre-state placeholder."""
        if state == "ignore":
            # Rows seen before any state header do not belong to a state.
            return
        # hb = Bremen is handled differently: its table has only 8 columns.
        columns = header[:8] if state == 'hb' else header
        tables[state] = pd.DataFrame(data, columns=columns)

    # Initialize with empty/unimportant values
    table = []
    name = "ignore"
    for row in rows:
        # Start point of a new state
        state_cell = row.find('th', colspan='10', id=True)
        if state_cell is not None:
            table = []
            # Read the id from the matched cell itself instead of relying on
            # its position among the row's children (whitespace dependent).
            name = state_cell.get('id')

        # Read the data of the subtable
        cols = [ele.text.strip() for ele in row.find_all('td', rowspan=False)]
        table.append([ele if ele else None for ele in cols])

        # End point for each state
        if row.find('th', colspan='10', class_="trenner") is not None:
            _store(name, table)

    # Add last table, that doesn't have trenner at the end
    _store(name, table)

    return tables

In [4]:
# Download and parse the page (performs a network request); the result maps
# each state's id to its DataFrame.
st = get_states_tables()

In [5]:
# Alternative parsing approach using an object-oriented table parser.
# It is not yet suitable for our case because the tables are very messy.

import requests
import pandas as pd
from bs4 import BeautifulSoup

class HTMLTableParser:
    """Turn the ``<table>`` elements of an HTML page into pandas DataFrames."""

    def parse_url(self, url, timeout=None):
        """
        Download `url` and parse every ``<table>`` found in it.

        Parameters:
            url: address of the HTML page.
            timeout: optional timeout in seconds forwarded to ``requests.get``;
                     ``None`` (the default) waits indefinitely.

        Return: a list of ``(table_id, dataframe)`` tuples, one per table.
                ``table_id`` is ``None`` for tables without an ``id`` attribute.
        """
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use .get so that tables lacking an id do not raise KeyError.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """
        Convert one parsed table into a DataFrame.

        Every ``<tr>`` containing at least one ``<td>`` becomes a data row.
        The ``<th>`` texts of the first row that has any are used as column
        titles; without titles the columns are numbered from 0. Rows shorter
        than the widest row are padded with NaN. Columns whose values can all
        be converted to float are converted; the others are left as text.

        Parameters:
            table: an object exposing ``find_all('tr')`` whose rows expose
                   ``find_all('td')`` / ``find_all('th')`` and whose cells
                   expose ``get_text()`` (e.g. a BeautifulSoup table tag).

        Return: the resulting pd.DataFrame.

        Raises:
            Exception: if column titles were found but their count differs
                       from the width of the widest data row.
        """
        column_names = []
        records = []

        # Single pass: collect the data rows and the column titles, if any.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                records.append([td.get_text() for td in td_tags])

            # Handle column names if we find them
            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Size the frame by the widest row, not the first one, so that ragged
        # tables do not overflow the frame.
        n_columns = max((len(record) for record in records), default=0)

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)

        if not records:
            # No data rows: return an empty frame.
            return pd.DataFrame(columns=columns, index=range(0))

        # Pad short rows with NaN and build the frame in one go rather than
        # assigning cell by cell.
        missing = float("nan")
        padded = [record + [missing] * (n_columns - len(record))
                  for record in records]
        df = pd.DataFrame(padded, columns=columns, dtype=object)

        # Convert to float if possible; text columns are deliberately kept.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df