In [7]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

ru_url = 'kv22?xjazyk=CZ&xid=1&xv=11'

In [192]:
def get_url(specific_url):
    base_url = 'https://www.volby.cz/pls/kv2022/'
    response = requests.get(base_url + specific_url)
    soup = bs(response.content)
    return response, soup


def read_all_tables(response):
    return pd.read_html(response.text)


def get_ru_names(soup):
    return [n.text for n in soup.find_all('h3', class_='kraj_ciselnik')]


def get_ru_hrefs(soup):
    hrefs = [t.find_all('a', href=True) for t in soup.find_all('table', class_='table')]
    urls = [list(map(lambda x: x['href'], h)) for h in hrefs ]
    return urls


def get_city_hrefs(soup):
    hrefs = [t.find('a', href=True) for t in soup.find_all('td', class_='cislo')]
    urls = [h['href'] for h in hrefs]
    return urls


def scrape_regional_unit():
    ru_response, ru_soup = get_url(ru_url)
    ru_tables = read_all_tables(ru_response)
    ru_names = get_ru_names(ru_soup)
    ru_city_url = get_ru_hrefs(ru_soup)
    
    for ru_t, ru_n, ru_cu in zip(ru_tables, ru_names, ru_city_url):
        ru_t.columns = ru_t.columns.droplevel(0)
        ru_t.drop('Výběrobce', inplace=True, axis=1)
        ru_t.columns = ['city_code', 'city_name']
        ru_t.set_index('city_code', inplace=True, drop=True)
        ru_t['ru_name'] = ru_n
        ru_t['city_url'] = ru_cu
    
    return pd.concat(ru_tables)


def scrape_city(regional_units):
    ci_tables = []
    for i, row in ru_table_all.iterrows():
        ci_response, ci_soup = get_url(row['city_url'])
        ci_table = read_all_tables(ci_response)[0]
        ci_candidate_url = get_city_hrefs(ci_soup)

        ci_table.columns = ci_table.columns.droplevel(0)
        ci_table['city_code'] = i
        ci_table['candidate_url'] = ci_candidate_url
        ci_tables.append(ci_table)
        
    final_table = pd.concat(ci_tables)
    final_table.drop('Volebníobvody', inplace=True, axis=1)
    final_table.columns = ['precinct_code', 'precinct_name', 'council_type', 'city_code', 'candidate_url']
    final_table.set_index('precinct_code', inplace=True, drop=True)
        
    return final_table


def scrape_candidate(cities):
    can_tables = []
    for i, row in cities.iterrows():
        tmp_response, tmp_soup = get_url(row['candidate_url'])
        if tmp_soup.find('td') is not None:
            tmp_url = tmp_soup.find('td').find('a', href=True)['href']
            can_response, can_soup = get_url(tmp_url)
            can_table = read_all_tables(can_response)[0]

            can_table.columns = can_table.columns.droplevel(0)
            can_table['precinct_code'] = i
            can_tables.append(can_table)
        else:
            pass
        
    return pd.concat(ci_tables)

In [156]:
ru_table_all = scrape_regional_unit()
print(ru_table_all.shape)
ru_table_all.head()

(77, 3)


Unnamed: 0_level_0,city_name,ru_name,city_url
city_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CZ0100,Praha,Hlavní město Praha,kv222?xjazyk=CZ&xid=1&xv=11&xnumnuts=1100
CZ0201,Benešov,Středočeský kraj,kv222?xjazyk=CZ&xid=1&xv=11&xnumnuts=2101
CZ0202,Beroun,Středočeský kraj,kv222?xjazyk=CZ&xid=1&xv=11&xnumnuts=2102
CZ0203,Kladno,Středočeský kraj,kv222?xjazyk=CZ&xid=1&xv=11&xnumnuts=2103
CZ0204,Kolín,Středočeský kraj,kv222?xjazyk=CZ&xid=1&xv=11&xnumnuts=2104


In [157]:
ci_table_all = scrape_city(ru_table_all)
print(ci_table_all.shape)
ci_table_all.head()

(6390, 4)


Unnamed: 0_level_0,precinct_name,council_type,city_code,candidate_url
precinct_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
554782,Praha hl.m.,4,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...
500054,Praha 1,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=5&xnumnuts=11...
500224,Praha 10,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=5&xnumnuts=11...
547034,Praha 11,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=5&xnumnuts=11...
547107,Praha 12,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=5&xnumnuts=11...


In [193]:
can_table_all = scrape_candidate(ci_table_all)
print(can_table_all.shape)
can_table_all.head()

(6390, 6)


Unnamed: 0,číslo,název,Druhzastupitelstva,ru_code,candidate_url,Volebníobvody
0,554782,Praha hl.m.,4,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...,
1,500054,Praha 1,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...,
2,500224,Praha 10,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...,
3,547034,Praha 11,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...,
4,547107,Praha 12,5,CZ0100,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=4&xnumnuts=11...,


Unnamed: 0_level_0,precinct_name,council_type,city_code,candidate_url
precinct_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
546372,Buková,1,CZ0324,kv2211?xjazyk=CZ&xid=1&xv=11&xdz=1&xnumnuts=32...
