# Программирование на языке Python
## Семинар 19. Работа с файлами разных форматов

https://historik.val.se/val/val2018/slutresultat/R/rike/index.html

**Жирный текст**

<b>Жирный текст</b>

<p><b style="color: red;">Какой-то</b> абзац</p>

<table class="nice_table">
    <thead>
        <tr>
            <th>header1</th>
            <th>header2</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>cell1</td>
            <td>cell2</td>
        </tr>
        <tr>
            <td><a href="https://historik.val.se/val/val2018/slutresultat/R/rike/index.html">cell3</a></td>
            <td>cell4</td>
        </tr>
    </tbody>
</table>

In [2]:
import requests
from bs4 import BeautifulSoup, Tag

In [None]:
# bs4 exports no name ``T``; the truncated import is restored to ``Tag``.
from bs4 import Tag

In [2]:
url_main = 'https://historik.val.se/val/val2018/slutresultat/R/rike/index.html'

# Send the request and read the body. The ``with`` block closes the response
# even if decoding raises, and the timeout keeps the cell from waiting
# forever on a server that never answers.
with requests.get(url_main, timeout=30) as response:
    content_main = response.content.decode()

# parse content
soup_main = BeautifulSoup(content_main)

In [27]:
# Every <meta> tag found inside the document's <head>.
soup_main.head.find_all('meta')

[<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>,
 <meta content="IE=9; IE=8; IE=7; IE=EDGE" http-equiv="X-UA-Compatible"/>,
 <meta content="width=device-width, initial-scale=1" name="viewport"/>]

In [28]:
# First <meta> tag in <head> whose ``name`` attribute is "viewport".
soup_main.head.find('meta', {'name': 'viewport'})

<meta content="width=device-width, initial-scale=1" name="viewport"/>

In [30]:
# Same lookup with a ``name`` that matches nothing; the cell shows no output.
soup_main.head.find('meta', {'name': 'hhh'})

In [None]:
# найдем таблицу

In [17]:
import requests
from warnings import warn
from collections import defaultdict
from time import sleep
from bs4 import BeautifulSoup, Tag


URL = 'https://historik.val.se/val/val2018/slutresultat/R'
MIN_LEVELS = {'onsdagsdistrikt', 'valdistrikt'}
SLEEPTIME = 0.5


def get_table(page: BeautifulSoup) -> Tag:
    """Look up the results table on a parsed page.

    Returns whatever ``page.find`` yields for a ``<table>`` element that
    carries the CSS class ``sorteringsbar_tabell``.
    """
    wanted_attrs = {'class': 'sorteringsbar_tabell'}
    return page.find('table', wanted_attrs)


def parse_table(table: Tag, columns=(0, 1, 2, 6)) -> dict:
    """Collect selected columns of an HTML table into a dict of lists.

    A row that contains at least one ``<th>`` is treated as a header row: the
    texts of its ``<th>`` cells at the ``columns`` positions become the keys.
    Every other row is a data row: the texts of its ``<td>`` cells at the same
    positions are appended to the lists stored under those keys.

    Rows with too few cells to cover every requested position, and data rows
    that appear before any header row, are skipped rather than raising
    ``IndexError`` / ``UnboundLocalError``.

    Args:
        table: element exposing ``find_all('tr')``; each row must expose
            ``find`` / ``find_all`` and each cell a ``.text`` attribute.
        columns: non-negative, zero-based cell positions to keep. The default
            keeps the first three cells and the seventh one.

    Returns:
        A ``defaultdict(list)`` mapping each header text to the list of cell
        texts collected for that column, in row order.
    """
    info = defaultdict(list)
    columns = tuple(columns)
    # Smallest row width that still contains every requested position.
    min_cells = max(columns) + 1 if columns else 0
    # Stays None until a usable header row has been seen.
    keys = None

    for tr in table.find_all('tr'):
        is_header = tr.find('th') is not None
        cells = [cell.text for cell in tr.find_all('th' if is_header else 'td')]

        # Too short to hold all requested columns: nothing to extract.
        if len(cells) < min_cells:
            continue

        selected = [cells[index] for index in columns]

        if is_header:
            keys = selected
        elif keys is not None:
            for key, value in zip(keys, selected):
                info[key].append(value)

    return info


def get_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The parsed page when the server answers with status code 200.
        ``None`` (after emitting a warning) when the status code differs or
        when the request itself fails with a ``requests.RequestException``.
    """
    soup = None

    try:
        # ``with`` closes the response even if decoding or parsing raises.
        with requests.get(url, timeout=timeout) as response:
            if response.status_code == 200:
                content = response.content.decode()

                # parse content
                soup = BeautifulSoup(content)
            else:
                warn(f'page for url {url} not parsed, status code {response.status_code}')
    except requests.RequestException as error:
        # Report network failures the same way as bad status codes, so a
        # single unreachable page does not abort the caller.
        warn(f'page for url {url} not parsed, request failed: {error}')

    # Fixed pause of SLEEPTIME seconds after every request attempt.
    sleep(SLEEPTIME)

    return soup


def get_navigation_table(page: BeautifulSoup) -> Tag:
    """Look up the ``<table>`` with id ``oversiktstabell`` on a parsed page.

    The result is whatever ``page.find`` gives back for that lookup.
    """
    navigation_attrs = {'id': 'oversiktstabell'}
    navigation_table = page.find('table', navigation_attrs)

    return navigation_table


def parse_navigation_table(navigation_table: Tag):
    """Yield ``(level, district, page)`` for the cells after the active one.

    All ``<td>`` cells of ``navigation_table`` are scanned for the first one
    whose class list is exactly ``['aktiv']``; an ``IndexError`` is raised if
    there is none. For every later cell the linked page is downloaded with
    ``get_page`` and, unless that returned ``None``, a tuple is yielded with:

    * the first class of the cell's parent element,
    * the text of the cell's ``<a>`` element,
    * the downloaded page.
    """
    cells = navigation_table.find_all('td')

    # Positions of all cells marked as active; only the first one is used.
    active_positions = [
        position
        for position, cell in enumerate(cells)
        if cell.get('class') == ['aktiv']
    ]
    first_following = active_positions[0] + 1

    for cell in cells[first_following:]:
        # The first class of the parent element names the level.
        level = cell.parent.get('class')[0]

        link = cell.a
        district = link.text

        # Keep the part of the href that starts two characters after the
        # first "R" and append it to the base URL.
        href = link.get('href')
        tail_start = href.find("R") + 2
        url = f'{URL}/{href[tail_start:]}'
        page = get_page(url)

        # Pages that could not be fetched are left out.
        if page is not None:
            yield level, district, page


def parse_page(page: BeautifulSoup, districts=None):
    """Walk the navigation of ``page`` recursively and collect result tables.

    For every entry produced by ``parse_navigation_table``:

    * if its level is in ``MIN_LEVELS``, the results table of the linked page
      is parsed and ``(district path, parsed table)`` is appended to the
      module-level ``TABLES`` list;
    * otherwise the linked page is processed recursively, one level deeper.

    Progress is printed with one tab of indentation per nesting level.

    Args:
        page: parsed page whose navigation table is walked.
        districts: list of ``(level, district)`` tuples leading to ``page``;
            ``None`` (the default) means the walk starts at the top.
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    districts = [] if districts is None else districts
    navigation_table = get_navigation_table(page)
    indent = '\t' * len(districts)

    # The loop variable has its own name so the ``page`` argument is not
    # overwritten while iterating.
    for level, district, child_page in parse_navigation_table(navigation_table):
        path = districts + [(level, district)]

        if level in MIN_LEVELS:
            table = get_table(child_page)

            # A page without a results table is reported and skipped instead
            # of crashing the whole walk inside parse_table.
            if table is None:
                warn(f'({level}) {district} has no results table, skipped')
                continue

            parsed_table = parse_table(table)
            TABLES.append((path, parsed_table))

            print(f'{indent}({level}) {district} table parsed successfully...')

        else:
            print(f'{indent}({level}) {district}')
            parse_page(child_page, path)

In [18]:
# Module-level list that parse_page appends (district path, parsed table)
# tuples to; reset here before the walk starts.
TABLES = []
url_main = 'https://historik.val.se/val/val2018/slutresultat/R/rike/index.html'

# Fetch the starting page and walk its navigation; parse_page fills TABLES.
page = get_page(url_main)
parse_page(page)

(riksdagsvalkrets) Blekinge län
	(kommun) Karlshamn
		(valdistrikt) Centrala Asarum table parsed successfully...
		(valdistrikt) Froarp table parsed successfully...
		(valdistrikt) Gustavstorp table parsed successfully...
		(valdistrikt) Horsaryd table parsed successfully...
		(valdistrikt) Hällaryd table parsed successfully...
		(valdistrikt) Högadal table parsed successfully...
		(valdistrikt) Korpadalen table parsed successfully...
		(valdistrikt) Mörrum västra table parsed successfully...
		(valdistrikt) Mörrum östra table parsed successfully...
		(valdistrikt) Prästslätten table parsed successfully...
		(valdistrikt) Skogsborg table parsed successfully...
		(valdistrikt) Stadsporten table parsed successfully...
		(valdistrikt) Svängsta nordöst table parsed successfully...
		(valdistrikt) Svängsta sydväst table parsed successfully...
		(valdistrikt) Torget table parsed successfully...
		(valdistrikt) Tubbaryd table parsed successfully...
		(valdistrikt) Vägga table parsed successfu

In [19]:
# Number of (district path, parsed table) entries collected so far.
len(TABLES)

6325

In [20]:
import pickle
import os

# Create the output directory next to the notebook. ``exist_ok=True`` makes
# this a no-op when the directory is already there, without a separate
# existence check that could race with another process.
current_dir = os.getcwd()
os.makedirs(os.path.join(current_dir, 'tmp'), exist_ok=True)

# Store the collected tables so the walk does not have to be repeated.
with open('tmp/tables.pickle', 'wb') as outfile:
    pickle.dump(TABLES, outfile)

In [None]:
# Reload the tables from tmp/tables.pickle.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# pickle files that you created yourself.
with open('tmp/tables.pickle', 'rb') as infile:
    TABLES = pickle.load(infile)

In [28]:
import numpy as np
import pandas as pd


def process_value_pd(table_dict) -> pd.DataFrame:
    """Build a DataFrame straight from a column-name -> values mapping."""
    return pd.DataFrame(table_dict)


def process_value_np(table_dict) -> np.ndarray:
    """Stack the mapping's value sequences into one array, one row per key."""
    rows = [*table_dict.values()]
    return np.array(rows)

In [33]:
%%timeit
# Variant 1: build one DataFrame per parsed table, then concatenate them all.
df = pd.concat([process_value_pd(item[-1]) for item in TABLES])

336 ms ± 3.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
%%timeit
# Variant 2: turn every parsed table into an array (one row per key), join
# the arrays side by side, transpose so each table row becomes a DataFrame
# row, and build a single DataFrame with explicitly given column names.
nps = [process_value_np(item[-1]) for item in TABLES]
df2 = pd.DataFrame(np.hstack(nps).T, columns=['Förk.', 'Parti', 'Antal2018', 'Antal2014'])

79.5 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
# District path ((level, district) tuples) stored with the first table.
TABLES[0][0]

[('riksdagsvalkrets', 'Blekinge län'),
 ('kommun', 'Karlshamn'),
 ('valdistrikt', 'Centrala Asarum')]