# Программирование на языке Python
## Семинар 19. Работа с файлами разных форматов

https://historik.val.se/val/val2018/slutresultat/R/rike/index.html

**Жирный текст**

<b>Жирный текст</b>

<p><b style="color: red;">Какой-то</b> абзац</p>

<table class="nice_table">
    <thead>
        <tr>
            <th>header1</th>
            <th>header2</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>cell1</td>
            <td>cell2</td>
        </tr>
        <tr>
            <td><a href="https://historik.val.se/val/val2018/slutresultat/R/rike/index.html">cell3</a></td>
            <td>cell4</td>
        </tr>
    </tbody>
</table>

In [6]:
import requests
from bs4 import BeautifulSoup, Tag

In [20]:
url_main = 'https://historik.val.se/val/val2018/slutresultat/R/rike/index.html'

# Send the request and read the body. The context manager releases the
# connection even if decoding fails; raise_for_status() keeps us from
# parsing an HTTP error page as if it were the results page, and the
# timeout keeps the cell from hanging forever on a stalled connection.
with requests.get(url_main, timeout=30) as response:
    response.raise_for_status()
    content_main = response.content.decode()

# Parse the content. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and makes the resulting tree
# independent of which optional parsers happen to be installed.
soup_main = BeautifulSoup(content_main, 'html.parser')

In [27]:
soup_main.head.find_all('meta')

[<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>,
 <meta content="IE=9; IE=8; IE=7; IE=EDGE" http-equiv="X-UA-Compatible"/>,
 <meta content="width=device-width, initial-scale=1" name="viewport"/>]

In [28]:
soup_main.head.find('meta', {'name': 'viewport'})

<meta content="width=device-width, initial-scale=1" name="viewport"/>

In [30]:
soup_main.head.find('meta', {'name': 'hhh'})

In [None]:
# найдем таблицу

In [31]:
table = soup_main.find('table', {'class': 'sorteringsbar_tabell'})

In [44]:
from collections import defaultdict

def parse_table(table: Tag, columns=(0, 1, 2, 6)) -> dict:
    """Collect selected columns of an HTML table into a dict of lists.

    Rows that contain a ``<th>`` are treated as header rows: the text of
    their selected ``<th>`` cells becomes the keys used for the rows that
    follow. Every other row contributes the text of its selected ``<td>``
    cells, one value per key.

    Args:
        table: Element whose ``<tr>`` descendants are scanned in document
            order.
        columns: Indices of the cells to keep from each row. The default
            keeps the first three cells and the seventh one.

    Returns:
        A ``defaultdict(list)`` mapping header text to the list of cell
        texts collected under it. Rows that are too short to provide every
        requested column are skipped, and data rows that appear before any
        header row contribute nothing.
    """
    info = defaultdict(list)
    keys = []  # stays empty until a header row is seen

    for tr in table.find_all('tr'):
        is_header = tr.find('th') is not None
        cells = tr.find_all('th' if is_header else 'td')

        try:
            picked = [cells[index].text for index in columns]
        except IndexError:
            # The row does not have all requested columns (for example a
            # spacer or merged-cell row), so there is nothing to extract.
            continue

        if is_header:
            keys = picked
        else:
            for key, value in zip(keys, picked):
                info[key].append(value)

    return info

In [33]:
type(table)

bs4.element.Tag

In [38]:
table.find_all('tr')[0].th.text

'Förk.'

In [46]:
type(soup_main)

bs4.BeautifulSoup

In [47]:
url_template = 'https://historik.val.se/val/val2018/slutresultat/R/'

def load_next(page: BeautifulSoup):
    """Look up the ``<table id="oversiktstabell">`` element inside ``page``.

    The value is passed through unchanged from ``page.find``, so it is
    whatever that call yields for this page.
    """
    return page.find('table', attrs={'id': 'oversiktstabell'})

In [1]:
# districts: land ->        län       -> kommun -> valdistrikt
# classes:   rike -> riksdagsvalkrets -> kommun -> valdistrikt

In [25]:
URL = 'https://historik.val.se/val/val2018/slutresultat/R'

def iter_navigation_table(navigation_table: Tag):
    """Yield ``(district, page)`` for each linked cell after the active one.

    The ``<td>`` cells of ``navigation_table`` are scanned for the one whose
    ``class`` attribute is exactly ``['aktiv']``. For every later cell that
    holds a link, the link text is used as the district name and the link
    target is rebuilt as an absolute URL under ``URL`` and fetched with
    ``get_page``.

    Args:
        navigation_table: Element whose ``<td>`` descendants are inspected.

    Yields:
        Tuples ``(district, page)`` where ``district`` is the link text and
        ``page`` is the value returned by ``get_page`` for the link target.

    Raises:
        ValueError: If no cell is marked ``aktiv``, or if a link target
            does not contain the ``R/`` path segment needed to rebuild the
            absolute URL.
    """
    tds = navigation_table.find_all('td')

    # Position of the cell that marks the page we are currently on.
    active_index = next(
        (index for index, td in enumerate(tds) if td.get('class') == ['aktiv']),
        None,
    )
    if active_index is None:
        raise ValueError('navigation table has no <td class="aktiv"> cell')

    for td in tds[active_index + 1:]:
        a = td.a
        href = a.get('href') if a is not None else None
        if not href:
            # Cell without a usable link: there is no page to follow.
            continue

        # get district
        district = a.text

        # get page
        # Keep only the part after the "R/" path segment. Prefixing a slash
        # lets one partition() handle both ".../R/x" and a leading "R/x",
        # and anchoring on the whole segment avoids cutting at an unrelated
        # capital "R" elsewhere in the path.
        _, marker, relative = f'/{href}'.partition('/R/')
        if not marker:
            raise ValueError(f'unexpected navigation href: {href!r}')

        url = f'{URL}/{relative}'
        page = get_page(url)

        yield district, page

In [27]:
nt_gen = iter_navigation_table(nt)
nt_iter = iter(nt_gen)

In [28]:
next(nt_iter)

Parsing page for url https://historik.val.se/val/val2018/slutresultat/R/rvalkrets/10/index.html OK


('Blekinge län',
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 
 <html lang="sv" xml:lang="sv" xmlns="http://www.w3.org/1999/xhtml">
 <head>
 <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
 <meta content="IE=9; IE=8; IE=7; IE=EDGE" http-equiv="X-UA-Compatible"/>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <title>Blekinge län - Röster - Val 2018</title>
 <link href="../../../../css/visistil.css" media="screen" rel="stylesheet" type="text/css"/>
 <link href="../../../../css/visistil_print.css" media="print" rel="stylesheet" type="text/css"/>
 <script src="../../../../javascript/toggla.js" type="text/javascript"></script>
 <script src="../../../../javascript/extpres.js" type="text/javascript"></script>
 <script src="../../../../javascript/tablesort.js" type="text/javascript"></script>
 <script type="text/javascript">
 window.onload=function(){
     initMouseovers

In [19]:
u = 'https://historik.val.se/val/val2018/slutresultat/R/rike/index.html'
# u = 'https://historik.val.se/val/val2018/slutresultat/R/kommun/10/82/index.html'

p = get_page(u)

nt = get_navigation_table(p)

Parsing page for url https://historik.val.se/val/val2018/slutresultat/R/rike/index.html OK


In [31]:
def get_level(navigation_table: Tag) -> str:
    """Return the first CSS class of the element enclosing the active cell.

    Finds the first ``<td>`` with class ``aktiv`` inside
    ``navigation_table`` and reads the ``class`` attribute of its parent
    element; the first entry of that attribute is the result.
    """
    active_cell = navigation_table.find('td', attrs={'class': 'aktiv'})
    parent_classes = active_cell.parent.get('class')

    return parent_classes[0]

In [32]:
get_level(nt)

'rike'

In [20]:
iter_navigation_table(nt)

[0]

In [24]:
nt.find_all('td')[0].a.get('href')

'../../../slutresultat/R/rike/index.html'

In [13]:
nt.find_all('td')[1].get('class')

In [1]:
from bs4 import BeautifulSoup

from getters import get_table, get_navigation_table, get_page
from parsers import parse_table
from iterators import iter_navigation_table
from consts import URL, MIN_LEVELS


def parse_page(page: BeautifulSoup, districts=None):
    """Recursively walk the navigation below ``page`` and collect tables.

    For every ``(level, district, page)`` triple produced by
    ``iter_navigation_table``: when ``level`` is in ``MIN_LEVELS`` the
    page's table is parsed and appended to the module-level ``TABLES`` list
    together with the path leading to it; otherwise the function recurses
    into that page. A progress line is printed for each entry, indented with
    one tab per level of depth.

    Args:
        page: Parsed page whose navigation table is traversed.
        districts: ``(level, district)`` pairs leading to ``page``.
            Defaults to an empty path.
    """
    # Use a fresh list per call instead of a mutable default shared by all.
    districts = [] if districts is None else districts
    indent = '\t' * len(districts)

    navigation_table = get_navigation_table(page)

    # ``child_page`` is named separately so the ``page`` argument is not
    # overwritten while iterating.
    for level, district, child_page in iter_navigation_table(navigation_table):
        path = districts + [(level, district)]

        if level in MIN_LEVELS:
            table = get_table(child_page)
            parsed_table = parse_table(table)
            TABLES.append((path, parsed_table))

            print(f'{indent}({level}) {district} table parsed successfully...')

        else:
            print(f'{indent}({level}) {district}')
            parse_page(child_page, path)

In [2]:
url_main = 'https://historik.val.se/val/val2018/slutresultat/R/rike/index.html'
TABLES = []

page = get_page(url_main)
parse_page(page)



  soup = BeautifulSoup(content)


(riksdagsvalkrets) Blekinge län
	(kommun) Karlshamn
		(valdistrikt) Centrala Asarum table parsed successfully...
		(valdistrikt) Froarp table parsed successfully...
		(valdistrikt) Gustavstorp table parsed successfully...
		(valdistrikt) Horsaryd table parsed successfully...
		(valdistrikt) Hällaryd table parsed successfully...
		(valdistrikt) Högadal table parsed successfully...
		(valdistrikt) Korpadalen table parsed successfully...
		(valdistrikt) Mörrum västra table parsed successfully...
		(valdistrikt) Mörrum östra table parsed successfully...
		(valdistrikt) Prästslätten table parsed successfully...
		(valdistrikt) Skogsborg table parsed successfully...
		(valdistrikt) Stadsporten table parsed successfully...
		(valdistrikt) Svängsta nordöst table parsed successfully...
		(valdistrikt) Svängsta sydväst table parsed successfully...
		(valdistrikt) Torget table parsed successfully...
		(valdistrikt) Tubbaryd table parsed successfully...
		(valdistrikt) Vägga table parsed successfu

KeyboardInterrupt: 

In [3]:
import pickle

# Restore TABLES from a pickle file on disk instead of re-crawling the site.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this is only safe for a trusted file — presumably one written by an
# earlier run of this notebook; verify before loading a file from elsewhere.
with open('tmp/tables.pickle', 'rb') as infile:
    TABLES = pickle.load(infile)

In [5]:
len(TABLES)

6325

In [17]:
import numpy as np
import pandas as pd


def process_table_df(table_dict) -> pd.DataFrame:
    """Build a DataFrame directly from ``table_dict``.

    ``table_dict`` is handed to the ``pd.DataFrame`` constructor as is, so
    its keys become the column labels.
    """
    return pd.DataFrame(table_dict)


def process_table_np(table_dict) -> np.ndarray:
    """Stack the values of ``table_dict`` into one NumPy array.

    The values are taken in the dict's iteration order and each one becomes
    a row of the result; the keys are not included.
    """
    rows = [*table_dict.values()]

    return np.array(rows)

In [25]:
%%timeit
df_final = pd.concat([process_table_df(item[1]) for item in TABLES])

325 ms ± 5.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%%timeit
nps = [process_table_np(item[1]) for item in TABLES]
df_final = pd.DataFrame(np.hstack(nps).T, columns=['Förk.', 'Parti', 'Antal2018', 'Antal2014'])

70.3 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
array1 = np.array([1, 2, 3, 4])
array2 = np.array([1, 2, 3, 4])[::-1]

In [30]:
np.concatenate([array1, array2])

array([1, 2, 3, 4, 4, 3, 2, 1])

In [31]:
np.hstack([array1, array2])

array([1, 2, 3, 4, 4, 3, 2, 1])

In [33]:
nps[0]

array([['M', 'C', 'L', 'KD', 'S', 'V', 'MP', 'SD', 'FI', 'ÖVR', '\xa0',
        'OGEJ', 'BLANK', 'OG', 'VDT', '\xa0'],
       ['Moderaterna', 'Centerpartiet',
        'Liberalerna (tidigare Folkpartiet)', 'Kristdemokraterna',
        'Arbetarepartiet-Socialdemokraterna', 'Vänsterpartiet',
        'Miljöpartiet de gröna', 'Sverigedemokraterna',
        'Feministiskt initiativ', 'Övriga anmälda partier',
        'Giltiga röster', 'Ogiltiga röster - inte anmälda partier',
        'Ogiltiga röster - blanka', 'Ogiltiga röster - övriga',
        'Valdeltagande', 'Antal röstberättigade'],
       ['162', '37', '37', '53', '519', '82', '32', '311', '8', '11',
        '1252', '\xa0', '13', '\xa0', '1265', '1555'],
       ['164', '28', '19', '29', '604', '60', '54', '217', '19', '6',
        '1200', '\xa0', '10', '\xa0', '1210', '1491']], dtype='<U38')

In [32]:
nps[1]

array([['M', 'C', 'L', 'KD', 'S', 'V', 'MP', 'SD', 'FI', 'ÖVR', '\xa0',
        'OGEJ', 'BLANK', 'OG', 'VDT', '\xa0'],
       ['Moderaterna', 'Centerpartiet',
        'Liberalerna (tidigare Folkpartiet)', 'Kristdemokraterna',
        'Arbetarepartiet-Socialdemokraterna', 'Vänsterpartiet',
        'Miljöpartiet de gröna', 'Sverigedemokraterna',
        'Feministiskt initiativ', 'Övriga anmälda partier',
        'Giltiga röster', 'Ogiltiga röster - inte anmälda partier',
        'Ogiltiga röster - blanka', 'Ogiltiga röster - övriga',
        'Valdeltagande', 'Antal röstberättigade'],
       ['258', '97', '40', '62', '465', '85', '52', '420', '8', '11',
        '1498', '\xa0', '7', '1', '1506', '1699'],
       ['260', '78', '41', '39', '549', '74', '107', '326', '27', '11',
        '1512', '\xa0', '14', '\xa0', '1526', '1716']], dtype='<U38')

In [2]:
train = {
    'level 1': [
        {'level 2': [
            {'level 3': 'value'},
            {'level 3': 'value'},
            {'level 3': 'value'}
        ]},
        {'level 2': [
            {'level 3': 'value'},
            {'level 3': 'value'},
            {'level 3': 'value'}
        ]},
        {'level 2': [
            {'level 3': 'value'},
            {'level 3': 'value'},
            {'level 3': 'value'}
        ]}
    ]
}

In [3]:
def process_value(val):
    """Return ``val * 3``."""
    return val * 3


def process_structure(structure, level=None, out=None):
    """Flatten a nested single-key structure into ``(path, value)`` pairs.

    Each node is a dict whose first key names its level. A node keyed
    ``'level 3'`` is a leaf: its value is passed through ``process_value``
    and appended to the accumulator together with the list of level names
    leading to it. Any other node is expected to map its key to an iterable
    of child nodes, which are processed recursively.

    Args:
        structure: The node to process. An empty dict is ignored.
        level: Level names leading to ``structure``. Defaults to an empty
            path.
        out: List that receives the ``(path, value)`` tuples. When omitted,
            the module-level ``LIST`` is used.

    Returns:
        None. Results are appended to ``out`` (or ``LIST``).
    """
    # Fresh path per call rather than a mutable default shared by all calls.
    level = [] if level is None else level
    if out is None:
        out = LIST  # module-level accumulator, the original behaviour

    if not structure:
        return

    # Only the first entry of the node is looked at.
    structure_level, content = next(iter(structure.items()))
    path = level + [structure_level]

    if structure_level == 'level 3':
        out.append((path, process_value(content)))
    else:
        for item in content:
            process_structure(item, path, out)

In [4]:
LIST = []

process_structure(train)

In [5]:
LIST

[(['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue'),
 (['level 1', 'level 2', 'level 3'], 'valuevaluevalue')]