# Parsing votes
> Parsing votes after downloading the `.xlsx` / `.xls` files behind the links on `https://www.bundestag.de/parlament/plenum/abstimmung/liste`.

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Setup

In [None]:
from fastcore.all import *
from bundestag import html_parsing as hp

## Collecting URIs for `.xlsx`/`.xls` documents from `.htm` files

`.xlsx` / `.xls` files will be referred to as "sheet" files.

In [None]:
# Local folders used throughout the notebook: `.htm` pages are looked up under
# `html_path`, sheet files are downloaded to and read from `sheet_path`.
html_path, sheet_path = Path('../website_data'), Path('../sheets')

In [None]:
# Gather file paths from `html_path` using the `hp.RE_HTM` pattern
# (presumably matching `.htm` file names — confirm in `html_parsing`).
html_file_paths = hp.get_file_paths(html_path, pattern=hp.RE_HTM)
# Preview the first three entries only.
html_file_paths[:3]

In [None]:
# Sanity check of the collected paths against `html_path`, using the
# project's own test helper.
hp.test_file_paths(html_file_paths, html_path)

In [None]:
%%time
sheet_uris = hp.collect_sheet_uris(html_file_paths)
list(sheet_uris.items())[:3], list(sheet_uris.items())[-3:]

In [None]:
# Sanity check of the collected URI mapping, using the project's own test helper.
hp.test_sheet_uris(sheet_uris)

## Downloading sheet files

In [None]:
%%time
# Download a single sheet, looked up in `sheet_uris` by its title.
# NOTE(review): the title is hard-coded, so this lookup fails if the collected
# pages do not include this particular vote — confirm it is present.
uri = sheet_uris['10.09.2020: Abstrakte Normenkontrolle - Düngeverordnung (Beschlussempfehlung)']
hp.download_sheet(uri, sheet_path=sheet_path, verbose=True)

In [None]:
%%time
# Download several sheets into `sheet_path` and keep the returned mapping for
# the loading steps further down.
# NOTE(review): `nmax=3` presumably caps the number of downloads — confirm.
file_title_maps = hp.download_multiple_sheets(sheet_uris, sheet_path=sheet_path, nmax=3)

In [None]:
# Sanity check of the download result against the URI mapping, using the
# project's own test helper.
hp.test_file_title_maps(file_title_maps, sheet_uris)

## Loading sheets into DataFrames

Collecting the `xlsx` and `xls` file names

In [None]:
# Gather file paths from `sheet_path` using the `hp.RE_FNAME` pattern
# (presumably matching `.xlsx` / `.xls` file names — confirm in `html_parsing`).
sheet_files = hp.get_file_paths(sheet_path, pattern=hp.RE_FNAME)
sheet_files

In [None]:
# Everything below indexes into `sheet_files`, so fail early with a message
# that names the searched folder when nothing was found.
assert len(sheet_files) > 0, f'No sheet files found in {sheet_path}'

Reading files into dataframes

In [None]:
%%time
# Read the first collected sheet file into a DataFrame.
sheet_file = sheet_files[0]
df = hp.get_sheet_df(sheet_file, file_title_maps=file_title_maps)
# Transposed preview: the first rows are shown as columns.
df.head().T

In [None]:
# Sanity check of the single-sheet DataFrame, using the project's own test helper.
hp.test_get_sheet_df(df)

Squishing vote columns

In [None]:
# Derive the "squished" frame from `df` under a new name, so `df` itself
# is not rebound here.
df_squished = hp.get_squished_dataframe(df)
# Transposed preview: the first rows are shown as columns.
df_squished.head().T

In [None]:
# Sanity check of the squished frame against the frame it was derived from,
# using the project's own test helper.
hp.test_squished_df(df_squished, df)

Setting some dtypes

In [None]:
# Rebind `df_squished` to the result of the dtype conversion.
# NOTE(review): the input name is overwritten, so re-running this cell is only
# safe if `set_sheet_dtypes` is idempotent — confirm.
df_squished = hp.set_sheet_dtypes(df_squished)

Loading multiple sheets into dataframes

In [None]:
%%time
# Load all collected sheet files at once.
# NOTE: this rebinds `df`, replacing the single-sheet frame created earlier.
df = hp.get_multiple_sheets_df(sheet_files, file_title_maps=file_title_maps)
# Transposed preview: the first rows are shown as columns.
df.head().T

Doing all the above

In [None]:
%%time
# Single call starting from the two folders instead of the step-by-step
# results above.
# NOTE: this rebinds `df` once more.
# NOTE(review): `nmax=3` presumably caps the number of processed sheets — confirm.
df = hp.get_multiple_sheets(html_path, sheet_path, nmax=3)
# Transposed preview: the first rows are shown as columns.
df.head().T

Writing to disk

In [None]:
%%time
# df.to_parquet('../roll_call_votes.parquet')