# Parsing votes
> Downloading & parsing votes after downloading xlsx files behind the links on `https://www.bundestag.de/parlament/plenum/abstimmung/liste`.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from fastcore.all import *

import bundestag.data as data
import bundestag.data as b_data
import bundestag.data.download.bundestag_sheets as download_sheets
import bundestag.data.transform.bundestag_sheets as transform_sheets
import bundestag.data.utils as data_utils
import bundestag.logging as logging
import bundestag.paths as paths

## Setup

In [None]:
# Let the package logger emit DEBUG-level messages for the steps below.
logging.logger.setLevel("DEBUG")
logger = logging.logger

# Project path collection rooted at the relative directory `../data`;
# displayed as the cell output for inspection.
_paths = paths.get_paths("../data")
_paths

In [None]:
# Forwarded as `dry=` to the download helpers in the cells below.
# Presumably True means "skip the actual downloads" — confirm against
# `download_sheets`.
dry = True

In [None]:
# NOTE(review): `dry=False` is hard-coded here instead of following the `dry`
# flag defined above — presumably so the raw-data directories always exist
# before anything is read or written; confirm.
_paths.make_raw_paths(dry=False)

## Collecting URIs for `.xlsx`/`.xls` documents from `.htm` files

`.xlsx` / `.xls` files will be referred to as "sheet" files.

In [None]:
# Earlier hard-coded locations, kept for reference only: the following cells
# take `html_path` and `sheet_path` from `_paths` instead.
# path = Path("../bundestag.de_data")
# html_path = path / "htm_files"
# sheet_path = path / "sheets"

In [None]:
# Directory that is searched for `.htm` files in the next cell.
html_path = _paths.raw_bundestag_html

In [None]:
# Collect the files under `html_path` whose names match the `.htm` pattern.
# `get_file_paths` is taken from `data_utils`, the same module the sheet-file
# listing further down uses, rather than relying on the name also being
# reachable through `download_sheets`.
html_file_paths = data_utils.get_file_paths(
    html_path, pattern=download_sheets.RE_HTM
)
# Show only the first few entries to keep the output short.
html_file_paths[:3]

In [None]:
# Gather the sheet URIs found in the html files; the result is dict-like and
# is looked up by poll title further down.
sheet_uris = download_sheets.collect_sheet_uris(html_file_paths)

# Peek at the first and the last three (key, uri) pairs.
uri_items = list(sheet_uris.items())
uri_items[:3], uri_items[-3:]

## Downloading sheet files

In [None]:
# Directory passed as `sheet_path=` to the download helpers and later
# searched for sheet files.
sheet_path = _paths.raw_bundestag_sheets

In [None]:
# Download one specific sheet, selected by the key it has in `sheet_uris`.
poll_title = (
    "10.09.2020: Abstrakte Normenkontrolle - Düngeverordnung (Beschlussempfehlung)"
)
uri = sheet_uris[poll_title]  # raises KeyError if that poll is not in the collected URIs
download_sheets.download_sheet(uri, sheet_path=sheet_path, dry=dry)

In [None]:
# Download several sheets in one call. `nmax=3` presumably caps the number of
# sheets handled — confirm against `download_multiple_sheets`.
download_sheets.download_multiple_sheets(
    sheet_uris, sheet_path=sheet_path, nmax=3, dry=dry
)

In [None]:
# Build the file-to-poll lookup that the sheet-loading helpers below receive
# as `file_title_maps=`.
file_title_maps = transform_sheets.get_file2poll_maps(sheet_uris, sheet_path)
file_title_maps

## Loading sheets into DataFrames

Collecting the `xlsx` and `xls` file names

In [None]:
# Files under `sheet_path` whose names match the sheet file-name pattern.
sheet_files = data_utils.get_file_paths(
    sheet_path, pattern=download_sheets.RE_FNAME
)
sheet_files

Reading files into dataframes

In [None]:
# Use the first sheet file as the worked example for the next cells.
# NOTE(review): raises IndexError if `sheet_files` is empty.
sheet_file = sheet_files[0]
sheet_file

In [None]:
# Load the single example sheet into a dataframe.
df = transform_sheets.get_sheet_df(sheet_file, file_title_maps=file_title_maps)
# Transposed preview of the first rows (columns shown as rows).
df.head().T

Squishing vote columns

In [None]:
# Derive the "squished" frame from the single-sheet `df` loaded above.
df_squished = transform_sheets.get_squished_dataframe(df)
# Transposed preview of the first three rows.
df_squished.head(3).T

Setting some dtypes

In [None]:
# Rebinds `df_squished` to the dtype-adjusted frame; re-running this cell
# therefore feeds the already-converted frame back into `set_sheet_dtypes`.
df_squished = transform_sheets.set_sheet_dtypes(df_squished)

In [None]:
# Transposed preview of the first rows after the dtype step.
df_squished.head().T

Loading multiple sheets into dataframes

In [None]:
# Load all collected sheet files in one call. Note that this rebinds `df`,
# which previously held the single-sheet frame.
df = transform_sheets.get_multiple_sheets_df(
    sheet_files, file_title_maps=file_title_maps
)
df.head().T

Doing all of the above in a single call

In [None]:
# Single entry point taking the html and sheet directories; rebinds `df` again.
# `nmax=3` and `dry=dry` are passed through — presumably with the same meaning
# as in the download cells above; confirm against `get_multiple_sheets`.
df = b_data.get_multiple_sheets(html_path, sheet_path, nmax=3, dry=dry)
df.head().T

Writing to disk

In [None]:
# Target parquet file inside the preprocessed bundestag directory.
parquet_name = "bundestag.de_votes.parquet"
df_path = _paths.preprocessed_bundestag / parquet_name

# Only announces the target; the write itself happens in a later cell.
logger.info(f"Writing to {df_path}")

In [None]:
# Called with `dry=False` irrespective of the notebook's `dry` flag —
# presumably creates the preprocessed-data directories needed by the write
# below; confirm.
_paths.make_preprocessed_paths(dry=False)

In [None]:
# Persist the frame to `df_path`.
# NOTE(review): this write runs regardless of `dry`, and `df` at this point
# comes from the `nmax=3` call above — confirm that writing this frame to the
# shared parquet file is intended.
df.to_parquet(df_path)