In [94]:
import re
from collections import defaultdict
from pathlib import Path

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber

In [95]:
# Names recognised as page headings: the 26 cantons followed by the
# national aggregate ("Switzerland").
cantons = [
    "Aargau",
    "Appenzell Ausserrhoden",
    "Appenzell Innerrhoden",
    "Basel-Landschaft",
    "Basel-Stadt",
    "Bern",
    "Fribourg",
    "Geneva",
    "Glarus",
    "Graubünden",
    "Jura",
    "Lucerne",
    "Neuchâtel",
    "Nidwalden",
    "Obwalden",
    "Saint Gallen",
    "Schaffhausen",
    "Schwyz",
    "Solothurn",
    "Thurgau",
    "Ticino",
    "Uri",
    "Valais",
    "Vaud",
    "Zug",
    "Zurich",
    "Switzerland",
]

In [96]:
# Matches a line that starts with one of the known names (escaped, so "-" and
# similar characters are literal) followed by a word boundary; group 1 holds
# the matched name.
canton_pattern = re.compile(r'^({})\b'.format('|'.join(map(re.escape, cantons))))

In [97]:
def extract_data_units(text):
    """Parse page text into consecutive (title, year, value) triples.

    Blank lines are dropped and the remaining lines are stripped. The lines are
    then scanned with a three-line window: when the middle line looks like a
    year (``2010``), a split year (``2010/11``) or a year range
    (``2000-2010``), the window is recorded and the scan jumps past it;
    otherwise the window slides forward by a single line.

    Parameters
    ----------
    text : str
        Raw text of one page.

    Returns
    -------
    list of dict
        One dict per recognised triple with the keys ``"title"``, ``"year"``
        and ``"value"`` (all strings), in document order.
    """
    year_re = re.compile(r"^\d{4}$|^\d{4}/\d{2}$|^\d{4}-\d{4}$")
    rows = [raw.strip() for raw in text.splitlines() if raw.strip()]

    units = []
    pos = 0
    last_start = len(rows) - 3  # last index at which a full triple still fits
    while pos <= last_start:
        heading, period, amount = rows[pos:pos + 3]
        if year_re.match(period):
            units.append({"title": heading, "year": period, "value": amount})
            pos += 3  # consume the whole triple
        else:
            pos += 1  # not a data block here; retry one line further down
    return units

In [98]:
# Extract the records of one yearly PDF by iterating through its pages

def extract_file(file_year, start_page=None, end_page=None):
    """Extract every (title, year, value) record from one yearly PDF.

    Opens ``raw/{file_year}.pdf`` and reads the pages whose 0-based index lies
    in ``range(start_page, len(doc) - end_page)``. On each page, the first line
    that starts with a name from ``canton_pattern`` determines the canton, and
    ``extract_data_units`` turns the page text into records.

    Parameters
    ----------
    file_year : int or str
        Stem of the PDF inside ``raw/``; also stored in the ``file_year``
        column of the result.
    start_page : int, optional
        0-based index of the first page to read. Defaults to 2.
    end_page : int, optional
        Number of trailing pages to skip (a count, not an index).
        Defaults to 1.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``title``, ``year``, ``value``,
        ``canton`` and ``file_year``. If no record is found the frame is empty
        but still has these columns.
    """
    if start_page is None:
        start_page = 2
    if end_page is None:
        end_page = 1

    records = []
    # A page without a canton heading keeps the canton of the previous page.
    # Pages before the first heading get None instead of raising NameError.
    canton_name = None

    # The context manager closes the document even if parsing fails.
    with fitz.open(f"raw/{file_year}.pdf") as doc:
        for page_num in range(start_page, len(doc) - end_page):
            text = doc.load_page(page_num).get_text()

            for line in text.strip().splitlines():
                found = canton_pattern.match(line)
                if found:
                    # Use the full matched name: splitting on whitespace would
                    # truncate "Appenzell Ausserrhoden" / "Appenzell
                    # Innerrhoden" / "Saint Gallen" to their first word.
                    canton_name = found.group(1)
                    break

            for unit in extract_data_units(text):
                records.append(
                    {**unit, "canton": canton_name, "file_year": file_year}
                )

    # Build the frame once from all records; explicit columns keep the schema
    # stable even when nothing was extracted.
    return pd.DataFrame(
        records, columns=["title", "year", "value", "canton", "file_year"]
    )


In [99]:
# Years whose PDFs are read with extract_file's default page bounds.
normal_files = [2012, 2013, *range(2019, 2022)]

In [100]:
# Parse every default-layout PDF and report how many records each one yields.
# files_info is reset here; later cells append the remaining years to it.
files_info = []
for file_year in normal_files:
    file_info = extract_file(file_year=file_year)
    print(f"Extracted data for {file_year} with {len(file_info)} records.")
    files_info.append(file_info)

Extracted data for 2012 with 1593 records.
Extracted data for 2013 with 1593 records.
Extracted data for 2019 with 1944 records.
Extracted data for 2020 with 1944 records.
Extracted data for 2021 with 1944 records.


In [101]:
# 2014 PDF: read from 0-based page index 5 and skip the last 5 pages
# (presumably non-data front/back matter; confirm against the PDF).
# NOTE: re-running this cell appends a duplicate frame to files_info.
files_info.append(
    extract_file(file_year=2014, start_page=5, end_page=5)
)

In [None]:
# 2015 PDF: read from 0-based page index 5 and skip the last 4 pages
# (presumably non-data front/back matter; confirm against the PDF).
# NOTE: re-running this cell appends a duplicate frame to files_info.
files_info.append(
    extract_file(file_year=2015, start_page=5, end_page=4)
)

In [None]:
# 2016 PDF: read from 0-based page index 5 and skip the last 6 pages
# (presumably non-data front/back matter; confirm against the PDF).
# NOTE: re-running this cell appends a duplicate frame to files_info.
files_info.append(
    extract_file(file_year=2016, start_page=5, end_page=6)
)

In [None]:
# 2017 PDF: read from 0-based page index 7 and skip the last 6 pages
# (presumably non-data front/back matter; confirm against the PDF).
# NOTE: re-running this cell appends a duplicate frame to files_info.
files_info.append(
    extract_file(file_year=2017, start_page=7, end_page=6)
)

In [None]:
# 2018 PDF: read from 0-based page index 7 and skip the last 5 pages
# (presumably non-data front/back matter; confirm against the PDF).
# NOTE: re-running this cell appends a duplicate frame to files_info.
files_info.append(
    extract_file(file_year=2018, start_page=7, end_page=5)
)

In [None]:
# Stack the per-file frames into a single DataFrame with a fresh 0..n-1 index,
# then print a structural summary (row count, columns, dtypes).
concated_df = pd.concat(objs=files_info, ignore_index=True)
concated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18414 entries, 0 to 18413
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      18414 non-null  object
 1   year       18414 non-null  object
 2   value      18414 non-null  object
 3   canton     18414 non-null  object
 4   file_year  18414 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 719.4+ KB


In [None]:
# Write the combined table as a tab-separated file without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure
# "processed/" exists first (needed on a fresh checkout).
output_path = Path("processed/2012-2021.tsv")
output_path.parent.mkdir(parents=True, exist_ok=True)
concated_df.to_csv(output_path, sep="\t", index=False)