In [2]:
import re
from datetime import datetime
import pandas as pd
import pypdf
from pypdf import PdfReader
import glob

In [5]:
# Names of the German federal states that are searched for in the PDF text.
bundeslaender = [
    "Baden-Württemberg",
    "Bayern",
    "Berlin",
    "Brandenburg",
    "Bremen",
    "Hamburg",
    "Hessen",
    "Mecklenburg-Vorpommern",
    "Niedersachsen",
    "Nordrhein-Westfalen",
    "Rheinland-Pfalz",
    "Saarland",
    "Sachsen",
    "Sachsen-Anhalt",
    "Schleswig-Holstein",
    "Thüringen",
]

# Regex with one capture group: any state name (escaped literally) that is
# followed by a single space and then either a digit or an opening parenthesis.
pattern = r"({}) [\d(]".format("|".join(map(re.escape, bundeslaender)))

def add_year(date, cmp, years):
    """Append a year to a date string, chosen by comparing its month to ``cmp``.

    Args:
        date: String whose characters at index 3 and 4 form the month number
            (e.g. ``"24.12."``).
        cmp: Month number used as the threshold.
        years: Indexable pair of years; ``years[0]`` is used when the month is
            greater than or equal to ``cmp``, ``years[1]`` otherwise.

    Returns:
        ``date`` with the selected year appended as text.
    """
    month = int(date[3:5])
    chosen_year = years[0] if month >= cmp else years[1]
    return date + str(chosen_year)

def replace(string, *replace_targets, replace='a'):
    """Replace every occurrence of each target substring with one replacement.

    Args:
        string: Text to process.
        *replace_targets: Substrings to be replaced, applied one after another
            in the order given.
        replace: Replacement text substituted for every target.

    Returns:
        The text after all targets have been replaced.
    """
    substitute = replace
    result = string
    for needle in replace_targets:
        result = result.replace(needle, substitute)
    return result

# Collect every PDF in the "Daten" folder. A forward slash is used because it
# works on all platforms; the previous "Daten\*.pdf" only globbed on Windows
# and relied on the invalid string escape "\*".
files = glob.glob("Daten/*.pdf")
files.sort()

results = []

for file in files:
    reader = PdfReader(file)
    number_of_pages = len(reader.pages)
    # Only the first page of each PDF is parsed.
    page = reader.pages[0]
    text = page.extract_text()

    # Every four-digit number that directly follows a line break is treated as
    # a year. The smallest one and its successor form the two candidate years
    # that get attached to the day/month dates below.
    years = re.findall(r"\n\d{4}", text)
    years = [int(year.strip("\n ")) for year in years]
    if not years:
        # Fail with a message that names the file instead of the opaque
        # "min() arg is an empty sequence".
        raise ValueError(f"No year found on the first page of {file!r}")
    year = (min(years), min(years) + 1)

    for line in text.split("\n"):
        bundesland = re.findall(pattern, line)
        if not bundesland:
            continue

        # Use the first state name matched in the line.
        bundesland = bundesland[0]

        # Find date ranges such as "01.02. - 05.02."; the separator may be
        # "-", "+" or an en dash and may be surrounded by single spaces.
        dates = re.findall(r"\d{2}\.\d{2}\. ?[-\+–] ?\d{2}\.\d{2}\.", line)
        # Normalise the separator to "-" and split into [start, end].
        dates = [replace(date, "+", "–", replace="-") for date in dates]
        dates = [date.replace(" ", "").split("-") for date in dates]

        # TODO: pull out solo dates as well

        if not dates:
            continue

        # The month of the first date in the line is the anchor for choosing
        # the year. The table covers two years, e.g. | 2000 - - - | 2001 - - - |,
        # so a month >= the anchor belongs to the first year and a month below
        # the anchor belongs to the following year.
        comparison_month = int(dates[0][0][3:5])
        for date1, date2 in dates:
            date1 = add_year(date1, comparison_month, year)
            date2 = add_year(date2, comparison_month, year)
            date1 = datetime.strptime(date1, "%d.%m.%Y")
            date2 = datetime.strptime(date2, "%d.%m.%Y")
            results.append({
                "Ferien Start": date1,
                "Ferien Ende": date2,
                "Bundesland": bundesland
            })

# Passing the columns explicitly keeps sort_values from raising a KeyError
# when no holiday period was found at all (empty `results`).
df = (
    pd.DataFrame(results, columns=["Ferien Start", "Ferien Ende", "Bundesland"])
    .sort_values(by=["Ferien Start"])
    .reset_index(drop=True)
)

In [6]:
# Write the DataFrame to ferien.csv in the current working directory,
# leaving out the index column.
df.to_csv("./ferien.csv", index=False)