# Haikus aus dem Nationalrat


Protokolle laden und flatten (eine Zeile pro Wortmeldung)

In [1]:
import json
import pandas as pd

# Read the session protocols. The encoding is given explicitly because the
# platform default is not guaranteed to be UTF-8 and the texts contain umlauts.
with open("woswormeileistung/data/sessions.json", encoding="utf-8") as f:
    sessions = json.load(f)

# Flatten: one row per entry of each session's "sections" list, with the
# session-level fields (period, sessionNumber, date) repeated on every row.
wortmeldungen = pd.json_normalize(
    sessions,
    record_path=["sections"],
    meta=["period", "sessionNumber", "date"],
)

# Keep only the columns used further down, in a fixed order.
wortmeldungen = wortmeldungen[["period", "sessionNumber", "date", "speaker", "text"]]

wortmeldungen.head()

Unnamed: 0,period,sessionNumber,date,speaker,text
0,XXVII,276,2024-09-18T00:00:00,88386,Präsident Mag. Wolfgang Sobotka: Meine sehr ge...
1,XXVII,276,2024-09-18T00:00:00,88386,Präsident Mag. Wolfgang Sobotka: Meine sehr ge...
2,XXVII,276,2024-09-18T00:00:00,88386,Präsident Mag. Wolfgang Sobotka: Der Herr Bund...
3,XXVII,276,2024-09-18T00:00:00,88386,Präsident Mag. Wolfgang Sobotka: Die Amtlichen...
4,XXVII,276,2024-09-18T00:00:00,88386,Präsident Mag. Wolfgang Sobotka: Ich darf beka...


Sprecher:in-Label aus dem Text entfernen

In [2]:
# Each text starts with a "<speaker>: " label; split once at the first ": "
# and keep only the part after it. Rows without ": " end up as NaN.
label_and_speech = wortmeldungen["text"].str.split(": ", n=1)
wortmeldungen["text"] = label_and_speech.str[1]
wortmeldungen.head()

Unnamed: 0,period,sessionNumber,date,speaker,text
0,XXVII,276,2024-09-18T00:00:00,88386,Meine sehr geehrten Damen und Herren Abgeordne...
1,XXVII,276,2024-09-18T00:00:00,88386,Meine sehr geehrten Damen und Herren auf der G...
2,XXVII,276,2024-09-18T00:00:00,88386,Der Herr Bundespräsident hat mit Entschließung...
3,XXVII,276,2024-09-18T00:00:00,88386,Die Amtlichen Protokolle der 272. und der 273....
4,XXVII,276,2024-09-18T00:00:00,88386,"Ich darf bekannt geben, dass von der Bundeswah..."


Dev-Modus: Probelauf auf einer Stichprobe von 1000 Texten

In [16]:
# Dev mode: continue with a random subset of at most 1,000 rows.
# random_state pins the draw so a re-run yields the same subset; min() avoids
# the ValueError that sample() raises when asked for more rows than exist.
wortmeldungen = wortmeldungen.sample(min(1_000, len(wortmeldungen)), random_state=42)

haiku kandidaten finden

In [17]:
import spacy
import re
from tqdm import tqdm

# Download the small German spaCy pipeline through the spaCy CLI. This is an
# IPython shell escape and runs unconditionally every time the cell executes.
# NOTE(review): `python` here is whatever is first on PATH — confirm it is the
# same environment as the notebook kernel.
!python -m spacy download de_core_news_sm

# Load the small German pipeline; `nlp` is used further down to split each
# text into sentences.
nlp = spacy.load("de_core_news_sm")  # or "xx_ent_wiki_sm" for multilingual

def filter_haiku_candidates(sentences, min_chars=30, max_chars=100):
    """Select sentences that are plausible haiku candidates.

    A sentence is kept when, after stripping surrounding whitespace, it
    - does not end with "!" or "?",
    - consists only of letters and spaces once commas and periods are removed,
    - has between ``min_chars`` and ``max_chars`` characters (inclusive),
      not counting spaces.

    Args:
        sentences: iterable of sentence strings.
        min_chars: minimum number of non-space characters (default 30).
        max_chars: maximum number of non-space characters (default 100).

    Returns:
        List of accepted sentences with commas and periods removed, in input
        order.
    """
    # Any Unicode letter (so ä, ö, ü, ß are accepted) or a plain space.
    # An ASCII-only class like [A-Za-z ] rejects nearly every German sentence.
    letters_and_spaces = r"(?:[^\W\d_]| )+"

    cleaned_sentences = []
    for s in sentences:
        s = s.strip()

        # Discard exclamations and questions
        if s.endswith(("!", "?")):
            continue

        # Remove commas and periods
        s_no_comma = s.replace(",", "").replace(".", "")

        # Anything besides letters and spaces (digits, dashes, quotes, ...)
        # disqualifies the sentence; this also rejects the empty string.
        if not re.fullmatch(letters_and_spaces, s_no_comma):
            continue

        # Length check on the characters without spaces
        char_count = len(s_no_comma.replace(" ", ""))
        if not min_chars <= char_count <= max_chars:
            continue

        cleaned_sentences.append(s_no_comma)

    return cleaned_sentences


# Run every text through the spaCy pipeline (missing texts become ""), split
# it into sentences and keep the sentences that pass the candidate filter.
texts = wortmeldungen["text"].fillna("").astype(str)
parsed_docs = tqdm(nlp.pipe(texts, batch_size=50), total=len(wortmeldungen))

haiku_candidates_list = [
    filter_haiku_candidates([sent.text for sent in parsed]) if False else
    filter_haiku_candidates([sent.text for sent in parsed.sents])
    for parsed in parsed_docs
]

wortmeldungen["haiku_candidates"] = haiku_candidates_list
wortmeldungen.head()

Collecting de-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


100%|██████████| 1000/1000 [00:47<00:00, 21.21it/s]


Unnamed: 0,period,sessionNumber,date,speaker,text,haiku_candidates
89856,XXV,14,2014-02-25T00:00:00,59908,Sehr geehrter Herr Präsident! Sehr geehrte Dam...,[]
148948,XXIII,28,2007-07-05T00:00:00,16238,Frau Ministerin! Sie haben jetzt eine ganze Re...,[]
142452,XXIII,58,2008-05-07T00:00:00,1817,Nächster Redner ist Herr Abgeordneter Eßl. 4 M...,[]
124792,XXIV,60,2010-04-21T00:00:00,2834,Als nächste Rednerin zu Wort gelangt Frau Abge...,[]
180121,XXII,35,2003-10-23T00:00:00,799,"Zusatzfrage? – Herr Abgeordneter Fasslabend, b...",[]


echte haikus finden

TODO: In den Schritt oben integrieren, weil dieser Schritt ohnehin sehr schnell läuft.

In [21]:
import pyphen

# German (de_DE) Pyphen hyphenation dictionary; count_syllables() below uses
# it to insert hyphens into a word.
dic = pyphen.Pyphen(lang="de_DE")


def count_syllables(word):
    """Estimate the syllable count of ``word``.

    The word is hyphenated with the module-level Pyphen dictionary ``dic``;
    the result is the number of "-" characters in the hyphenated form plus
    one.
    """
    hyphenated = dic.inserted(word)
    return 1 + hyphenated.count("-")


def split_haiku_lines(sentence):
    """Try to lay out ``sentence`` as a 5-7-5 haiku.

    Words are consumed left to right; each line must be filled to exactly its
    syllable target (per ``count_syllables``) at a word boundary.

    Returns:
        The sentence with " - " between the three lines when the words fit
        the 5-7-5 pattern exactly, otherwise None.
    """
    targets = (5, 7, 5)
    finished_lines = []  # completed lines, each already joined with spaces
    current_words = []  # words of the line currently being filled
    remaining = targets[0]  # syllables still missing on the current line

    for word in sentence.split():
        # All three lines are full but there is still a word left.
        if len(finished_lines) == len(targets):
            return None

        remaining -= count_syllables(word)
        if remaining < 0:
            # The word overshoots the line's syllable target.
            return None

        current_words.append(word)
        if remaining == 0:
            # Line filled exactly: close it and move on to the next target.
            finished_lines.append(" ".join(current_words))
            current_words = []
            if len(finished_lines) < len(targets):
                remaining = targets[len(finished_lines)]

    # Fewer than three complete lines means the sentence ran out too early.
    if len(finished_lines) != len(targets):
        return None
    return " - ".join(finished_lines)


def filter_and_split_haikus(haiku_candidates):
    """Return the 5-7-5 formatted version of every candidate that is a haiku.

    Candidates for which ``split_haiku_lines`` yields a falsy result are
    dropped; the order of the remaining ones is preserved.
    """
    haikus = []
    for candidate in haiku_candidates:
        formatted = split_haiku_lines(candidate)
        if formatted:
            haikus.append(formatted)
    return haikus


# For every row, reduce its candidate list to the sentences that really are
# 5-7-5 haikus and store them in a new column.
candidates_per_row = wortmeldungen["haiku_candidates"]
wortmeldungen["haiku_575"] = candidates_per_row.apply(filter_and_split_haikus)

TODO: Liste auf einzelne Haikus flatten

In [22]:
#todo

todo exportieren

In [23]:
#todo