In [9]:
from wiktionary_de_parser.dump_processor import WiktionaryDump


# Dump handler configured with "tmp" as its dump directory.
dump = WiktionaryDump(dump_dir_path="tmp")
# Fetch the dump.
# NOTE(review): not visible from here whether an already downloaded dump is
# reused or fetched again on every run — confirm before re-running this cell.
dump.download_dump()

In [2]:
from wiktionary_de_parser import WiktionaryParser

# NOTE(review): `parser` is not referenced by the cells visible here; it is
# kept in case another cell relies on it.
parser = WiktionaryParser()
pages = []

# Collect every page of the dump except redirect pages.
# For debugging, the loop can additionally be restricted to a single page by
# testing ``page.name`` (e.g. against "ordo").
for page in dump.pages():
    if not page.redirect_to:
        pages.append(page)

In [None]:
from tqdm.notebook import tqdm
from notebooks.process_chunk import process_chunk
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

from wiktionary_de_parser.models import MeaningDict


def chunks(lst, n):
    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.

    Args:
        lst: Sequence to split; it must support ``len()`` and slicing.
        n: Maximum number of chunks to produce. Expected to be a positive
            integer; ``0`` raises ``ZeroDivisionError``.

    Yields:
        Consecutive slices of ``lst``. Each chunk holds
        ``ceil(len(lst) / n)`` items except possibly the last one, which may
        be shorter. Fewer than ``n`` chunks are produced when there are not
        enough items, and nothing is yielded for an empty ``lst``.
    """
    total = len(lst)
    # Ceiling division without floats: -(-a // b) == ceil(a / b).
    chunk_size = -(-total // n)
    if chunk_size == 0:
        # Empty input: range() rejects a step of 0, so stop before using it.
        return
    for start in range(0, total, chunk_size):
        yield lst[start : start + chunk_size]


# Number of worker processes: all CPU cores but one (left free for the
# system), and never fewer than one.
n_cores = max(1, mp.cpu_count() - 1)

# Split the pages into at most n_cores chunks, i.e. at most one per worker.
page_chunks = list(chunks(pages, n_cores))

# Result mapping filled from the dictionaries returned by process_chunk.
# NOTE(review): presumably keyed by page name (a later cell looks up "ordo")
# — confirm against process_chunk.
all_lists: dict[str, list[list[MeaningDict]]] = {}
with ProcessPoolExecutor(max_workers=n_cores) as executor:
    # Process the chunks in parallel. executor.map yields the return values
    # in chunk order, so despite its name `futures` ends up holding result
    # dictionaries, not Future objects. The progress bar advances once per
    # finished chunk.
    futures = list(
        tqdm(
            executor.map(process_chunk, page_chunks),
            total=len(page_chunks),
            desc="Processing chunks",
        )
    )

    # Merge the per-chunk dictionaries; with dict.update a key that occurs in
    # several chunks keeps the value from the last chunk containing it.
    for result_dict in futures:
        all_lists.update(result_dict)


In [None]:
from wiktionary_de_parser.parser.parse_meanings import (
    format_meanings,
)


# Number of keys collected in all_lists.
print(len(all_lists))

# Spot check: formatted meanings stored under the key "ordo"
# (raises KeyError if that key is missing).
print(format_meanings(all_lists["ordo"]))

In [None]:
# Recursively compute how deeply the "sublist" entries of each meaning are nested.
from wiktionary_de_parser.models import MeaningDict
from wiktionary_de_parser.parser.parse_meanings import format_meaning_dict


def calculate_depth(list_item: MeaningDict):
    """Return how many levels of nested ``"sublist"`` entries lie below an item.

    Args:
        list_item: Mapping that may carry a ``"sublist"`` key holding further
            items of the same kind.

    Returns:
        ``0`` when ``"sublist"`` is missing or empty, otherwise one more than
        the greatest depth found among the sub-items.
    """
    children = list_item.get("sublist")
    if children:
        return 1 + max(calculate_depth(child) for child in children)
    return 0


# Find the greatest nesting depth over every meaning stored in all_lists and,
# along the way, print each meaning whose depth is exactly 3.
max_depth = 0
for word, lists in all_lists.items():
    for m_dict_list in lists:
        for m_dict in m_dict_list:
            depth = calculate_depth(m_dict)
            max_depth = max(max_depth, depth)

            if depth != 3:
                continue
            print(f"word: {word}")
            print(format_meaning_dict(m_dict))
            print("\n")

print(max_depth)

In [25]:
import ipywidgets as widgets
from IPython.display import display, clear_output


def create_searchable_word_viewer(
    word_dict: dict[str, list[list[MeaningDict]]],
):
    """Display an interactive widget for browsing the meanings in ``word_dict``.

    The viewer consists of a statistics button with its own output area, a
    text box for case-insensitive substring search, a slider that filters by
    the exact total number of meanings (0 disables that filter), a selection
    list with the matching words, and an output area that shows the formatted
    meanings of the selected word.

    Args:
        word_dict: Maps each word to a list of meaning lists. A word's number
            of meanings is the summed length of its inner lists. The counts
            are taken once, when the viewer is created.

    Returns:
        None. The widgets are rendered with ``display`` as a side effect.
    """
    import html

    from IPython.display import HTML

    # Count each word's meanings once so that the filter, which runs on every
    # change of the search box or slider, does not re-sum them for all words.
    meaning_counts = {
        word: sum(len(entry_meanings) for entry_meanings in meanings_list)
        for word, meanings_list in word_dict.items()
    }

    # Statistics button and its output area
    stats_button = widgets.Button(
        description="Statistiken anzeigen",
        style={"description_width": "initial"},
    )

    stats_output = widgets.Output(
        layout={
            "border": "1px solid black",
            "padding": "10px",
            "margin": "10px 0",
        }
    )

    def show_stats(b):
        """Print summary statistics about ``word_dict`` into ``stats_output``."""
        with stats_output:
            clear_output()
            # Basic statistics
            total_words = len(meaning_counts)
            avg_meanings = (
                sum(meaning_counts.values()) / total_words
                if total_words > 0
                else 0
            )
            # default=0 keeps an empty dictionary from raising ValueError.
            max_meanings = max(meaning_counts.values(), default=0)

            print("Datensatz Statistiken:")
            print(f"• Anzahl Wörter: {total_words}")
            print(
                f"• Durchschnittliche Bedeutungen pro Wort: {avg_meanings:.2f}"
            )
            print(f"• Maximum Bedeutungen pro Wort: {max_meanings}")

            # Words with the most meanings; the sort is stable, so ties keep
            # the order in which the words appear in word_dict.
            print("\nTop 5 Wörter mit meisten Bedeutungen:")
            top_words = sorted(
                meaning_counts.items(),
                key=lambda item: item[1],
                reverse=True,
            )[:5]
            for word, total in top_words:
                print(f"• {word}: {total} Bedeutungen")

    stats_button.on_click(show_stats)

    # Search box
    search_box = widgets.Text(
        description="Suche:",
        placeholder="Tippen Sie ein Wort...",
        style={"description_width": "initial"},
    )

    # Single-value slider instead of a range slider
    meanings_slider = widgets.IntSlider(
        value=0,
        min=0,
        max=60,
        step=1,
        description="Exakte Anzahl:",
        style={"description_width": "initial"},
        continuous_update=False,
    )

    # Selection list for the filtered words
    word_list = sorted(word_dict.keys())
    word_dropdown = widgets.Select(
        options=word_list,
        rows=10,
        description="Wörter:",
        style={"description_width": "initial"},
        layout={"width": "500px", "height": "200px"},  # set width explicitly
    )

    output = widgets.Output(
        layout={
            "border": "1px solid black",
            "padding": "10px",
            "max_height": "400px",
            "max_width": "100%",
            "overflow": "auto",
            "white-space": "nowrap",
        }
    )

    def filter_words():
        """Apply the search term and the slider value to the selection list."""
        search_term = search_box.value.lower()
        exact_meanings = meanings_slider.value

        filtered_words = [
            w
            for w in word_list
            if search_term in w.lower()
            and (exact_meanings == 0 or meaning_counts[w] == exact_meanings)
        ]
        word_dropdown.options = filtered_words

        # Only refresh the match count when no word is selected
        with output:
            if not word_dropdown.value:
                clear_output()
                print(f"Gefundene Wörter: {len(filtered_words)}")

    def on_word_select(change):
        """Show the formatted meanings of the newly selected word."""
        with output:
            clear_output()
            print(f"Gefundene Wörter: {len(word_dropdown.options)}")
            if change.new:  # Important: change.new instead of change["new"]
                print("\nBedeutungen:")
                # Escape the text so that "<", ">" and "&" are shown literally
                # instead of being interpreted as markup.
                # NOTE(review): assumes format_meanings returns plain text,
                # as its use with print() elsewhere suggests — confirm.
                meanings_text = html.escape(
                    format_meanings(word_dict[change.new])
                )
                meanings_html = (
                    '<div style="white-space: pre; font-family: monospace;">'
                    f"{meanings_text}</div>"
                )
                display(HTML(meanings_html))

    # Register the event handlers
    search_box.observe(lambda _: filter_words(), names="value")
    meanings_slider.observe(lambda _: filter_words(), names="value")
    word_dropdown.observe(on_word_select, names="value")

    # Assemble the layout, including the stats button and its output
    display(
        widgets.VBox(
            [
                stats_button,
                stats_output,
                search_box,
                meanings_slider,
                word_dropdown,
                output,
            ],
            layout=widgets.Layout(width="800px"),
        )
    )

    # Populate the view initially
    filter_words()


# Create the viewer
create_searchable_word_viewer(all_lists)


VBox(children=(Button(description='Statistiken anzeigen', style=ButtonStyle()), Output(layout=Layout(border_bo…