In [1]:
from wiktionary_de_parser.dump_processor import WiktionaryDump


# Create the dump helper rooted at the local "tmp" directory and fetch the dump.
# NOTE(review): whether download_dump() skips an already downloaded file is not
# visible here — confirm before relying on "Restart & Run All" being cheap.
dump = WiktionaryDump(dump_dir_path="tmp")
dump.download_dump()

In [2]:
from wiktionary_de_parser import WiktionaryParser

# NOTE(review): this parser instance is not used in this cell; a later cell
# creates its own WiktionaryParser before parsing.
parser = WiktionaryParser()
# Materialise every page of the dump whose ``redirect_to`` is falsy, i.e. skip
# redirect pages. The resulting list is reused by the cells below.
pages = [page for page in dump.pages() if not page.redirect_to]

In [17]:
from wiktionary_de_parser import WiktionaryParser
from wiktionary_de_parser.parser.parse_meanings import format_meanings

parser = WiktionaryParser()
entries = []  # NOTE(review): not read by any visible cell; kept in case other cells rely on it

# Name of the single page to spot-check. Other pages previously used for
# spot checks: "almen", "Waschbär".
DEBUG_PAGE_NAME = "koksen"

# Only the first page carrying that name is inspected.
page = next((p for p in pages if p.name == DEBUG_PAGE_NAME), None)

if page is not None:
    for entry in parser.entries_from_page(page):
        entry_parsed = parser.parse_entry(entry, include_meanings=True)
        if not entry_parsed.meanings:
            # Entries without parsed meanings have nothing to show.
            continue

        print(page.name)
        # format_meanings is given a list of per-entry meaning lists, so the
        # single entry's meanings are wrapped in a one-element list.
        print(format_meanings([entry_parsed.meanings]))


koksen
• <Jargon> [umgangssprachlich] Kokain konsumieren
• [umgangssprachlich, scherzhaft] schlafen


In [18]:
from tqdm.notebook import tqdm
from notebooks.process_chunk import process_chunk
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

from wiktionary_de_parser.models import MeaningDict


def chunks(lst, n):
    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.

    The chunk size is ``ceil(len(lst) / n)``. Every chunk except possibly the
    last one has that length, so fewer than ``n`` chunks are produced when the
    list does not divide that way (e.g. 6 items with ``n=4`` give three chunks
    of two items).

    Args:
        lst: Sequence supporting ``len()`` and slicing.
        n: Desired maximum number of chunks; must be at least 1.

    Yields:
        Consecutive slices of ``lst`` in their original order. Nothing is
        yielded for an empty ``lst``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if len(lst) == 0:
        # A chunk size of 0 would make range() fail ("arg 3 must not be zero").
        return
    chunk_size = -(-len(lst) // n)  # ceiling division
    for start in range(0, len(lst), chunk_size):
        yield lst[start : start + chunk_size]


# Get number of CPU cores (leave one free for system); never fewer than one worker.
n_cores = max(1, mp.cpu_count() - 1)

# Split pages into chunks — one chunk per worker at most (chunks() sizes them
# by ceiling division, so there can be fewer chunks than workers).
page_chunks = list(chunks(pages, n_cores))

# Accumulator for the merged worker results.
# NOTE(review): presumably keyed by page name with one meanings list per entry —
# confirm against process_chunk.
all_lists: dict[str, list[list[MeaningDict]]] = {}
with ProcessPoolExecutor(max_workers=n_cores) as executor:
    # Process chunks in parallel with progress bar.
    # Despite its name, ``futures`` holds the values returned by process_chunk
    # (executor.map yields results, in input order), not Future objects.
    futures = list(
        tqdm(
            executor.map(process_chunk, page_chunks),
            total=len(page_chunks),
            desc="Processing chunks",
        )
    )

    # Merge results; on duplicate keys the later chunk overwrites the earlier one.
    for result_dict in futures:
        all_lists.update(result_dict)


Processing chunks:   0%|          | 0/27 [00:00<?, ?it/s]

In [19]:
from wiktionary_de_parser.parser.parse_meanings import format_meanings


# Sanity check: number of keys collected in all_lists.
print(len(all_lists))

# Spot-check the formatted meanings stored under the key "ASA".
print(format_meanings(all_lists["ASA"]))

265290
• IOC-Ländercode von Amerikanisch-Samoa

Bedeutungen für den nächsten Eintrag:
• Advertising Standards Authority
• of a scale of film speeds American Standards Association
• <Kunststofftechnik> Abkürzung für Acrylnitril-Styrol-Acrylat


In [20]:
# recursively calculate the depth of the wiki_list.sublist
from wiktionary_de_parser.models import MeaningDict
from wiktionary_de_parser.parser.parse_meanings import format_meaning_dict


def calculate_depth(list_item: MeaningDict):
    """Return how many levels of ``sublist`` nesting hang below ``list_item``.

    An item whose ``"sublist"`` is missing or empty has depth 0; otherwise the
    depth is one more than the deepest item found in its ``"sublist"``.
    """
    children = list_item.get("sublist")
    if not children:
        return 0

    deepest_child = 0
    for child in children:
        deepest_child = max(deepest_child, calculate_depth(child))
    return deepest_child + 1


# Find the deepest sublist nesting over every meaning in all_lists and, along
# the way, print each meaning whose nesting depth is exactly 3.
max_depth = 0
for word, lists in all_lists.items():
    for m_dict_list in lists:
        for m_dict in m_dict_list:
            depth = calculate_depth(m_dict)
            max_depth = max(max_depth, depth)

            if depth != 3:
                continue
            print(f"word: {word}")
            print(format_meaning_dict(m_dict))
            print("\n")

print(max_depth)

word: condicional
• Der Kondicional der katalanischen Sprache drückt die Möglichkeit von Tatsachen oder Aktionen aus, die implizit oder explizit von Umständen in der Vergangenheit abhängig sind. Er wird in folgenden Situationen verwendet:
  • Zum Ausdruck der Wahrscheinlichkeit des Eintretens eines Ereignisses in der Zukunft:
    • Si tinguessim més temps, acabaríem el projecte.
      • Mit mehr Zeit würden wir das Projekt erfolgreich beenden.
  • Als Ausdruck einer höflichen Bitte (condicional de cortesia):
    • Tindries la bondat d’ / Podries ajudar-me en aquestes negociacions ?
      • Wärest du so freundlich, mich bei diesen Verhandlungen zu unterstützen?
  • Bei einer kaschierten oder vorsichtig formulierten Aufforderung:
    • Podríeu treballar més acuradament!
      • Ihr könntet sorgfältiger arbeiten!
  • Bei einem höflich, zurückhaltend formulierten Wunsch:
    • M’agradaria fer un creuer.
      • Ich möchte gern einmal eine Kreuzfahrt machen.
  • In höflich, zurückhaltend fo

In [21]:
import ipywidgets as widgets
from IPython.display import HTML
from IPython.display import display, clear_output


def create_searchable_word_viewer(
    word_dict: dict[str, list[list[MeaningDict]]],
):
    """Display an interactive browser for the meanings stored in ``word_dict``.

    The widget offers a statistics button, a substring search box, a slider
    that filters by the exact number of meanings (0 disables that filter), a
    list of matching words and an output area showing the formatted meanings
    of the selected word.

    Args:
        word_dict: Maps a word to a list of entries, each entry being a list
            of meaning dicts. Words and meaning counts are read once when the
            viewer is created.

    Returns:
        None. The widget is shown via ``display``.
    """
    # Count the meanings per word once (summed over all entries of the word).
    # The statistics and the slider filter both need these numbers, and
    # recomputing them for every word on every keystroke is wasteful.
    meaning_counts = {
        word: sum(len(entry_meanings) for entry_meanings in meanings_list)
        for word, meanings_list in word_dict.items()
    }
    # default=0 keeps the viewer usable for an empty dictionary.
    highest_count = max(meaning_counts.values(), default=0)

    # Statistics button and its output area
    stats_button = widgets.Button(
        description="Statistiken anzeigen",
        style={"description_width": "initial"},
    )

    stats_output = widgets.Output(
        layout={
            "border": "1px solid black",
            "padding": "10px",
            "margin": "10px 0",
        }
    )

    def show_stats(b):
        """Print summary statistics; ``b`` is the clicked button (unused)."""
        with stats_output:
            clear_output()
            # Basic statistics
            total_words = len(meaning_counts)
            avg_meanings = (
                sum(meaning_counts.values()) / total_words
                if total_words > 0
                else 0
            )

            print("Datensatz Statistiken:")
            print(f"• Anzahl Wörter: {total_words}")
            print(
                f"• Durchschnittliche Bedeutungen pro Wort: {avg_meanings:.2f}"
            )
            print(f"• Maximum Bedeutungen pro Wort: {highest_count}")

            # Words with the most meanings; the stable sort keeps dictionary
            # order among words with equal counts.
            print("\nTop 5 Wörter mit meisten Bedeutungen:")
            top_words = sorted(
                meaning_counts.items(),
                key=lambda item: item[1],
                reverse=True,
            )[:5]
            for word, total in top_words:
                print(f"• {word}: {total} Bedeutungen")

    stats_button.on_click(show_stats)

    # Search box
    search_box = widgets.Text(
        description="Suche:",
        placeholder="Tippen Sie ein Wort...",
        style={"description_width": "initial"},
    )

    # Single-value slider (not a range slider). The upper bound is at least 60
    # and grows with the data so that every occurring count can be selected.
    meanings_slider = widgets.IntSlider(
        value=0,
        min=0,
        max=max(60, highest_count),
        step=1,
        description="Exakte Anzahl:",
        style={"description_width": "initial"},
        continuous_update=False,
    )

    # Selection list for the filtered words
    word_list = sorted(meaning_counts)
    word_dropdown = widgets.Select(
        options=word_list,
        rows=10,
        description="Wörter:",
        style={"description_width": "initial"},
        layout={"width": "500px", "height": "200px"},  # set the width explicitly
    )

    output = widgets.Output(
        layout={
            "border": "1px solid black",
            "padding": "10px",
            "max_height": "400px",
            "max_width": "100%",
            "overflow": "auto",
            "white-space": "nowrap",
        }
    )

    def filter_words():
        """Apply the search term and the exact-count filter to the word list."""
        search_term = search_box.value.lower()
        exact_meanings = meanings_slider.value

        filtered_words = [
            w
            for w in word_list
            if search_term in w.lower()
            and (exact_meanings == 0 or meaning_counts[w] == exact_meanings)
        ]
        word_dropdown.options = filtered_words

        # Only refresh the hit count when no word is selected; otherwise the
        # selection handler has already rendered the output.
        with output:
            if not word_dropdown.value:
                clear_output()
                print(f"Gefundene Wörter: {len(filtered_words)}")

    def on_word_select(change):
        """Show the hit count and the meanings of the newly selected word."""
        with output:
            clear_output()
            print(f"Gefundene Wörter: {len(word_dropdown.options)}")
            if change.new:  # attribute access (change.new) instead of change["new"]
                print("\nBedeutungen:")
                print(format_meanings(word_dict[change.new]))

    # Register the event handlers
    search_box.observe(lambda _: filter_words(), names="value")
    meanings_slider.observe(lambda _: filter_words(), names="value")
    word_dropdown.observe(on_word_select, names="value")

    # Layout, including the statistics button and its output
    display(
        widgets.VBox(
            [
                stats_button,
                stats_output,
                search_box,
                meanings_slider,
                word_dropdown,
                output,
            ],
            layout=widgets.Layout(width="800px"),
        )
    )

    # Initial rendering
    filter_words()


# Create the viewer for all collected meanings (the function displays it itself)
create_searchable_word_viewer(all_lists)


VBox(children=(Button(description='Statistiken anzeigen', style=ButtonStyle()), Output(layout=Layout(border_bo…

In [22]:
from collections import Counter
from itertools import islice


def collect_all_tags(word_dict):
    """Count every tag and raw tag occurring anywhere in ``word_dict``.

    Each meaning is visited together with all meanings nested (at any depth)
    under its ``"sublist"`` key; every element of a meaning's ``"tags"`` and
    ``"raw_tags"`` value adds one to the respective counter.

    Returns:
        A ``(tags, raw_tags)`` tuple of ``collections.Counter`` objects.
    """
    tag_counts = Counter()
    raw_tag_counts = Counter()

    def walk(meanings):
        # Pre-order traversal: a meaning is yielded before its sub-meanings.
        for node in meanings:
            yield node
            if "sublist" in node:
                yield from walk(node["sublist"])

    for entries in word_dict.values():
        for entry in entries:
            for node in walk(entry):
                if "tags" in node:
                    for name in node["tags"]:
                        tag_counts[name] += 1
                if "raw_tags" in node:
                    for name in node["raw_tags"]:
                        raw_tag_counts[name] += 1

    return tag_counts, raw_tag_counts


def find_words_with_tag(word_dict, tag, tag_type="tags"):
    """Return the words that carry ``tag`` in at least one of their meanings.

    A word matches when any meaning of any of its entries — or any meaning
    nested under a ``"sublist"`` key, at any depth — lists ``tag`` under the
    ``tag_type`` key. Each matching word is returned exactly once, even when
    several of its entries carry the tag.

    Args:
        word_dict: Maps a word to a list of entries, each a list of meaning
            dicts.
        tag: The tag to look for.
        tag_type: Key to inspect on each meaning (``"tags"`` or ``"raw_tags"``).

    Returns:
        List of matching words in the iteration order of ``word_dict``.
    """

    def has_tag(meaning):
        # ``or ()`` tolerates keys that are missing or explicitly set to None.
        if tag in (meaning.get(tag_type) or ()):
            return True
        return any(has_tag(sub) for sub in (meaning.get("sublist") or ()))

    return [
        word
        for word, meanings_list in word_dict.items()
        if any(
            has_tag(meaning)
            for entry_meanings in meanings_list
            for meaning in entry_meanings
        )
    ]


def create_tag_viewer(word_dict):
    """Build a widget for browsing tag frequencies and the words using a tag.

    The widget shows overall statistics, lets the user switch between
    standard and raw tags, filter tags by substring and page through the
    result (100 tags per page, most frequent first). Selecting a tag in the
    list shows up to 100 words that carry it.

    The tag list is a ``Select`` widget: an ``Output`` widget emits no click
    events, so observing its ``outputs`` trait can never report a clicked
    line. Each option stores the tag itself as its value, which also avoids
    parsing the tag back out of the displayed label.

    Args:
        word_dict: Maps a word to a list of entries, each a list of meaning
            dicts.

    Returns:
        A ``widgets.VBox`` containing the complete viewer.
    """
    all_tags, all_raw_tags = collect_all_tags(word_dict)
    page_size = 100

    # Main widgets
    stats_output = widgets.Output()
    tag_type_selector = widgets.RadioButtons(
        options=["Standard Tags", "Raw Tags"],
        description="Tag-Typ:",
        layout={"width": "max-content"},
    )
    search_box = widgets.Text(
        placeholder="Tag suchen...",
        description="Suche:",
        layout={"width": "400px"},
    )
    page_selector = widgets.IntText(
        value=1, description="Seite:", layout={"width": "200px"}
    )
    results_output = widgets.Output()
    tag_select = widgets.Select(
        options=[],
        layout={"width": "100%", "height": "400px"},
    )
    words_output = widgets.Output(
        layout={"height": "200px", "overflow": "auto"}
    )

    # True while update_display() replaces the option list. Assigning new
    # options makes the Select pick its first entry, which must not be
    # mistaken for a user selection (the word lookup scans the whole dict).
    refreshing = False

    def use_raw_tags():
        """Tell whether the raw-tag counter is currently selected."""
        return tag_type_selector.value == "Raw Tags"

    def update_stats():
        """Print the number of distinct tags and the ten most frequent ones."""
        with stats_output:
            clear_output()
            current_tags = all_raw_tags if use_raw_tags() else all_tags
            print(f"Gesamtanzahl verschiedener Tags: {len(current_tags)}")
            print("Top 10 häufigste Tags:")
            for tag, count in current_tags.most_common(10):
                print(f"  • {tag}: {count}x")

    def show_matching_words(tag):
        """List up to 100 words (alphabetically) that carry ``tag``."""
        with words_output:
            clear_output()
            tag_type = "raw_tags" if use_raw_tags() else "tags"
            matching = find_words_with_tag(word_dict, tag, tag_type)
            print(f"Wörter mit Tag '{tag}' (erste 100):")
            # set() guards against a word being reported more than once.
            for word in sorted(set(matching))[:100]:
                print(f"  • {word}")

    def update_display(*args):
        """Refresh the tag list for the current type, search term and page."""
        nonlocal refreshing
        current_tags = all_raw_tags if use_raw_tags() else all_tags
        search_term = search_box.value.lower()

        # Sort by count (descending) and then alphabetically
        filtered_tags = sorted(
            (
                (tag, count)
                for tag, count in current_tags.items()
                if search_term in tag.lower()
            ),
            key=lambda item: (-item[1], item[0]),
        )

        # At least one (possibly empty) page; out-of-range requests are
        # clamped to the nearest existing page.
        total_pages = max(1, -(-len(filtered_tags) // page_size))
        page = min(max(1, page_selector.value), total_pages)
        start_idx = (page - 1) * page_size
        page_tags = filtered_tags[start_idx : start_idx + page_size]

        refreshing = True
        try:
            tag_select.options = [
                (f"{tag}: {count}x", tag) for tag, count in page_tags
            ]
            tag_select.index = None  # start without a selection
        finally:
            refreshing = False

        with results_output:
            clear_output()
            print(
                f"Gefundene Tags: {len(filtered_tags)} (Seite {page} von {total_pages})"
            )
            if not page_tags:
                print("Keine Tags gefunden.")

    def on_tag_select(change):
        """Show the words for the tag the user selected in the list."""
        if refreshing or change.new is None:
            return
        show_matching_words(change.new)

    def on_tag_type_change(change):
        """Re-render statistics and the tag list after a tag-type switch."""
        update_stats()
        update_display()

    # Register the event handlers
    tag_type_selector.observe(on_tag_type_change, "value")
    search_box.observe(update_display, "value")
    page_selector.observe(update_display, "value")
    tag_select.observe(on_tag_select, "value")

    # Initial rendering
    update_stats()
    update_display()

    return widgets.VBox(
        [
            stats_output,
            tag_type_selector,
            widgets.HBox([search_box, page_selector]),
            widgets.Label(
                "Klicken Sie auf einen Tag, um die zugehörigen Wörter zu sehen:"
            ),
            results_output,
            tag_select,
            words_output,
        ]
    )


# Create the tag viewer and display the widget it returns
display(create_tag_viewer(all_lists))


VBox(children=(Output(), RadioButtons(description='Tag-Typ:', layout=Layout(width='max-content'), options=('St…

In [23]:
# Collect all tags and raw_tags
all_tags, all_raw_tags = collect_all_tags(all_lists)

# Write each counter to its own text file, one "tag: count" line per tag,
# most frequent first. The encoding is fixed to UTF-8 because the tags contain
# non-ASCII characters and the platform default encoding may not be able to
# represent them.
for path, counter in (
    ("tmp/tags.txt", all_tags),
    ("tmp/raw_tags.txt", all_raw_tags),
):
    with open(path, "w", encoding="utf-8") as f:
        # most_common() orders by descending count and keeps first-seen order
        # among equal counts.
        for tag, count in counter.most_common():
            f.write(f"{tag}: {count}\n")

In [24]:
from IPython.display import display
from wiktionary_de_parser.parser.parse_meanings import format_meanings


def show_tag_contents(
    tag_type: str, all_lists: dict[str, list[list[MeaningDict]]]
):
    """Display a paged browser of the words that carry each tag.

    Tags are collected from every meaning and from all meanings nested under
    ``"sublist"`` at any depth. Only tags used by more than one word are
    offered, ordered by the number of words. The selected tag's words are
    shown alphabetically, five per page, together with their formatted
    meanings.

    Args:
        tag_type: Key to read on each meaning (``"tags"`` or ``"raw_tags"``).
        all_lists: Maps a word to a list of entries, each a list of meaning
            dicts.

    Returns:
        None. The widget is shown via ``display``.
    """
    # Collect the set of words for every tag
    word_by_tag = {}

    def collect(meaning, word):
        # ``or ()`` tolerates keys that are missing or explicitly set to None.
        for tag in meaning.get(tag_type) or ():
            word_by_tag.setdefault(tag, set()).add(word)
        # Descend through sublists of any depth
        for sub_meaning in meaning.get("sublist") or ():
            collect(sub_meaning, word)

    for word, lists in all_lists.items():
        for meanings_list in lists:
            for meaning in meanings_list:
                collect(meaning, word)

    # Create dropdown for tag selection
    sorted_tags = sorted(
        [(tag, words) for tag, words in word_by_tag.items() if len(words) > 1],
        key=lambda x: len(x[1]),
        reverse=True,
    )
    # (label, value) pairs: the label shows the word count, the value is the
    # tag itself, so the tag never has to be parsed back out of the label
    # (which fails for tags that contain " (").
    tag_options = [(f"{tag} ({len(words)})", tag) for tag, words in sorted_tags]

    tag_dropdown = widgets.Dropdown(
        options=tag_options,
        description="Tag:",
        style={"description_width": "initial"},
    )

    # Page navigation. IntText has no lower bound of its own, so the page
    # number is clamped in show_page().
    page_size = 5
    page_num = widgets.IntText(value=1, description="Page:")
    output = widgets.Output()

    def show_page(*args):
        """Render the current page of words for the selected tag."""
        output.clear_output()
        with output:
            selected_tag = tag_dropdown.value
            if selected_tag is None:
                print("No tag selected.")
                return

            # Sorted so that paging is stable and reproducible.
            words = sorted(word_by_tag[selected_tag])
            max_pages = (len(words) + page_size - 1) // page_size

            # Clamp before slicing. Writing the corrected value back fires
            # the observer registered below, which renders the page, so this
            # call must not render a second time.
            page = min(max(1, page_num.value), max_pages)
            if page != page_num.value:
                page_num.value = page
                return

            start_idx = (page - 1) * page_size
            page_words = words[start_idx : start_idx + page_size]

            # Show documents
            print(f"Showing page {page} of {max_pages}\n")
            for word in page_words:
                print(word)
                if word in all_lists:
                    print(format_meanings(all_lists[word]))
                print("-" * 80)

    # Connect events
    tag_dropdown.observe(show_page, "value")
    page_num.observe(show_page, "value")

    # Layout
    controls = widgets.HBox([tag_dropdown, page_num])
    display(widgets.VBox([controls, output]))

    # Initial display
    show_page()


# Create tabs for tags and raw_tags
tag_tab = widgets.Tab()
tag_output1 = widgets.Output()
tag_output2 = widgets.Output()

# show_tag_contents() calls display() itself; running it inside an Output
# context captures the widget into the respective tab page.
with tag_output1:
    show_tag_contents("tags", all_lists)
with tag_output2:
    show_tag_contents("raw_tags", all_lists)

tag_tab.children = [tag_output1, tag_output2]
# The titles show the total number of distinct tags counted by
# collect_all_tags(); the dropdown inside each tab only offers tags used by
# more than one word, so it can list fewer.
tag_tab.set_title(0, f"Tags ({len(all_tags)})")
tag_tab.set_title(1, f"Raw Tags ({len(all_raw_tags)})")

display(tag_tab)


Tab(children=(Output(), Output()), selected_index=0, titles=('Tags (5844)', 'Raw Tags (15386)'))