In [1]:
import re
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Regex matching a single Hebrew nikud (vowel point / diacritic) character.
# Only the code points listed below are matched; cantillation marks
# (U+0591-U+05AF) and punctuation such as maqaf (U+05BE), paseq (U+05C0)
# and sof pasuq (U+05C3) are not part of the character class.
NIKUD_PATTERN = re.compile(
    '['
    '\u05B0'  # sheva
    '\u05B1'  # hataf segol
    '\u05B2'  # hataf patah
    '\u05B3'  # hataf qamats
    '\u05B4'  # hiriq
    '\u05B5'  # tsere
    '\u05B6'  # segol
    '\u05B7'  # patah
    '\u05B8'  # qamats
    '\u05B9'  # holam
    '\u05BA'  # holam haser for vav (previously missing from the class)
    '\u05BB'  # qubuts
    '\u05BC'  # dagesh or mapiq
    '\u05BD'  # meteg
    '\u05BF'  # rafe (rare)
    '\u05C1'  # shin dot
    '\u05C2'  # sin dot
    '\u05C7'  # qamats qatan
    ']'
)

def has_nikud(text: str) -> bool:
    """Tell whether ``text`` contains at least one character matched by NIKUD_PATTERN.

    Args:
        text: The string to inspect. ``None`` is accepted and treated as
            containing no nikud.

    Returns:
        True if NIKUD_PATTERN finds a match anywhere in ``text``; False for
        ``None`` or when nothing matches.
    """
    # Short-circuit on None so the regex is never handed a non-string.
    return text is not None and NIKUD_PATTERN.search(text) is not None


In [None]:
# Fetch the "train" split of the HeNLP/HeDC4 dataset.
ds = load_dataset(
    "HeNLP/HeDC4",
    split="train",
)

# Print the column names so the text field used for filtering can be confirmed.
print(ds.column_names)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Name of the column assumed to hold the Hebrew text (adjust if different).
TEXT_COLUMN = "text"

# Fail fast with a readable message if the assumed column is absent, rather
# than surfacing a KeyError from inside the filter call.
if TEXT_COLUMN not in ds.column_names:
    raise KeyError(
        f"Column {TEXT_COLUMN!r} not found in dataset; "
        f"available columns: {ds.column_names}"
    )

# Filter rows: keep only examples whose text contains at least one nikud mark.
# Batched mode hands the predicate a dict of column -> list of values and
# expects one boolean per row, avoiding a per-row dict for every example.
ds_with_nikud = ds.filter(
    lambda batch: [has_nikud(text) for text in batch[TEXT_COLUMN]],
    batched=True,
)

# Save the filtered rows to disk as CSV
ds_with_nikud.to_csv("HeDC4_with_nikud.csv", index=False)
# or Parquet
# ds_with_nikud.to_parquet("HeDC4_with_nikud.parquet")

print(f"Original size: {len(ds)}, Filtered size: {len(ds_with_nikud)}")