# Import Required Libraries
Import torch and the necessary classes from the transformers library.

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


# Load Tokenizer and Model
Load the AutoTokenizer and AutoModel for 'dicta-il/dictabert-large-char-menaked'. Set the model to evaluation mode.

In [2]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
    'dicta-il/dictabert-large-char-menaked'
)
model = AutoModel.from_pretrained(
    'dicta-il/dictabert-large-char-menaked',
    trust_remote_code=True  # required for custom model logic
)

model.eval()  # set model to evaluation mode

BertForDiacritization(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(1024, 1024, padding_idx=0)
      (position_embeddings): Embedding(2048, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), e

# Prepare Input Sentences
Define a list of Hebrew sentences to use as input for the model.

In [3]:
# Prepare your input sentences (can be a list)
sentences = [
    "בשנת 1948 השלים אפרים קישון את לימודיו בפיסול מתכת ובתולדות האמנות והחל לפרסם מאמרים הומוריסטיים",
    "שלום עולם"
]

# Run Inference Without Matres Lectionis
Use the model's predict method on the input sentences without marking matres lectionis. Store the outputs.

In [4]:
# Run inference without matres lectionis
output_without = model.predict(sentences, tokenizer)

# Run Inference With Matres Lectionis Marked
Use the model's predict method on the input sentences, marking matres lectionis with '*'. Store the outputs.

In [5]:
# Run inference with matres lectionis marked with '*'
output_with = model.predict(sentences, tokenizer, mark_matres_lectionis='*')

# Display Model Outputs
Print the outputs for both inference runs for comparison.

In [6]:
print("Without matres lectionis:")
for out in output_without:
    print(out)

print("\nWith matres lectionis (marked):")
for out in output_with:
    print(out)

Without matres lectionis:
בִּשְׁנַת 1948 הִשְׁלִים אֶפְרַיִם קִישׁוֹן אֶת לִמּוּדָיו בְּפִסּוּל מַתֶּכֶת וּבְתוֹלְדוֹת הָאׇמָּנוּת וְהֵחֵל לְפַרְסֵם מַאֲמָרִים הוּמוֹרִיסְטִיִּים
שְׁלוֹם עוֹלָם

With matres lectionis (marked):
בִּשְׁנַת 1948 הִשְׁלִים אֶפְרַיִם קִישׁוֹן אֶת לִי*מּוּדָיו בְּפִי*סּוּל מַתֶּכֶת וּבְתוֹלְדוֹת הָאׇמָּנוּת וְהֵחֵל לְפַרְסֵם מַאֲמָרִים הוּמוֹרִיסְטִיִּים
שְׁלוֹם עוֹלָם


In [None]:
import re

NIKUD_PATTERN = re.compile(
    '['
    '\u05B0'  # sheva
    '\u05B1'  # hataf segol
    '\u05B2'  # hataf patah
    '\u05B3'  # hataf qamats
    '\u05B4'  # hiriq
    '\u05B5'  # tsere
    '\u05B6'  # segol
    '\u05B7'  # patah
    '\u05B8'  # qamats
    '\u05B9'  # holam
    '\u05BB'  # qubuts
    '\u05BC'  # dagesh or mapiq
    '\u05BD'  # meteg
    '\u05BF'  # rafe (rare)
    '\u05C1'  # shin dot
    '\u05C2'  # sin dot
    '\u05C7'  # qamats qatan
    ']'
)
def get_nikud_mask(text: str):
    """
    Returns a mask aligned with tokenizer input_ids,
    where 1 means the corresponding token span in the *original text*
    contained at least one nikud mark.
    """
    # Step 1: Tokenize with offsets
    enc = tokenizer(text, return_offsets_mapping=True)
    offsets = enc["offset_mapping"]

    # Step 2: For each token, check substring of original text
    mask = []
    for (start, end) in offsets:
        if start == end:  # special tokens
            mask.append(0)
            continue
        substring = text[start:end+1]
        print(f"'{substring}'", NIKUD_PATTERN.search(substring))
        mask.append(1 if NIKUD_PATTERN.search(substring) else 0)

    return enc["input_ids"], mask

In [17]:
_, mask = get_nikud_mask(output_with[0])
print(mask)

'בּ' <re.Match object; span=(1, 2), match='ּ'>
'שׁ' <re.Match object; span=(1, 2), match='ׁ'>
'נַ' <re.Match object; span=(1, 2), match='ַ'>
'ת ' None
' 1' None
'19' None
'94' None
'48' None
'8 ' None
' ה' None
'הִ' <re.Match object; span=(1, 2), match='ִ'>
'שׁ' <re.Match object; span=(1, 2), match='ׁ'>
'לִ' <re.Match object; span=(1, 2), match='ִ'>
'ים' None
'ם ' None
' א' None
'אֶ' <re.Match object; span=(1, 2), match='ֶ'>
'פְ' <re.Match object; span=(1, 2), match='ְ'>
'רַ' <re.Match object; span=(1, 2), match='ַ'>
'יִ' <re.Match object; span=(1, 2), match='ִ'>
'ם ' None
' ק' None
'קִ' <re.Match object; span=(1, 2), match='ִ'>
'יש' None
'שׁ' <re.Match object; span=(1, 2), match='ׁ'>
'וֹ' <re.Match object; span=(1, 2), match='ֹ'>
'ן ' None
' א' None
'אֶ' <re.Match object; span=(1, 2), match='ֶ'>
'ת ' None
' ל' None
'לִ' <re.Match object; span=(1, 2), match='ִ'>
'י*' None
'*מ' None
'מּ' <re.Match object; span=(1, 2), match='ּ'>
'וּ' <re.Match object; span=(1, 2), match='ּ'>
'דָ' <re.Ma