# Import Required Libraries
Import torch and the necessary classes from the transformers library.

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


# Load Tokenizer and Model
Load the AutoTokenizer and AutoModel for 'dicta-il/dictabert-large-char-menaked'. Set the model to evaluation mode.

In [3]:
model_name = "dicta-il/dictabert-large-char-menaked"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.eval()

BertForDiacritization(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(1024, 1024, padding_idx=0)
      (position_embeddings): Embedding(2048, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), e

In [4]:
def predict_with_probs(sentences, tokenizer, model, mark_matres_lectionis=None, top_k=5):
    # Use model.predict for decoded output
    decoded = model.predict(sentences, tokenizer, mark_matres_lectionis=mark_matres_lectionis)
    
    # Manual forward pass
    inputs = tokenizer(sentences, padding='longest', truncation=True,
                       return_tensors='pt', return_offsets_mapping=True)
    offset_mapping = inputs.pop('offset_mapping')
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.forward(**inputs, return_dict=True)
    logits = outputs.logits  # MenakedLogitsOutput
    nikud_logits = logits.nikud_logits  # [batch, seq_len, num_nikud]
    shin_logits = logits.shin_logits    # [batch, seq_len, num_shin]
    
    results = []
    for sent_idx, (sentence, offsets) in enumerate(zip(sentences, offset_mapping)):
        # For each character token, collect top-k probabilities
        sent_data = {'decoded': decoded[sent_idx], 'chars': []}
        probs = torch.softmax(nikud_logits[sent_idx], dim=-1)  # probabilities
        
        for i, (start, end) in enumerate(offsets):
            if end - start != 1:
                continue
            char = sentence[start:end]
            dist = probs[i]
            top_p, top_ids = torch.topk(dist, top_k)
            sent_data['chars'].append({
                'char': char,
                'predictions': {
                    model.config.nikud_classes[label_id.item()]: float(p.item())
                    for p, label_id in zip(top_p, top_ids)
                }
            })
        results.append(sent_data)
    return results


In [5]:
res = predict_with_probs(["ליירה"], tokenizer, model, top_k=3)
import pprint; pprint.pprint(res[0])

{'chars': [{'char': 'ל',
            'predictions': {'ֵ': 0.30501502752304077,
                            'ֶ': 0.4193100929260254,
                            'ַ': 0.14278148114681244}},
           {'char': 'י',
            'predictions': {'': 0.6885529160499573,
                            '<MAT_LECT>': 0.05325696989893913,
                            'ְ': 0.15395373106002808}},
           {'char': 'י',
            'predictions': {'': 0.13126079738140106,
                            '<MAT_LECT>': 0.8599182367324829,
                            'ְ': 0.003723365720361471}},
           {'char': 'ר',
            'predictions': {'': 0.05877650901675224,
                            'ֶ': 0.11981965601444244,
                            'ָ': 0.8134917616844177}},
           {'char': 'ה',
            'predictions': {'': 0.9990084767341614,
                            'ָ': 1.9796716514974833e-05,
                            'ּ': 0.0008896052022464573}}],
 'decoded': 'לֶירָה'}


In [16]:
from math import log2

def nikud_uncertainty(text, model, tokenizer, 
                      top_k=5, 
                      entropy_threshold=1.0, 
                      margin_threshold=0.2, 
                      maxprob_threshold=0.7):
    """
    Identify ambiguous characters in the text according to nikud predictions.
    """
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    offsets = inputs.pop("offset_mapping")[0]

    with torch.no_grad():
        outputs = model(**inputs)   # MenakedOutput
        probs = torch.softmax(outputs.logits.nikud_logits[0], dim=-1)  # shape [seq_len, num_nikud_classes]

    id2label = model.config.nikud_classes  # list of all nikud symbols

    ambiguous = []
    for i, (start, end) in enumerate(offsets):
        if end - start != 1:  # skip special tokens / padding
            continue
        char = text[start:end]
        dist = probs[i]

        # Sort probabilities
        sorted_probs, sorted_ids = torch.sort(dist, descending=True)
        p1, p2 = sorted_probs[0].item(), sorted_probs[1].item()

        # --- criteria ---
        entropy = -sum(p.item() * log2(p.item()) for p in dist if p.item() > 0)
        margin = p1 - p2
        max_prob = p1

        # decide ambiguity
        is_ambig = (entropy > entropy_threshold) or (margin < margin_threshold) or (max_prob < maxprob_threshold)

        if is_ambig:
            ambiguous.append({
                "char": char,
                "position": (start, end),
                "entropy": entropy,
                "margin": margin,
                "max_prob": max_prob,
                "top_candidates": [
                    (id2label[sorted_ids[j].item()], sorted_probs[j].item())
                    for j in range(min(top_k, len(sorted_ids)))
                ]
            })
    return ambiguous

In [20]:
# Example usage
ambig = nikud_uncertainty("שלום עולם", model, tokenizer)
for a in ambig:
    print(a)
    print(a["position"][0])

{'char': ' ', 'position': (tensor(4), tensor(5)), 'entropy': 2.520866388614735, 'margin': 0.10133625566959381, 'max_prob': 0.3462553322315216, 'top_candidates': [('ָ', 0.3462553322315216), ('ֵ', 0.2449190765619278), ('', 0.2309339940547943), ('ִ', 0.0470949187874794), ('<MAT_LECT>', 0.0413050502538681)]}
tensor(4)


In [22]:
%pip install matplotlib seaborn

Collecting matplotlib
  Downloading matplotlib-3.10.6-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.2-cp310-cp310-win_amd64.whl.metadata (111 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.3.0-cp310-cp310-win_amd64.whl.metadata (9.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.6-cp310-cp310-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:-

In [46]:
from IPython.display import HTML, display
import matplotlib
import math

def colorize_text_by_certainty(text, model, tokenizer,
                               certainty_metric="max_prob",
                               scale="linear",
                               low_conf=0.6, high_conf=0.95):
    """
    Display text with characters color-coded by certainty.
    Uses a bright palette (yellow → orange → red) for better contrast on dark backgrounds.
    """
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    offsets = inputs.pop("offset_mapping")[0]

    with torch.no_grad():
        outputs = model(**inputs)
        nikud_probs = torch.softmax(outputs.logits.nikud_logits[0], dim=-1)

    html_chars = []

    for i, (start, end) in enumerate(offsets):
        if end - start != 1:
            continue
        char = text[start:end]
        dist = nikud_probs[i]

        # Certainty metric
        if certainty_metric == "max_prob":
            conf = dist.max().item()
        elif certainty_metric == "entropy":
            entropy = -sum(p.item() * math.log2(p.item()) for p in dist if p.item() > 0)
            conf = 1 - entropy / math.log2(len(dist))
        else:
            raise ValueError("Unknown certainty metric")

        # Normalize to [0,1]
        norm = (conf - low_conf) / (high_conf - low_conf)
        norm = min(max(norm, 0.0), 1.0)

        # Apply scaling
        if scale == "sqrt":
            norm = norm**0.5
        elif scale == "log":
            norm = (math.log1p(norm * 9) / math.log1p(9)) if norm > 0 else 0

        # Bright colormap (avoid dark colors)
        cmap = matplotlib.cm.get_cmap("YlOrRd")  # bright yellow → orange → red
        rgba = cmap(1 - norm)  # invert: high conf → yellow, low conf → red
        color = matplotlib.colors.rgb2hex(rgba)

        html_chars.append(f"<span style='color:{color}'>{char}</span>")

    display(HTML("".join(html_chars)))


# Example usage:
colorize_text_by_certainty("אלון האח חבר טוב של כדרלעומר", model, tokenizer,
                           certainty_metric="max_prob",
                           scale="sqrt", low_conf=0.6, high_conf=0.95)


  cmap = matplotlib.cm.get_cmap("YlOrRd")  # bright yellow → orange → red
