# Import Required Libraries
Import torch and the necessary classes from the transformers library.

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load Tokenizer and Model
Load the AutoTokenizer and AutoModel for 'dicta-il/dictabert-large-char-menaked'. Set the model to evaluation mode.

In [13]:
model_name = "dicta-il/dictabert-large-char-menaked"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.eval()

BertForDiacritization(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(1024, 1024, padding_idx=0)
      (position_embeddings): Embedding(2048, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), e

In [16]:
def predict_with_probs(sentences, tokenizer, model, mark_matres_lectionis=None, top_k=5):
    # Use model.predict for decoded output
    decoded = model.predict(sentences, tokenizer, mark_matres_lectionis=mark_matres_lectionis)
    
    # Manual forward pass
    inputs = tokenizer(sentences, padding='longest', truncation=True,
                       return_tensors='pt', return_offsets_mapping=True)
    offset_mapping = inputs.pop('offset_mapping')
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.forward(**inputs, return_dict=True)
    logits = outputs.logits  # MenakedLogitsOutput
    nikud_logits = logits.nikud_logits  # [batch, seq_len, num_nikud]
    shin_logits = logits.shin_logits    # [batch, seq_len, num_shin]
    
    results = []
    for sent_idx, (sentence, offsets) in enumerate(zip(sentences, offset_mapping)):
        # For each character token, collect top-k probabilities
        sent_data = {'decoded': decoded[sent_idx], 'chars': []}
        probs = torch.softmax(nikud_logits[sent_idx], dim=-1)  # probabilities
        
        for i, (start, end) in enumerate(offsets):
            if end - start != 1:
                continue
            char = sentence[start:end]
            dist = probs[i]
            top_p, top_ids = torch.topk(dist, top_k)
            sent_data['chars'].append({
                'char': char,
                'predictions': {
                    model.config.nikud_classes[label_id.item()]: float(p.item())
                    for p, label_id in zip(top_p, top_ids)
                }
            })
        results.append(sent_data)
    return results


In [19]:
res = predict_with_probs(["ליירה"], tokenizer, model, top_k=3)
import pprint; pprint.pprint(res[0])

{'chars': [{'char': 'ל',
            'predictions': {'ֵ': 0.30501502752304077,
                            'ֶ': 0.4193100929260254,
                            'ַ': 0.14278148114681244}},
           {'char': 'י',
            'predictions': {'': 0.6885529160499573,
                            '<MAT_LECT>': 0.05325696989893913,
                            'ְ': 0.15395373106002808}},
           {'char': 'י',
            'predictions': {'': 0.13126079738140106,
                            '<MAT_LECT>': 0.8599182367324829,
                            'ְ': 0.003723365720361471}},
           {'char': 'ר',
            'predictions': {'': 0.05877650901675224,
                            'ֶ': 0.11981965601444244,
                            'ָ': 0.8134917616844177}},
           {'char': 'ה',
            'predictions': {'': 0.9990084767341614,
                            'ָ': 1.9796716514974833e-05,
                            'ּ': 0.0008896052022464573}}],
 'decoded': 'לֶירָה'}
