In [1]:
from transformers import pipeline
from transformers.tokenization_utils import TruncationStrategy

In [2]:
from IPython.core.display import display, HTML
def print_hebrew(text):
    """Display ``text`` in the notebook as a right-to-left paragraph.

    The text is HTML-escaped before being placed inside the ``<p dir="rtl">``
    element, so characters such as ``<``, ``>`` and ``&`` (e.g. a ``<mask>``
    token) are shown literally instead of being interpreted as markup.

    Args:
        text: Value to show; it is converted with ``str()`` before escaping.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    display(HTML('<p dir="rtl">{}</p>'.format(escape(str(text)))))

In [3]:
# Model location handed to `pipeline('fill-mask', ...)` when the pipe is built.
MODEL_PATH = './model'

In [4]:
# The Hugging Face fill-mask pipeline does not let us mask part of a word
# (a sub-word token). The helpers in this cell work around that by tokenizing
# the text ourselves and handing the pipeline already-tokenized input.
pipe = pipeline(task='fill-mask', model=MODEL_PATH)
def do_tokenize(inputs):
    """Encode ``inputs`` with the pipeline's own tokenizer.

    Special tokens are added, padding is enabled, truncation is disabled, and
    tensors are returned in the framework the pipeline reports
    (``pipe.framework``).

    Args:
        inputs: Text forwarded unchanged to ``pipe.tokenizer``.

    Returns:
        Whatever ``pipe.tokenizer`` returns for these options.
    """
    tokenizer = pipe.tokenizer
    encode_options = {
        'add_special_tokens': True,
        'return_tensors': pipe.framework,
        'padding': True,
        'truncation': TruncationStrategy.DO_NOT_TRUNCATE,
    }
    return tokenizer(inputs, **encode_options)

def _parse_and_tokenize(
    inputs, tokenized=False, **kwargs
):
    if not tokenized:
        inputs = do_tokenize(inputs)
    return inputs

# Monkey-patch this pipeline instance: its tokenization hook now passes
# already-tokenized input through (tokenized=True) and otherwise delegates to
# do_tokenize.
# NOTE(review): `_parse_and_tokenize` is a private pipeline attribute;
# presumably it is only consulted by some transformers releases — confirm
# before upgrading the library.
pipe._parse_and_tokenize = _parse_and_tokenize

def unmask(query, top_k=5):
    """Predict the single character hidden behind ``[MASK]`` in ``query``.

    The mask may sit inside a word. The marker is swapped for a placeholder
    character, the text is tokenized, and the placeholder's token id is then
    overwritten with the tokenizer's mask token id before running the pipeline.

    Args:
        query: Text containing exactly one ``[MASK]`` marker.
        top_k: Forwarded to the pipeline as ``top_k``.

    Returns:
        The pipeline's output for the masked input.

    Raises:
        ValueError: If ``query`` does not contain exactly one ``[MASK]``.
            Without this check a missing marker would overwrite the first
            token of the sequence, and extra markers would silently be left
            as placeholder characters.
    """
    mask_marker = '[MASK]'
    mask_count = query.count(mask_marker)
    if mask_count != 1:
        raise ValueError(
            'query must contain exactly one {} marker, found {}'.format(
                mask_marker, mask_count
            )
        )

    # Position of the mask counted in non-whitespace characters. All
    # whitespace (not only ' ') is skipped so tabs/newlines do not shift it.
    # NOTE(review): assumes the tokenizer emits exactly one token per
    # non-whitespace character — confirm for the model in use.
    text_before_mask = query[:query.index(mask_marker)]
    mask_idx = sum(1 for ch in text_before_mask if not ch.isspace())

    # Swap the marker for an arbitrary character; its token id is carefully
    # overwritten with the mask token id below.
    without_mask = query.replace(mask_marker, 'א')
    tokenized = do_tokenize(without_mask)

    # +1 skips the special token expected at the start of the sequence
    # (do_tokenize encodes with add_special_tokens=True).
    tokenized['input_ids'][0][mask_idx + 1] = pipe.tokenizer.mask_token_id
    return pipe(tokenized, tokenized=True, top_k=top_k)
    

In [5]:
# Example query: the [MASK] marker sits inside the second word, in place of
# one character.
input_masked = 'שלום ע[MASK]לם'

In [6]:
# Show the masked query rendered right-to-left.
print_hebrew(input_masked)

In [7]:
# Run the fill-mask prediction for the example (default top_k=5).
unmask(input_masked)

[{'sequence': 'שלום עולם',
  'score': 0.22049811482429504,
  'token': 518,
  'token_str': '##ו'},
 {'sequence': 'שלום עללם',
  'score': 0.08309558033943176,
  'token': 544,
  'token_str': '##ל'},
 {'sequence': 'שלום ע טלם',
  'score': 0.06925750523805618,
  'token': 296,
  'token_str': 'ט'},
 {'sequence': 'שלום עילם',
  'score': 0.03762314096093178,
  'token': 522,
  'token_str': '##י'},
 {'sequence': 'שלום ע זלם',
  'score': 0.03247128427028656,
  'token': 294,
  'token_str': 'ז'}]