In [4]:
%pip install -q -U google-genai

Note: you may need to restart the kernel to use updated packages.


In [1]:
nikud_prompt = """
Add full Hebrew nikud (vowel marks) to the following text. 
Return only the text with nikud, no explanations. 
Text: """
explain_nikud_prompt = """
Add full Hebrew nikud (vowel marks) to the following text. 
Return only the text with nikud, no explanations. 
If a word has more than one possible nikud, mention the word at the end with no explanations.

Text: """

In [87]:
from google import genai
from google.genai import types
import os
# api_key = "AIzaSyCQNSUfcaY-XaOkaA18KuJgvzINv2Ty3FY"
api_key = "AIzaSyChO8fUi32t2eiJ9ap-S9dUNX8HhgmkeRo"

os.environ["GEMINI_API_KEY"] = api_key
client = genai.Client()

In [None]:
response=client.models.list(config={'page_size': 10})
print(response.page)

In [92]:
import os
import asyncio

async def generate_content_for_prompt(client: genai.Client, prompt: str, n_outputs: int, model_name: str) -> list[str]:
    """
    An asynchronous helper function to call the API for a single prompt 
    and extract the generated text.
    """
    while True:
        try:
            config = types.GenerateContentConfig(
                candidate_count=n_outputs,
                thinking_config=types.ThinkingConfig(thinking_budget=0)
            )
            
            # Use the asynchronous client module (client.aio)
            response = await client.aio.models.generate_content(
                model=model_name,
                contents=prompt,
                config=config,
            )
            
            # print(f"Completed request")
            
            # Extract the text from each candidate
            results = [
                candidate.content.parts[0].text 
                for candidate in response.candidates
            ]
            return results
        except Exception as e:
            if "429 RESOURCE_EXHAUSTED" in str(e):
                print(f"Quota exceeded {e}. Waiting for 30 seconds before retrying...")
                await asyncio.sleep(30)  # Wait before retrying
                continue  # Retry the request
            # print(f"Request failed with error: {e}")
            # Return error message for a failed request
            raise RuntimeError(f"Request failed with error: {e}")

async def run_async_batch_prompts(prompts: list[str], n_outputs: int = 1, model_name: str = 'gemini-2.0-flash') -> list[list[str]]:
    """
    Sends a batch of text prompts to the Gemini API asynchronously and returns 
    the results as a list of lists.

    Args:
        prompts: A list of string prompts to send to the model.
        n_outputs: The number of distinct outputs (candidates) to generate for each prompt.
        model_name: The model to use for the batch job.
        
    Returns:
        A list of lists, where each inner list contains the n_outputs 
        generated texts for the corresponding input prompt.
    """
    if not os.getenv("GEMINI_API_KEY"):
        print("Error: GEMINI_API_KEY environment variable is not set.")
        # Return a list of error lists for the expected structure
        return [["API_KEY_NOT_SET"] * n_outputs] * len(prompts)

    # Initialize the client (asyncio is managed by the client)
    client = genai.Client()
    print(f"Starting {len(prompts)} parallel requests, generating {n_outputs} candidates each...")

    # Create a list of all asynchronous tasks
    tasks = [
        generate_content_for_prompt(client, prompt, n_outputs, model_name)
        for prompt in prompts
    ]

    # Run all tasks concurrently and wait for them to complete
    results_list_of_lists = await asyncio.gather(*tasks)
    
    return results_list_of_lists

In [5]:

def call_gemini_api(content, temperature=0.7, max_output_tokens=1024, top_p=1.0):
    response = client.models.generate_content(
        model="gemini-2.5-flash", contents=content,
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(thinking_budget=0),  # Disables thinking
            temperature=temperature,  # Deterministic output
            max_output_tokens=max_output_tokens,  # Limit output length
            top_p=top_p,  # Nucleus sampling
        ),
    )
    return response.text

In [6]:
call_gemini_api(nikud_prompt + "ליירה הלכה לבית הספר")

'לֵיירָה הָלְכָה לְבֵית הַסֵּפֶר'

In [7]:
import pandas as pd

df = pd.read_csv('./datasets/hewiki/hebrew_nikud_dataset_filtered.csv')

In [8]:
import ast
import utils.hebrew_tokenizer as ht
from tqdm.notebook import tqdm

tqdm.pandas()  # Enable tqdm for pandas

def column_filter(row):
    text = row['text']
    mask = ast.literal_eval(row['nikud_mask'])
    return ht.has_1s_run(text, mask, min_run=1, max_run=3)

filtered_df = df[df.progress_apply(column_filter, axis=1)]

  0%|          | 0/81995 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2609 > 2048). Running this sequence through the model will result in indexing errors


In [9]:
filtered_df['nikud_word_mask'] = filtered_df.progress_apply(
    lambda row: ht.convert_token_to_word_mask(row['text'], row['nikud_mask']), axis=1)

  0%|          | 0/65533 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['nikud_word_mask'] = filtered_df.progress_apply(


In [11]:
filtered_df.to_csv('./datasets/hewiki/hebrew_nikud_dataset_filtered_word_mask.csv', index=False)

In [13]:
import math
from collections import Counter

def compute_word_entropy_mask(original_sentence: str, outputs: list[str], threshold: float = 0.0):
    """
    Compute entropy-based ambiguity mask for words in a sentence.
    
    Args:
        original_sentence (str): input sentence without nikud (used for tokenization).
        outputs (list[str]): list of generated outputs with nikud (multiple generations).
        threshold (float): entropy threshold above which a word is marked ambiguous.
                           Default = 0.0 (any disagreement → ambiguous).
    
    Returns:
        mask (list[int]): 0/1 mask, length = number of words in original_sentence.
                          1 = ambiguous (uncertain), 0 = consistent (confident).
        entropies (list[float]): entropy per word (in bits).
    """
    # Split original text into words (no nikud, for alignment)
    words = original_sentence.split()
    n_words = len(words)

    # Collect nikud-versions per word across outputs
    word_variants = [ [] for _ in range(n_words) ]
    for out in outputs:
        out_words = out.split()
        if len(out_words) != n_words:
            # simple alignment fallback: skip misaligned outputs
            continue
        for i, w in enumerate(out_words):
            word_variants[i].append(w)

    entropies = []
    mask = []
    for variants in word_variants:
        if not variants:
            entropies.append(0.0)
            mask.append(0)
            continue
        counts = Counter(variants)
        total = sum(counts.values())
        probs = [c/total for c in counts.values()]
        entropy = -sum(p * math.log2(p) for p in probs)
        entropies.append(entropy)
        mask.append(1 if entropy > threshold else 0)

    return mask, entropies


In [28]:
def strip_nikud(text: str) -> str:
    """
    Remove Hebrew nikud (vowel marks) from the input text.
    
    Args:
        text (str): Input Hebrew text with nikud.
    """
    return ht.NIKUD_PATTERN.sub('', text)

In [66]:
sample = filtered_df.sample(1000)
sample['text'].tolist()

['במרכז התוכנית ניצבו דמויותיהן של בובות כפפה שהופעלו על ידי בובנאים: שבי השבלול, אוּזָה האווזה, נולי האפרוחית ובץ הצב',
 'דֵאגוֹבָּה (Dagobah) הוא כוכב לכת ביצתי מחוץ למעטפת',
 'יתר על כן קונסטנטיוס שמר על תוארו כפונטיפקס מקסימוס כלומר הכהן העליון של הדת ברומא העתיקה מתוקף תואר זה הוא עמד בראש קולגיום הפונטיפקס (בלטינית: Pontifex או Pontifices), שהיה אחד מארבעת הקוֹלֵּגיא (collegia) הגדולים של הכהונה ברומא, והופקד על ניהול חיי הפולחן והדת של המדינה, לאחר מותו הוא זכה להיות מוכרז כאל על ידי הסנאט הרומאי',
 'סלעי טוף שמקורם באַנְדזיט נפוצים מאוד',
 'מובארכ (תעתיק מדויק: מֻבּארכּ, תעתיק נפוץ: מובארק, בערבית: مبارك) הוא שם פרטי ושם משפחה בערבית, שמשמעותו "מבורך"',
 'פנריטי (מיוונית: φανερός (פנרוס - "נִרְאֶה")) הוא כינוי לסלעי יסוד גסי גרגר, שניתן לראות את הגבישים הבונים אותם בעין בלתי מזוינת',
 'בנוסף, הוא החדיר חלקיקים אל מאיץ ה- ׁׁLarge Electron-Positron Collide או בקיצורו LEPׁ (מאיץ ההתנגשויות הגדול של אלקטרונים בפוזיטרונים) שם מואצים אלקטרונים ופוזיטרונים באנרגיה של 22 GeV',
 'בני אד

In [95]:
async def process_batch_chunks(sample_df: pd.DataFrame, nikud_prompt: str, chunk_size: int = 20, n_outputs: int = 5):
    """
    Asynchronous function to process prompts in chunks.
    
    Args:
        sample_df: The DataFrame containing the 'text' column to process.
        nikud_prompt: The prefix to add to each prompt.
        chunk_size: The number of prompts to send in one batch.
        n_outputs: The number of candidate responses to request per prompt.
    """
    # working_df = sample_df[sample_df['uncertainty_word_mask'].isnull()]
    # 1. Prepare all prompts
    prompt_data = list(zip(sample_df['text'].index, [nikud_prompt + strip_nikud(txt) for txt in sample_df['text'].tolist()]))
    sample_df['uncertainty_word_mask'] = pd.Series([None] * len(sample_df), index=sample_df.index, dtype='object')
    

    # total_chunks = math.ceil(len(prompt_data) / chunk_size)

    # Loop over the list of (index, prompt) tuples in chunks
    for i in range(0, len(prompt_data), chunk_size):
        # Get the chunk of (index, prompt) tuples
        chunk_data = prompt_data[i:i + chunk_size]
        
        # Separate the indices and the prompts for the batch call
        chunk_indices = [item[0] for item in chunk_data]
        chunk_prompts = [item[1] for item in chunk_data]
        
        # 1. Run the batch API call
        # The 'res' list will contain the generated content for the 'chunk_prompts' in order.
        res = await run_async_batch_prompts(chunk_prompts, n_outputs=n_outputs)
        
        # 2. Compute the mask for each item in the chunk
        # The list comprehension now iterates over the prompt data and the results simultaneously
        uncertainty_word_masks = [
            compute_word_entropy_mask(
                sample_df.loc[idx, 'text'],  # Get the original text using the actual index
                res[k],                   # Get the corresponding result from the batch response
                threshold=0.0
            )[0] 
            for k, idx in enumerate(chunk_indices)
        ]
        
        # This bypasses the bulk assignment logic that triggers the array error.
        for k, (idx, mask) in enumerate(zip(chunk_indices, uncertainty_word_masks)):
            if 1 not in mask:
                print(f"Warning: No uncertain words found for index {idx}. Mask: {mask}, Results: {res[k]}")
            # Assign the list/array mask directly to the specific cell using the index label
            sample_df.loc[idx, 'uncertainty_word_mask'] = str(mask)
        
        print(f"Processed chunk {i//chunk_size + 1} / {math.ceil(len(prompt_data)/chunk_size)}")
        
    # return sample_df

In [96]:
await process_batch_chunks(sample, nikud_prompt, chunk_size=10, n_outputs=5)

Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F8812B6D0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850450C0>, 963591.468), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22920>, 963591.906), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22140>, 963592.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044CA0>, 963592.234), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046740>, 963592.531), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045960>, 963592.687), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045300>, 963592.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850468C0>, 963592.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F211E0>, 963593.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047400>, 963594.64)])']
connector: <aiohttp.connecto

Processed chunk 1 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F88128FA0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045F00>, 963595.89), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850461A0>, 963596.062), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22A40>, 963596.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044D60>, 963596.296), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045960>, 963596.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E60>, 963596.375), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21FC0>, 963596.75), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045840>, 963596.937), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850456C0>, 963596.937), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F220E0>, 963597.593)])']
connector: <aiohttp.connector

Processed chunk 2 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F8812AB00>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044D60>, 963598.718), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22C80>, 963598.906), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21720>, 963598.921), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045F00>, 963598.921), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21A20>, 963599.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045060>, 963599.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60340>, 963599.593), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045840>, 963599.656), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22920>, 963600.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850456C0>, 963600.343)])']
connector: <aiohttp.connec

Processed chunk 3 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F8946AA40>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F602E0>, 963601.296), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21FC0>, 963601.546), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22B60>, 963601.687), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046140>, 963602.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60760>, 963602.203), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60940>, 963602.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F220E0>, 963602.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F612A0>, 963602.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850457E0>, 963602.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044D60>, 963603.156)])']
connector: <aiohttp.connec

Processed chunk 4 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F88128E80>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22AA0>, 963604.546), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F602E0>, 963604.687), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60940>, 963604.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60280>, 963604.875), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F211E0>, 963605.015), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60880>, 963605.109), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F61CC0>, 963605.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F61900>, 963605.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60460>, 963605.484), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F616C0>, 963606.546)])']
connector: <aiohttp.connec

Processed chunk 5 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F881282B0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850458A0>, 963607.734), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046CE0>, 963608.562), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F15F60>, 963608.671), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60940>, 963608.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60520>, 963608.89), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60460>, 963608.953), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850471C0>, 963609.109), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F612A0>, 963609.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F142E0>, 963609.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850450C0>, 963609.875)])']
connector: <aiohttp.connecto

Processed chunk 6 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89442080>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850458A0>, 963610.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044160>, 963611.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046BC0>, 963611.421), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F61CC0>, 963611.437), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045DE0>, 963611.609), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850472E0>, 963611.625), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047A60>, 963611.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046EC0>, 963612.046), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046740>, 963612.281), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044CA0>, 963612.828)])']
connector: <aiohttp.connect

Processed chunk 7 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894411E0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046380>, 963613.531), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047A60>, 963614.375), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046CE0>, 963614.484), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044AC0>, 963614.515), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044E80>, 963614.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047040>, 963614.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044B80>, 963614.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045EA0>, 963615.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850468C0>, 963615.484), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850457E0>, 963615.562)])']
connector: <aiohttp.connect

Processed chunk 8 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440E80>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044B80>, 963617.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046500>, 963617.562), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047100>, 963617.593), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047A60>, 963617.625), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045420>, 963617.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046EC0>, 963617.812), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850474C0>, 963617.953), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850472E0>, 963618.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F61CC0>, 963618.203), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046AA0>, 963618.968)])']
connector: <aiohttp.connec

Processed chunk 9 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89443280>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044B20>, 963620.39), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044FA0>, 963620.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046C80>, 963620.64), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60100>, 963620.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F600A0>, 963621.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA1DE0>, 963621.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14280>, 963621.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044520>, 963621.515), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F62D40>, 963621.671), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA2980>, 963621.984)])']
connector: <aiohttp.connecto

Processed chunk 10 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440490>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14280>, 963623.25), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14DC0>, 963623.593), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14580>, 963623.812), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F154E0>, 963623.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F144C0>, 963623.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F16200>, 963623.906), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA2980>, 963624.359), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F171C0>, 963624.515), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA1DE0>, 963624.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F15360>, 963625.125)])']
connector: <aiohttp.connect

Processed chunk 11 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894417E0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045240>, 963626.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E656C0>, 963626.609), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E64AC0>, 963627.062), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E66A40>, 963627.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850451E0>, 963627.187), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E665C0>, 963627.187), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA1DE0>, 963627.343), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E67BE0>, 963628.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044160>, 963628.406), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850463E0>, 963628.687)])']
connector: <aiohttp.connec

Processed chunk 12 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894691E0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047400>, 963630.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046500>, 963630.125), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14040>, 963630.25), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F158A0>, 963630.281), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F14AC0>, 963630.343), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F157E0>, 963630.656), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044520>, 963630.953), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F17BE0>, 963630.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F15720>, 963631.015), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F17220>, 963631.093)])']
connector: <aiohttp.connect

Processed chunk 13 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440B80>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20640>, 963632.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E60>, 963632.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22DA0>, 963632.406), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22A40>, 963632.468), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20BE0>, 963632.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E676A0>, 963632.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044520>, 963633.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E00>, 963633.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22080>, 963633.671), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F228C0>, 963634.187)])']
connector: <aiohttp.connec

Processed chunk 14 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894433A0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F228C0>, 963635.234), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F23BE0>, 963635.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20040>, 963635.796), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22140>, 963635.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20280>, 963635.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22A40>, 963636.484), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E60>, 963636.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E65A80>, 963637.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E00>, 963637.359), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F208E0>, 963637.515)])']
connector: <aiohttp.connec

Processed chunk 15 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894421A0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21A20>, 963638.687), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E65BA0>, 963638.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E65A80>, 963638.812), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0280>, 963639.0), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C19C0>, 963639.359), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C20E0>, 963639.656), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0E20>, 963639.687), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F23520>, 963639.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1780>, 963639.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20280>, 963639.875)])']
connector: <aiohttp.connecto

Processed chunk 16 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89443BE0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E65BA0>, 963641.0), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0E20>, 963641.0), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1960>, 963641.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F841557E0>, 963641.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C17E0>, 963641.484), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C07C0>, 963641.656), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0640>, 963641.984), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416FA00>, 963642.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C19C0>, 963642.468), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416EEC0>, 963642.562)])']
connector: <aiohttp.connector.

Processed chunk 17 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894427D0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C07C0>, 963643.437), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21660>, 963644.031), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22680>, 963644.093), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E656C0>, 963644.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416F160>, 963644.515), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1960>, 963644.609), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20220>, 963644.734), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C19C0>, 963644.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21A80>, 963644.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0460>, 963647.75)])']
connector: <aiohttp.connect

Processed chunk 18 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89442830>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0F40>, 963649.046), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C06A0>, 963649.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1780>, 963649.234), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C2080>, 963649.234), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416FC40>, 963649.5), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22680>, 963649.718), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C20E0>, 963649.953), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416EDA0>, 963650.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20E80>, 963651.203), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1720>, 963651.437)])']
connector: <aiohttp.connecto

Processed chunk 19 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89441630>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E41AE0>, 963652.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C15A0>, 963652.906), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA0460>, 963652.937), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F208E0>, 963653.062), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E43B80>, 963653.062), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C04C0>, 963653.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E60>, 963653.421), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E40D60>, 963653.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA00A0>, 963654.125), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416D300>, 963654.187)])']
connector: <aiohttp.connec

Processed chunk 20 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89443E80>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0A00>, 963655.328), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1000>, 963655.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0B20>, 963655.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045660>, 963655.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8512BDC0>, 963656.031), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C06A0>, 963656.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8AAF6980>, 963656.375), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C20E0>, 963656.375), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045AE0>, 963656.5), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044700>, 963656.984)])']
connector: <aiohttp.connector

Processed chunk 21 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440D90>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60520>, 963657.921), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60EE0>, 963658.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60460>, 963658.359), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60220>, 963658.531), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F609A0>, 963658.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60760>, 963659.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F62C20>, 963659.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F61000>, 963659.562), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84EA12A0>, 963660.078), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F612A0>, 963660.265)])']
connector: <aiohttp.connect

Processed chunk 22 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894401F0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1C00>, 963661.437), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0F40>, 963661.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C1960>, 963661.531), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F612A0>, 963661.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416E8C0>, 963661.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60460>, 963662.046), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416D300>, 963662.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416F0A0>, 963662.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416F160>, 963662.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C15A0>, 963663.406)])']
connector: <aiohttp.connec

Processed chunk 23 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894422C0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F8416DA80>, 963664.39), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C15A0>, 963664.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046AA0>, 963665.031), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046680>, 963665.093), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60880>, 963665.218), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044C40>, 963665.25), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C2080>, 963665.343), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045A80>, 963666.0), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22440>, 963666.171), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0F40>, 963666.203)])']
connector: <aiohttp.connector.

Processed chunk 24 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F8812BDC0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F20220>, 963667.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F21840>, 963667.312), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047040>, 963667.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E67A00>, 963667.734), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850444C0>, 963668.031), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22440>, 963668.062), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044820>, 963668.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F23460>, 963668.562), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045660>, 963668.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C2080>, 963669.437)])']
connector: <aiohttp.connect

Processed chunk 25 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440D00>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046D40>, 963670.343), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046740>, 963670.718), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E00>, 963671.265), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850451E0>, 963671.281), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047100>, 963671.546), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850477C0>, 963671.593), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E64C40>, 963671.843), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045CC0>, 963672.015), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045120>, 963672.281), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850C0F40>, 963672.343)])']
connector: <aiohttp.connec

Processed chunk 26 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89442860>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850451E0>, 963673.546), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60760>, 963673.546), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047100>, 963673.828), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045120>, 963673.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F16380>, 963674.093), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045AE0>, 963674.125), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60EE0>, 963674.14), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045660>, 963674.5), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046D40>, 963674.875), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F22E00>, 963676.234)])']
connector: <aiohttp.connector

Processed chunk 27 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F894416F0>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F850468C0>, 963677.578), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85044AC0>, 963677.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E66EC0>, 963677.781), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045120>, 963678.203), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85046740>, 963678.234), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84E65060>, 963678.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F23460>, 963678.937), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F84F60EE0>, 963678.984), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045660>, 963679.343), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047220>, 963681.64)])']
connector: <aiohttp.connect

Processed chunk 28 / 100
Starting 10 parallel requests, generating 5 candidates each...


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021F89440A60>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x0000021F8BC6C820>, 963682.656), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8BC6C760>, 963683.156), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047220>, 963683.453), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F850468C0>, 963683.468), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045AE0>, 963683.703), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85045120>, 963683.765), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8BC6FD60>, 963683.859), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8BC6F4C0>, 963684.968), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F8BC6C220>, 963685.406), (<aiohttp.client_proto.ResponseHandler object at 0x0000021F85047B20>, 963685.5)])']
connector: <aiohttp.connecto

Processed chunk 29 / 100
Starting 10 parallel requests, generating 5 candidates each...


RuntimeError: Request failed with error: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'API key expired. Please renew the API key.', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'API_KEY_INVALID', 'domain': 'googleapis.com', 'metadata': {'service': 'generativelanguage.googleapis.com'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'API key expired. Please renew the API key.'}]}}

In [None]:
sample.head()

In [61]:
# Compute the overlap ratio in a separate step
def overlap_ratio(row):
    uncertainty_mask = row['uncertainty_word_mask']
    if isinstance(uncertainty_mask, str):
        uncertainty_mask = ast.literal_eval(uncertainty_mask)
    nikud_mask = row['nikud_word_mask']
    if isinstance(nikud_mask, str):
        nikud_mask = ast.literal_eval(nikud_mask)
    overlap = sum(1 for u, n in zip(uncertainty_mask, nikud_mask) if u == 1 and n == 1)
    total_nikud = sum(nikud_mask)
    return overlap / total_nikud if total_nikud > 0 else None

sample['uncertainty_overlap_ratio'] = sample.apply(overlap_ratio, axis=1)
sample['uncertainty_overlap_ratio'].describe()

count    10.000000
mean      0.626667
std       0.456719
min       0.000000
25%       0.150000
50%       0.833333
75%       1.000000
max       1.000000
Name: uncertainty_overlap_ratio, dtype: float64

In [62]:
not_enough_overlap = sample[sample['uncertainty_overlap_ratio'] <= 0.5]
test = not_enough_overlap

In [52]:
def mask_str(mask):
    if isinstance(mask, str):
        mask = ast.literal_eval(mask)
    return "".join([str(x) for x in mask])

In [63]:
for i in range(len(test)):
    print(f"Example {i+1}:")
    print("Text:", test.iloc[i]['text'])
    print("Nikud Mask:      ", mask_str(test.iloc[i]['nikud_word_mask']))
    print("Uncertainty Mask:", mask_str(test.iloc[i]['uncertainty_word_mask']))
    print("Overlap Ratio:", test.iloc[i]['uncertainty_overlap_ratio'])
    print()

Example 1:
Text: על פי הספר "שורש היוחסין" (世本), סבו היה נסיך ממדינת צֶ'ן אשר שימש בממשלת מדינת לוּ, מולדתו של קונפוציוס
Nikud Mask:       0000000000100001000
Uncertainty Mask: 0111000001011100010
Overlap Ratio: 0.0

Example 2:
Text: בימיו חשבו שכל האוֹרְקִים כבר הושמדו, אך האוֹרְקִים הפתיעו אותו בשבילי ההרים הלבנים והרגו אותו ואת כל מלוויו
Nikud Mask:       000100010000000000
Uncertainty Mask: 000001000000000000
Overlap Ratio: 0.0

Example 3:
Text: המלך לואי ה-16 אישר ב-1774 את פיטוריו של פרמנטייה ועם זאת העניק לו מעמד גִמלאי של ארמון האינווליד
Nikud Mask:       000000000000001000
Uncertainty Mask: 011010000000000000
Overlap Ratio: 0.0



In [31]:
uncertainty_word_masks = [compute_word_entropy_mask(
    sample['text'].tolist()[i], res[i], threshold=0.0)[0] for i in range(len(sample))]
sample['uncertainty_word_mask'] = uncertainty_word_masks

for i in range(len(sample)):
    print(f"Original: {sample.iloc[i]['text']}")
    print(f"Outputs: {res[i]}")
    print(f"Nikud Mask: \t\t{sample['nikud_word_mask'].iloc[i]}")
    print(f"Uncertainty Mask: \t{sample['uncertainty_word_mask'].iloc[i]}")
    print()

Original: חברת אהרן סטרֶייט, שמוצריה משווקים תחת המותג סטריֶיט'ס (באנגלית: - Streit's) היא חברת מזון כשר אמריקאית שבסיסה בניו יורק סיטי
Outputs: ["חֶבְרַת אַהֲרֹן סְטְרַיְט, שֶׁמּוּצָרֶיהָ מְשֻׁוָּקִים תַּחַת הַמּוּתָג סְטְרַיְטְ'ס (בְּאַנְגְּלִית: - Streit's) הִיא חֶבְרַת מָזוֹן כָּשֵׁר אֲמֶרִיקָאִית שֶׁבְּסִיסָהּ בְּנִיוּ יוֹרְק סִיטִי", "חֶבְרַת אַהֲרֹן סְטְרֵיְט, שֶׁמֻּצָּרֶיהָ מְשֻׁוָּקִים תַּחַת הַמּוּתָג סְטְרֵיְטְס (בְּאַנְגְּלִית: - Streit's) הִיא חֶבְרַת מָזוֹן כָּשֵׁר אֲמֵרִיקָאִית שֶׁבְּסִיסָהּ בְּנִיוּ יוֹרְק סִיטִי", "חֶבְרַת אַהֲרֹן סְטְרֵיְט, שֶׁמֻּצָרֶיהָ מְשֻׁוָּקִים תַּחַת הַמּוּתָג סְטְרֵיְטְ'ס (בְּאַנְגְּלִית: - Streit's) הִיא חֶבְרַת מָזוֹן כָּשֵׁר אֲמֶרִיקָאִית שֶׁבְּסִיסָהּ בְּנִיּוּ יוֹרְק סִיטִי", 'חֶבְרַת אַהֲרֹן סְטְרֵיְט, שֶׁמּוּצָרֶיהָ מְשֻׁוָּקִים תַּחַת הַמּוּתָג סְטְרֵיְטְס (בְּאַנְגְּלִית: - סְטְרֵיְטְס) הִיא חֶבְרַת מָזוֹן כָּשֵׁר אֲמֵרִיקָאִית שֶׁבְּסִיסָהּ בְּנְיוּ יוֹרְק סִיטִי.', "חֶבְרַת אַהֲרֹן סְטְרֵייט, שֶׁמֻּצָּרֶיהָ מְשֻׁוָּקִים תַּחַת הַמּו