In [4]:
# Install or upgrade the google-genai SDK for this kernel (-q: quiet, -U: upgrade).
# NOTE(review): the version is not pinned, so a fresh run may pull a different SDK release.
%pip install -q -U google-genai

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Prompt prefix: asks the model to return only the fully vocalized text.
# The sentence to vocalize is appended directly after "Text: ".
nikud_prompt = """
Add full Hebrew nikud (vowel marks) to the following text. 
Return only the text with nikud, no explanations. 
Text: """
# Variant of the prompt above that additionally asks the model to list, at the
# end of its answer, every word that has more than one possible nikud.
explain_nikud_prompt = """
Add full Hebrew nikud (vowel marks) to the following text. 
Return only the text with nikud, no explanations. 
If a word has more than one possible nikud, mention the word at the end with no explanations.

Text: """

In [34]:
from google import genai
from google.genai import types
import dotenv

# Load variables from the local .env file; override=True lets values from the
# file replace variables that are already set in the process environment.
dotenv.load_dotenv(".env", override=True)
# Module-level client used by the synchronous call_gemini_api helper.
# presumably picks up its API key from the environment loaded above — TODO confirm
client = genai.Client()

In [33]:
# Request one page of at most 10 models (query_base=True) and print that page,
# as a quick check of which model names are available to this client.
response=client.models.list(config={'page_size': 10, 'query_base': True})
print(response.page)

[Model(
  description='Obtain a distributed representation of a text.',
  display_name='Embedding Gecko',
  input_token_limit=1024,
  name='models/embedding-gecko-001',
  output_token_limit=1,
  supported_actions=[
    'embedText',
    'countTextTokens',
  ],
  tuned_model_info=TunedModelInfo(),
  version='001'
), Model(
  description='Alias that points to the most recent production (non-experimental) release of Gemini 1.5 Pro, our mid-size multimodal model that supports up to 2 million tokens.',
  display_name='Gemini 1.5 Pro Latest',
  input_token_limit=2000000,
  name='models/gemini-1.5-pro-latest',
  output_token_limit=8192,
  supported_actions=[
    'generateContent',
    'countTokens',
  ],
  tuned_model_info=TunedModelInfo(),
  version='001'
), Model(
  description='Stable version of Gemini 1.5 Pro, our mid-size multimodal model that supports up to 2 million tokens, released in September of 2024.',
  display_name='Gemini 1.5 Pro 002',
  input_token_limit=2000000,
  name='models/

In [5]:
import os
import asyncio

def _candidate_text(candidate) -> str:
    """Return the text of a candidate's first part, or "" when it has none.

    A candidate may come back without content or parts, or with a first part
    whose ``text`` is None. Mapping those cases to "" keeps the result a list
    of strings instead of failing the whole batch on one unusable candidate.
    """
    content = getattr(candidate, "content", None)
    parts = getattr(content, "parts", None) or []
    if not parts:
        return ""
    return parts[0].text or ""


async def generate_content_for_prompt(
    client: genai.Client,
    prompt: str,
    n_outputs: int,
    model_name: str,
    max_retries: "int | None" = None,
    retry_delay: float = 30,
) -> list[str]:
    """
    Call the API asynchronously for a single prompt and return the generated texts.

    Args:
        client: Client whose ``aio.models.generate_content`` coroutine is awaited.
        prompt: The full prompt text to send.
        n_outputs: Number of candidates requested (passed as ``candidate_count``).
        model_name: Name of the model to query.
        max_retries: Maximum number of retries after a quota (429) error.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to wait before retrying after a quota error.

    Returns:
        One string per returned candidate: the text of the candidate's first
        part, or "" for a candidate without usable text. An empty list when the
        response carries no candidates.

    Raises:
        RuntimeError: If the request fails for any reason other than a quota
            error, or if the quota retries are exhausted. The original
            exception is attached as ``__cause__``.
    """
    # The request configuration does not change between retries, so build it once.
    config = types.GenerateContentConfig(
        candidate_count=n_outputs,
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    )

    retries_done = 0
    while True:
        try:
            # Use the asynchronous client module (client.aio)
            response = await client.aio.models.generate_content(
                model=model_name,
                contents=prompt,
                config=config,
            )
        except Exception as e:
            # Prefer the structured status code when the exception exposes one;
            # fall back to matching the rendered message.
            quota_exceeded = getattr(e, "code", None) == 429 or "429 RESOURCE_EXHAUSTED" in str(e)
            may_retry = max_retries is None or retries_done < max_retries
            if quota_exceeded and may_retry:
                retries_done += 1
                print(f"Quota exceeded {e}. Waiting for {retry_delay} seconds before retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
                continue  # Retry the request
            raise RuntimeError(f"Request failed with error: {e}") from e

        # Extract the text from each candidate; `candidates` may be None.
        return [_candidate_text(candidate) for candidate in (response.candidates or [])]

async def run_async_batch_prompts(prompts: list[str], n_outputs: int = 1, model_name: str = 'gemini-2.0-flash') -> list[list[str]]:
    """
    Sends a batch of text prompts to the Gemini API concurrently and returns
    the results as a list of lists.

    Args:
        prompts: A list of string prompts to send to the model.
        n_outputs: The number of distinct outputs (candidates) to generate for each prompt.
        model_name: The model to use for the batch job.

    Returns:
        A list with one inner list per prompt, in prompt order, holding the
        generated texts for that prompt. When no API key is configured, every
        inner list instead holds ``n_outputs`` copies of the sentinel string
        "API_KEY_NOT_SET"; each inner list is an independent object.
    """
    # The client can be configured through either variable, so accept both.
    if not (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")):
        print("Error: neither GEMINI_API_KEY nor GOOGLE_API_KEY environment variable is set.")
        # Build a fresh inner list per prompt: `[[...]] * len(prompts)` would
        # repeat one shared list, so mutating one row would change all rows.
        return [["API_KEY_NOT_SET"] * n_outputs for _ in prompts]

    # Initialize the client (asyncio is managed by the client)
    client = genai.Client()
    print(f"Starting {len(prompts)} parallel requests, generating {n_outputs} candidates each...")

    # One coroutine per prompt; gather runs them concurrently and keeps input order.
    tasks = [
        generate_content_for_prompt(client, prompt, n_outputs, model_name)
        for prompt in prompts
    ]
    results_list_of_lists = await asyncio.gather(*tasks)

    return results_list_of_lists

In [6]:

def call_gemini_api(content, temperature=0.7, max_output_tokens=1024, top_p=1.0, model="gemini-2.5-flash"):
    """
    Send one synchronous generate_content request through the module-level
    ``client`` and return the response text.

    Args:
        content: Prompt content forwarded as ``contents``.
        temperature: Sampling temperature. The default of 0.7 yields varied
            output; lower it for more repeatable results.
        max_output_tokens: Upper bound on the length of the generated output.
        top_p: Nucleus-sampling probability mass.
        model: Model name to query. Defaults to the previously hard-coded
            "gemini-2.5-flash".

    Returns:
        The ``text`` attribute of the response.
    """
    generation_config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0),  # Disables thinking
        temperature=temperature,  # Sampling temperature (not deterministic unless lowered)
        max_output_tokens=max_output_tokens,  # Limit output length
        top_p=top_p,  # Nucleus sampling
    )
    response = client.models.generate_content(
        model=model,
        contents=content,
        config=generation_config,
    )
    return response.text

In [7]:
# Smoke test: vocalize one short unpointed Hebrew sentence with the single-call helper.
call_gemini_api(nikud_prompt + "ליירה הלכה לבית הספר")

'לַיְרָה הָלְכָה לְבֵית הַסֵּפֶר'

In [None]:
import pandas as pd

# Load the filtered word-mask dataset (the path suggests Hebrew Wikipedia — TODO confirm).
# Later cells read its 'text' and 'nikud_word_mask' columns.
filtered_df = pd.read_csv('./datasets/hewiki/hebrew_nikud_dataset_filtered_word_mask.csv')

In [11]:
import math
from collections import Counter

def compute_word_entropy_mask(original_sentence: str, outputs: list[str], threshold: float = 0.0):
    """
    Compute entropy-based ambiguity mask for words in a sentence.

    Words are aligned purely by whitespace-split position. An output whose
    word count differs from the original sentence cannot be aligned and is
    skipped, as is any output that is not a string. Outputs are NFC-normalized
    before comparison, so two vocalizations that carry the same marks in a
    different order count as one variant rather than as a disagreement.

    Args:
        original_sentence (str): input sentence (used only for tokenization / word count).
        outputs (list[str]): list of generated outputs with nikud (multiple generations).
        threshold (float): entropy threshold above which a word is marked ambiguous.
                           Default = 0.0 (any disagreement → ambiguous).

    Returns:
        mask (list[int]): 0/1 mask, length = number of words in original_sentence.
                          1 = ambiguous (uncertain), 0 = consistent (confident).
                          A word with no aligned output at all is reported as 0.
        entropies (list[float]): entropy per word (in bits), never negative zero.
    """
    # Function-scope import so the block does not depend on another cell's imports.
    import unicodedata

    # Only the number of words matters for alignment.
    n_words = len(original_sentence.split())

    # Collect the vocalized variants seen at each word position.
    word_variants = [[] for _ in range(n_words)]
    for out in outputs:
        if not isinstance(out, str):
            # A missing generation cannot be tokenized; treat it as misaligned.
            continue
        out_words = unicodedata.normalize("NFC", out).split()
        if len(out_words) != n_words:
            # simple alignment fallback: skip misaligned outputs
            continue
        for variants, word in zip(word_variants, out_words):
            variants.append(word)

    entropies = []
    mask = []
    for variants in word_variants:
        if not variants:
            entropies.append(0.0)
            mask.append(0)
            continue
        total = len(variants)
        # Summing the negated terms (instead of negating the sum) keeps a
        # unanimous word at +0.0 rather than -0.0.
        entropy = sum(
            -(count / total) * math.log2(count / total)
            for count in Counter(variants).values()
        )
        entropies.append(entropy)
        mask.append(1 if entropy > threshold else 0)

    return mask, entropies


In [18]:
import utils.hebrew_tokenizer as ht

def strip_nikud(text: str) -> str:
    """
    Return ``text`` with every match of the tokenizer's nikud pattern deleted.

    Args:
        text (str): Hebrew text that may carry nikud (vowel marks).

    Returns:
        str: The input with all ``ht.NIKUD_PATTERN`` matches removed.
    """
    nikud_pattern = ht.NIKUD_PATTERN
    without_marks = nikud_pattern.sub('', text)
    return without_marks

In [13]:
# Draw a fixed-seed random sample so a fresh "Restart & Run All" evaluates the same rows.
sample = filtered_df.sample(1000, random_state=42)
# Preview a handful of texts only; listing all 1000 bloats the saved notebook.
sample['text'].head(10).tolist()

["כולל השליטה במפרץ אילת בשארם א-שייח' ובאזור המֵצרים בראס נצרני, ששם היו מוצבים התותחים המצריים שחסמו את המעבר, איי טיראן וסנפיר ורוב החלק של מפרץ סואץ",
 'הם פוגשים ברכבת אדם שקט ומסתורי הידוע בשם מר אִם, אשר מציג להם מכשיר נייד קטן דמוי טלוויזיה בגודל של 15 ס"מ על 24 ס"מ',
 'במחזה הולדת שחר ושלם, הכולל בתחילתו דברי פתיחה ארוכים, וביניהם קריאה ל"לאלים נעימים, גוזרי ים, בני ים, יונקי שדי אשרה" (אלים אלה הם שחר ושלם), מוצגים אל ונשותיו, בשְׂדה אשרה ורחמי (היא כנראה ענת), על שפת הים',
 'מקור המילה "גמלאן" במילה "גָמֶל" בשפת יאווה, המתייחסת או לסוג הפטישון המשמש להקשה על הכלים או לפעולת ההקשה בפטישון',
 'בגילף כְּבּיר מצא כמאל א-דין אתרי אמנות סלע פרהיסטוריים לרוב',
 "ראשיד גוּמארוביץ' נורגאלייב (ברוסית: Рашид Гумарович Нургалиев; נולד ב-8 באוקטובר 1956, קזחסטן הסובייטית, ברית המועצות) הוא מדינאי רוסי",
 'במרץ 2021 השיקו האחיות אל ודקוטה פנינג חברת הפקות עצמאית בבעלותן, בשם "סרטי לוּאֵלֵן" (Lewellen Pictures)',
 'צְבָת (ברבים: צְבָתוֹת) הוא איבר בפרוקי רגליים, המאפשר להם לשאת משאות, להתג

In [14]:
async def process_batch_chunks(sample_df: pd.DataFrame, nikud_prompt: str, chunk_size: int = 20, n_outputs: int = 5):
    """
    Query the model for every row of ``sample_df`` in fixed-size chunks and
    store a per-word uncertainty mask for each row.

    The DataFrame is modified in place: the 'uncertainty_word_mask' column is
    first reset to None for all rows and then filled, row by row, with the
    string form of the mask. Nothing is returned.

    Args:
        sample_df: The DataFrame containing the 'text' column to process.
        nikud_prompt: The prefix to add to each prompt.
        chunk_size: The number of prompts to send in one batch.
        n_outputs: The number of candidate responses to request per prompt.
    """
    # Pair every row label with its prompt (prefix + text stripped of nikud).
    row_labels = sample_df['text'].index
    all_prompts = [nikud_prompt + strip_nikud(raw_text) for raw_text in sample_df['text'].tolist()]
    labelled_prompts = list(zip(row_labels, all_prompts))

    # Start from an all-None object column so each cell can later hold a string.
    sample_df['uncertainty_word_mask'] = pd.Series([None] * len(sample_df), index=sample_df.index, dtype='object')

    n_prompts = len(labelled_prompts)

    for start in range(0, n_prompts, chunk_size):
        batch = labelled_prompts[start:start + chunk_size]
        batch_labels = [label for label, _ in batch]
        batch_prompts = [prompt for _, prompt in batch]

        # Generations come back in the same order as `batch_prompts`.
        generations = await run_async_batch_prompts(batch_prompts, n_outputs=n_outputs)

        # Compute the mask of every row in the batch before writing anything.
        batch_masks = []
        for position, label in enumerate(batch_labels):
            word_mask, _ = compute_word_entropy_mask(
                sample_df.loc[label, 'text'],  # original text, looked up by index label
                generations[position],
                threshold=0.0
            )
            batch_masks.append(word_mask)

        # Write one cell at a time; the mask is stored as its string form.
        for position, (label, word_mask) in enumerate(zip(batch_labels, batch_masks)):
            if 1 not in word_mask:
                print(f"Warning: No uncertain words found for index {label}. Mask: {word_mask}, Results: {generations[position]}")
            sample_df.loc[label, 'uncertainty_word_mask'] = str(word_mask)

        print(f"Processed chunk {start//chunk_size + 1} / {math.ceil(n_prompts/chunk_size)}")

In [None]:
# Fill sample['uncertainty_word_mask'] in place: 10 prompts per batch, 5 candidates per prompt.
await process_batch_chunks(sample, nikud_prompt, chunk_size=10, n_outputs=5)

In [16]:
# Preview the first rows, now including the new 'uncertainty_word_mask' column.
sample.head()

Unnamed: 0,text,nikud_mask,article_title,article_length,text_length,nikud_mask_length,nikud_word_mask,uncertainty_word_mask
63883,כולל השליטה במפרץ אילת בשארם א-שייח' ובאזור המ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",הפלגת הרקולס,1953,150,151,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, ..."
57210,הם פוגשים ברכבת אדם שקט ומסתורי הידוע בשם מר א...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",מה אם...,294,117,118,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ..."
7359,"במחזה הולדת שחר ושלם, הכולל בתחילתו דברי פתיחה...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",אשרה (אלה),3986,207,207,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
39330,"מקור המילה ""גמלאן"" במילה ""גָמֶל"" בשפת יאווה, ה...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",גמלאן (מוזיקה),536,114,114,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
57601,בגילף כְּבּיר מצא כמאל א-דין אתרי אמנות סלע פר...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...",כמאל א-דין חוסיין,705,60,59,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 1, 1, 0, 1, 0, 1, 1]"


In [29]:
import ast

# Compute the overlap ratio in a separate step
def _parse_mask(mask):
    """Return a mask as a sequence, parsing its string form; None when missing.

    Masks read back from CSV arrive as strings such as "[0, 1, 0]". A cell
    that was never filled holds None or NaN (a float), both reported as None.
    """
    if isinstance(mask, str):
        mask = ast.literal_eval(mask)
    if mask is None or isinstance(mask, float):
        return None
    return mask


def _mask_overlap(row):
    """Return (overlap, total_uncertainty, total_nikud) for a row, or None if a mask is missing.

    ``overlap`` counts the positions where both masks are 1.
    """
    uncertainty_mask = _parse_mask(row['uncertainty_word_mask'])
    nikud_mask = _parse_mask(row['nikud_word_mask'])
    if uncertainty_mask is None or nikud_mask is None:
        return None
    overlap = sum(1 for u, n in zip(uncertainty_mask, nikud_mask) if u == 1 and n == 1)
    return overlap, sum(uncertainty_mask), sum(nikud_mask)


def overlap_ratio(row):
    """
    Fraction of nikud-marked words that are also marked uncertain.

    Args:
        row: Mapping or Series with 'uncertainty_word_mask' and
            'nikud_word_mask', each a 0/1 sequence or its string form.

    Returns:
        overlap / number of nikud-marked words, or None when the row has no
        nikud-marked word or either mask is missing.
    """
    counts = _mask_overlap(row)
    if counts is None:
        return None
    overlap, _, total_nikud = counts
    return overlap / total_nikud if total_nikud > 0 else None


def opposite_overlap_ratio(row):
    """
    Fraction of uncertain words that are also nikud-marked.

    Args:
        row: Mapping or Series with 'uncertainty_word_mask' and
            'nikud_word_mask', each a 0/1 sequence or its string form.

    Returns:
        overlap / number of uncertain words, or None when the row has no
        uncertain word or either mask is missing.
    """
    counts = _mask_overlap(row)
    if counts is None:
        return None
    overlap, total_uncertainty, _ = counts
    return overlap / total_uncertainty if total_uncertainty > 0 else None

In [None]:
# Per row: share of nikud-marked words that the model was also uncertain about.
# Rows without any nikud-marked word get None and are left out of describe()'s count.
sample['uncertainty_overlap_ratio'] = sample.apply(overlap_ratio, axis=1)
sample['uncertainty_overlap_ratio'].describe()

count    934.000000
mean       0.599596
std        0.474328
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: uncertainty_overlap_ratio, dtype: float64

In [30]:
# Per row: share of model-uncertain words that are nikud-marked in the source.
# Rows without any uncertain word get None and are left out of describe()'s count.
sample['uncertainty_overlap_ratio_2'] = sample.apply(opposite_overlap_ratio, axis=1)
sample['uncertainty_overlap_ratio_2'].describe()

count    928.000000
mean       0.133938
std        0.183728
min        0.000000
25%        0.000000
50%        0.083333
75%        0.200000
max        1.000000
Name: uncertainty_overlap_ratio_2, dtype: float64

In [20]:
# Keep rows where at most half of the nikud-marked words were flagged uncertain.
# Rows whose ratio is missing fail the comparison and are dropped.
not_enough_overlap = sample[sample['uncertainty_overlap_ratio'] <= 0.5]
# Alias iterated by the inspection loop.
test = not_enough_overlap

In [19]:
import ast

def mask_str(mask):
    """
    Render a mask as a compact string of its elements, e.g. [0, 1, 0] -> "010".

    Args:
        mask: A sequence of mask values, or the string form of such a
            sequence (parsed with ``ast.literal_eval``).

    Returns:
        str: The elements converted with ``str`` and joined without a separator.
    """
    values = ast.literal_eval(mask) if isinstance(mask, str) else mask
    return "".join(map(str, values))

In [21]:
# Print every low-overlap row with both masks stacked, so that differing
# word positions can be compared by eye.
for position in range(len(test)):
    row = test.iloc[position]
    print(f"Example {position+1}:")
    print("Text:", row['text'])
    print("Nikud Mask:      ", mask_str(row['nikud_word_mask']))
    print("Uncertainty Mask:", mask_str(row['uncertainty_word_mask']))
    print("Overlap Ratio:", row['uncertainty_overlap_ratio'])
    print()

Example 1:
Text: כולל השליטה במפרץ אילת בשארם א-שייח' ובאזור המֵצרים בראס נצרני, ששם היו מוצבים התותחים המצריים שחסמו את המעבר, איי טיראן וסנפיר ורוב החלק של מפרץ סואץ
Nikud Mask:       00000001000000000000000000
Uncertainty Mask: 01001100111000000000100001
Overlap Ratio: 0.0

Example 2:
Text: במחזה הולדת שחר ושלם, הכולל בתחילתו דברי פתיחה ארוכים, וביניהם קריאה ל"לאלים נעימים, גוזרי ים, בני ים, יונקי שדי אשרה" (אלים אלה הם שחר ושלם), מוצגים אל ונשותיו, בשְׂדה אשרה ורחמי (היא כנראה ענת), על שפת הים
Nikud Mask:       0000000000000000000000000000100000000
Uncertainty Mask: 0101000010000000000000001001001000001
Overlap Ratio: 0.0

Example 3:
Text: הר הגעש המזרחי הוא שריד של הר געש קדום, שהתפתח לאורך בקע של מַגְמָה
Nikud Mask:       00000000000001
Uncertainty Mask: 00000000000000
Overlap Ratio: 0.0

Example 4:
Text: מַמְרִים קתולים (מלשון הַמְרָאָה, במובן של "סירוב, אי-ציות"; באנגלית: Popish recusants, recusancy) היה כינויים של הנוצרים הרומים-קתולים באנגליה, שנותרו נאמנים בגלוי לאמונתם לאחר

In [24]:
# Load the second dataset (the path suggests Project Ben-Yehuda — TODO confirm).
# This file already ships an 'uncertainty_word_mask' column, which
# process_batch_chunks resets and recomputes for the rows it is given.
benyehuda_df = pd.read_csv('./datasets/projectbenyehuda/benyehuda_nikud_dataset_with_uncertainty_word_mask.csv')

In [25]:
# Preview the first rows and the available columns.
benyehuda_df.head()

Unnamed: 0,id,title,text,nikud_mask,text_length,nikud_mask_length,uncertainty_mask,nikud_word_mask,uncertainty_word_mask
0,20937,מערה מול הים,“שֶׁלּוֹ,"[0, 0, 1, 1, 1, 0]",8,6,"[0, 0, 0, 1, 1, 0]",[1],[1]
1,55085,ריבה ויהודה סיפור חיים,"ואני זוכר שאני אז סיפרתי לה, “תראי, מהניסיון ש...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",304,305,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,32321,אדלשטיין נגד הזמן,היתה כאן איזו תגובה נגדית להסתייגות ההורים מן ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",185,184,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,12015,אהבתו של חייל משוחרר,"ואם אין מזדמנת לו עבודת-ארעי בריצוף, שהכנסה צד...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",255,256,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,28801,ימים של תכלת,"רגע נדמה לך, שיש בזה אמת פשוטה וטבעית, ורגע נד...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",81,82,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [26]:
# Draw a fixed-seed random sample so a fresh "Restart & Run All" evaluates the same rows.
by_sample = benyehuda_df.sample(1000, random_state=42)

In [None]:
# Recompute by_sample['uncertainty_word_mask'] in place: 10 prompts per batch, 5 candidates per prompt.
await process_batch_chunks(by_sample, nikud_prompt, chunk_size=10, n_outputs=5)

In [28]:
# Per row: share of nikud-marked words that the model was also uncertain about.
# Rows without any nikud-marked word get None and are left out of describe()'s count.
by_sample['uncertainty_overlap_ratio'] = by_sample.apply(overlap_ratio, axis=1)
by_sample['uncertainty_overlap_ratio'].describe()

count    976.000000
mean       0.502172
std        0.464660
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000
Name: uncertainty_overlap_ratio, dtype: float64

In [31]:
# Per row: share of model-uncertain words that are nikud-marked in the source.
# Rows without any uncertain word get None and are left out of describe()'s count.
by_sample['uncertainty_overlap_ratio_2'] = by_sample.apply(opposite_overlap_ratio, axis=1)
by_sample['uncertainty_overlap_ratio_2'].describe()

count    884.000000
mean       0.188536
std        0.268157
min        0.000000
25%        0.000000
50%        0.090909
75%        0.250000
max        1.000000
Name: uncertainty_overlap_ratio_2, dtype: float64