In [1]:
import pandas as pd

# Read the Ben-Yehuda nikud dataset from its CSV file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='./datasets/projectbenyehuda/benyehuda_nikud_dataset.csv',
)

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer a single time at notebook level; has_1s_run() reads this
# module-level `tokenizer` on every call instead of reloading it per row.
# TOKENIZER_NAME is the identifier handed to AutoTokenizer.from_pretrained.
TOKENIZER_NAME = "dicta-il/dictabert-large-char-menaked"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the first few rows of the dataframe
print(df.head())

# Show basic info about the dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

# Summary statistics for the character length of the text column.
# NOTE: this adds a helper column to df in place; it is carried along into
# every frame later derived from df.
df['text_length'] = df['text'].apply(len)
print(df[['text_length']].describe())

# Check for missing values
print(df.isnull().sum())

# Distribution of nikud_mask lengths. nikud_mask is stored as a string such as
# "[0, 0, 1, ...]", so splitting on whitespace yields one piece per mask entry.
# This also adds a helper column to df in place.
df['nikud_mask_length'] = df['nikud_mask'].apply(lambda x: len(x.split()))
print(df['nikud_mask_length'].value_counts().head())

      id           title                                               text  \
0  10026  איש הולך ברחוב  איש הולך ברחוב מאת רחל נגב אִישׁ הֹולֵךְ בָּרְ...   
1  10026  איש הולך ברחוב  אוּלַי אֵין לוֹ מִישֶׁהוּ לְדַבֵּר עִמּוֹ בַּב...   
2  10026  איש הולך ברחוב               וַדַּאי אֵין מִי שֶׁיְּדַבֵּר עִמּוֹ   
3  10026  איש הולך ברחוב                 אִישׁ הֹולֵךְ בָּרְחוֹב וּמְדַבֵּר   
4  10026  איש הולך ברחוב  אֶפְשָׁר לוֹמַר: אִישׁ הֹולֵךְ בָּרְחוֹב וּמְד...   

                                          nikud_mask  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, ...  
2  [0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, ...  
3  [0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, ...  
4  [0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55495 entries, 0 to 55494
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id

In [3]:
import re

# Character class that matches exactly one Hebrew pointing (nikud) mark.
# The code points are listed one per line so each can carry its name.
# U+05BE (maqaf), U+05C0 (paseq) and U+05C3 (sof pasuq) are punctuation rather
# than pointing marks, so they are not part of the class.
NIKUD_PATTERN = re.compile(
    '['
    '\u05B0'  # sheva
    '\u05B1'  # hataf segol
    '\u05B2'  # hataf patah
    '\u05B3'  # hataf qamats
    '\u05B4'  # hiriq
    '\u05B5'  # tsere
    '\u05B6'  # segol
    '\u05B7'  # patah
    '\u05B8'  # qamats
    '\u05B9'  # holam
    '\u05BA'  # holam haser for vav (previously missing from the class)
    '\u05BB'  # qubuts
    '\u05BC'  # dagesh or mapiq
    '\u05BD'  # meteg
    '\u05BF'  # rafe (rare)
    '\u05C1'  # shin dot
    '\u05C2'  # sin dot
    '\u05C7'  # qamats qatan
    ']'
)

In [4]:
def has_word_with_nikud(text):
    """Return True when at least one whitespace-separated word of `text`
    contains a character matched by NIKUD_PATTERN, otherwise False."""
    for word in text.split():
        if NIKUD_PATTERN.search(word):
            return True
    return False

# Evaluate the per-row nikud check on the 'text' column, then collapse the
# boolean results into a single "does every row pass?" answer.
all_have_nikud = df['text'].map(has_word_with_nikud).all()
print(f"All rows contain at least one word with nikud: {all_have_nikud}")

All rows contain at least one word with nikud: True


In [39]:
import ast

def find_next_space(tokens, index):
    """Return the position of the first " " token at or after `index`.

    The search starts at `index` itself (inclusive) and runs to the end of
    `tokens`. Returns None when no " " token is found in that range.
    """
    candidates = (pos for pos in range(index, len(tokens)) if tokens[pos] == " ")
    return next(candidates, None)

def find_previous_space(tokens, index):
    """Return the position of the closest " " token strictly before `index`.

    The search walks backwards from `index - 1` down to position 0.
    Returns None when no " " token is found in that range.
    """
    cursor = index - 1
    while cursor >= 0:
        if tokens[cursor] == " ":
            return cursor
        cursor -= 1
    return None

def has_1s_run(text, mask, min_run=1, max_run=3):
    """Tell whether `mask` holds a short run of 1s whose neighbouring words carry no 1s.

    The mask entries are joined into one string (each entry is expected to
    render as a single character such as 0 or 1). For every run length from
    `max_run` down to `min_run`, only the FIRST occurrence of that many
    consecutive 1s is examined; the substring search also matches inside a
    longer run of 1s.

    Word boundaries come from the " " tokens of `tokenizer.tokenize(text)`.
    Offsets found in the mask string are used directly as token offsets.
    NOTE(review): this presumes mask[k] describes tokens[k]; confirm that the
    mask and the tokenizer output are aligned (e.g. no special-token offset).

    A candidate run is accepted when
      * the span between the two " " tokens preceding the run (or from the
        start of the mask up to the single preceding " " token) has no 1, and
      * the span between the two " " tokens following the run (or from the
        single following " " token to the end of the mask) has no 1.
    A side with no " " token at all counts as clean.

    Args:
        text: Sentence handed to the module-level `tokenizer`.
        mask: Iterable of mask entries for that sentence.
        min_run: Shortest run length to look for.
        max_run: Longest run length to look for.

    Returns:
        True as soon as one examined run is accepted, otherwise False.
    """
    bits = ''.join(map(str, mask))
    tokens = tokenizer.tokenize(text)

    # Longest run first, mirroring a countdown from max_run to min_run.
    for width in reversed(range(min_run, max_run + 1)):
        start = bits.find('1' * width)
        if start == -1:
            continue

        # Left side: locate the word that ends at the space before the run.
        left_space = find_previous_space(tokens, start)
        far_left_space = None if left_space is None else find_previous_space(tokens, left_space)
        if left_space is None:
            left_clean = True
        elif far_left_space is None:
            left_clean = '1' not in bits[0:left_space]
        else:
            left_clean = '1' not in bits[far_left_space + 1:left_space]

        # Right side: locate the word that starts at the space after the run.
        right_space = find_next_space(tokens, start + width)
        far_right_space = None if right_space is None else find_next_space(tokens, right_space + 1)
        if right_space is None:
            right_clean = True
        elif far_right_space is None:
            right_clean = '1' not in bits[right_space + 1:]
        else:
            right_clean = '1' not in bits[right_space + 1:far_right_space]

        if left_clean and right_clean:
            return True

    return False

def column_filter(row):
    """Row-wise predicate for DataFrame.apply(axis=1).

    Reads the row's 'text' value and its 'nikud_mask' value (a string holding
    a Python list literal, parsed with ast.literal_eval) and returns the
    result of has_1s_run for run lengths 1 through 3.
    """
    sentence = row['text']
    parsed_mask = ast.literal_eval(row['nikud_mask'])
    return has_1s_run(sentence, parsed_mask, min_run=1, max_run=3)

In [40]:
# Draw a small random sample for spot-checking the filter. A fixed seed keeps
# the sampled rows (and every output derived from them) identical across
# kernel restarts and re-runs.
SAMPLE_SEED = 42
filtered_rows = df.sample(5, random_state=SAMPLE_SEED)

# Show the raw (string-encoded) mask of the first sampled row.
filtered_rows.iloc[0]['nikud_mask']


'[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'

In [41]:
# Keep only the sampled rows for which column_filter returns True.
sample_keep = filtered_rows.apply(column_filter, axis=1)
filtered_rows = filtered_rows[sample_keep]
filtered_rows

Unnamed: 0,id,title,text,nikud_mask,text_length,nikud_mask_length
55386,9155,אורח נוסח א,לבֶּן ותפוחי־אדמה – חשב בלבו – חלב חמוץ ותפוחי...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",59,59
2601,12048,אוב,החיים מקבלים יותר ויותר אופי של מחול־שכחה של ס...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",123,126


In [42]:
# Full text of the first sampled row that passed the filter.
filtered_rows['text'].iloc[0]

'לבֶּן ותפוחי־אדמה – חשב בלבו – חלב חמוץ ותפוחי־אדמה מטוגנים'

In [43]:
# Run the row predicate over the whole dataset and keep the rows that pass.
row_passes = df.apply(column_filter, axis=1)
filtered_df = df[row_passes]
filtered_df.shape

(30388, 6)

In [None]:
# Take the first filtered row's text and string-encoded mask by column label
# instead of by numeric position, so the lookup survives column reordering.
first_filtered_row = filtered_df.iloc[0]
sample_text = first_filtered_row['text']
sample_mask = first_filtered_row['nikud_mask']

# Sanity check: a row kept by column_filter satisfies has_1s_run with its
# default run bounds (1 to 3), so this is expected to display True.
has_1s_run(sample_text, ast.literal_eval(sample_mask))


True

In [45]:
# Preview the first five filtered rows (head() returns five rows by default).
filtered_df.head()

Unnamed: 0,id,title,text,nikud_mask,text_length,nikud_mask_length
11,1034,אחר חצות,"וברכּוּת נלבבת כל כך… אין זה גבר, נגיעה קלילה ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",70,72
12,1034,אחר חצות,"ולא דפיקתה הקדחתנית של אשה היא, הבאה בצל-הלילה...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",90,92
13,1034,אחר חצות,"אך אחותי שומרת את ערשׂ-חליה, ואמי שלי רחוקה, מ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",101,103
14,1034,אחר חצות,"ויודע אני, כי זו הדופקת ולבי שומע, תשמע גם ותא...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",76,77
15,1034,אחר חצות,שׂרה בת-טובים,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",13,14


In [46]:
# Write the filtered rows to disk; index=False leaves the row index out of the
# file. The helper columns text_length and nikud_mask_length are written too.
filtered_df.to_csv(
    path_or_buf='./datasets/projectbenyehuda/benyehuda_nikud_dataset_filtered.csv',
    index=False,
)