In [None]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
import time

# Inclusive range of listing pages visited by scrape_hebrew_roots
# (it iterates over range(start_page, end_page + 1)).
start_page, end_page = 1, 608

# https://symbl.cc/en/unicode/table/#hebrew
def is_hebrew_cons(char):
    """Report whether ``char`` falls in the range U+05D0..U+05EA, inclusive.

    The check is a plain string comparison against the two bounds, so code
    points below U+05D0 (for example the Hebrew points and accents) and
    anything above U+05EA are rejected.

    Args:
        char: String to test; intended to be a single character.

    Returns:
        True when ``'\\u05D0' <= char <= '\\u05EA'``, otherwise False.
    """
    lowest, highest = '\u05D0', '\u05EA'
    # Guard clause: anything ordered before the lower bound is out of range.
    if char < lowest:
        return False
    return char <= highest

def process_hebrew_string(hebrew_string):
    """Return, in order, the characters of ``hebrew_string`` accepted by ``is_hebrew_cons``.

    Args:
        hebrew_string: Text to filter character by character.

    Returns:
        A tuple of the characters for which ``is_hebrew_cons`` is true;
        every other character is discarded.
    """
    return tuple(filter(is_hebrew_cons, hebrew_string))

def scrape_hebrew_roots(base_url, first_page=None, last_page=None):
    """Scrape the paginated dictionary table and return its rows.

    Every page ``f"{base_url}{pg}"`` in the requested range is opened in a
    headless Firefox session, and each ``<tr>`` of the table whose class is
    ``table table-hover dict-table-t`` is read.

    Args:
        base_url: URL prefix to which the page number is appended.
        first_page: First page to fetch (inclusive). Defaults to the
            module-level ``start_page``.
        last_page: Last page to fetch (inclusive). Defaults to the
            module-level ``end_page``.

    Returns:
        A list with one entry per table row that has at least one ``<td>``
        cell. Each entry is the list of cell texts, except that the second
        cell (when present) is replaced by the tuple returned by
        ``process_hebrew_string``.

    Note:
        Exceptions raised while scraping are printed, not propagated; the
        rows collected before the failure are still returned, so the result
        may be partial.
    """
    if first_page is None:
        first_page = start_page
    if last_page is None:
        last_page = end_page

    options = Options()
    # Pass the flag explicitly. The `options.headless` setter was deprecated
    # in Selenium 4.8 and later removed; on those versions assigning it is
    # silently ignored and a visible browser window opens.
    options.add_argument("-headless")

    # Disable loading of images
    options.set_preference('permissions.default.image', 2)

    # Block cookies and browse privately (keeps tracking/analytics cookies out)
    options.set_preference('network.cookie.cookieBehavior', 2)
    options.set_preference('browser.privatebrowsing.autostart', True)

    driver = webdriver.Firefox(options=options)

    table_data = []

    try:
        for pg in range(first_page, last_page + 1):
            driver.get(f"{base_url}{pg}")

            rows = driver.find_elements(By.XPATH, "//table[@class='table table-hover dict-table-t']//tr")

            for row in rows:
                # Extract text from all cells in the row
                row_data = [cell.text for cell in row.find_elements(By.XPATH, "./td")]

                # Rows without any <td> cell carry no data; skip them.
                if not row_data:
                    continue

                # Reduce the second cell to its Hebrew letters.
                if len(row_data) >= 2:
                    row_data[1] = process_hebrew_string(row_data[1])

                table_data.append(row_data)

            # Short pause between page requests.
            time.sleep(0.1)
    except Exception as e:
        # Best effort: report the failure and fall through so the rows
        # gathered so far are still returned.
        print(f"An error occurred: {e}")
    finally:
        # Always shut the browser down, even after an error.
        driver.quit()

    return table_data

In [None]:
# URL prefix of the dictionary listing; the scraper appends the page number.
base_url = "https://www.pealim.com/dict/?pos=all&num-radicals=all&page="

# Run the scrape and keep the resulting rows in memory.
hebrew_roots = scrape_hebrew_roots(base_url)

# Persist the raw rows. ensure_ascii=False writes non-ASCII characters as-is
# instead of \uXXXX escapes.
with open("hebrew_roots.json", mode="w", encoding="utf-8") as raw_out:
    json.dump(hebrew_roots, fp=raw_out, indent=4, ensure_ascii=False)

In [None]:
# Load the raw scraped rows written by the scraping cell.
with open('hebrew_roots.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten every row into: lines of cell 0, cell 1, the two halves of cell 2,
# then the remaining cells.
newdata = []
for r in data:
    # Cell 0 holds several lines of text; each line becomes its own field.
    splitword = r[0].splitlines()
    # Cell 2 looks like "<part of speech> – <detail>". partition() always
    # yields exactly two fields (the second is "" when the separator is
    # absent, and keeps any further separators intact), so every row gets
    # the same number of fields and the following cells stay aligned.
    # split() produced one field for entries without the separator, which
    # shifted the trailing cells one column to the left downstream.
    pos, _sep, pos_detail = r[2].partition(' – ')
    newdata.append(splitword + [r[1], pos, pos_detail] + r[3:])

print(len(data))

with open("hebrew_roots_postproc.json", "w", encoding="utf-8") as json_file:
    json.dump(newdata, json_file, ensure_ascii=False, indent=4)

In [2]:
import pandas as pd
import json

# Read the post-processed rows back from disk.
with open('hebrew_roots_postproc.json', mode='r', encoding='utf-8') as postproc_in:
    data = json.loads(postproc_in.read())

# Header names, in the order of the fields in each row.
column_labels = [
    "Word (he)",
    "Word (en)",
    "Root",
    "Part of speech",
    "Binyan",
    "Meaning",
]

# Build the frame first, then label it; the assignment raises if the frame
# does not have exactly as many columns as there are labels.
df = pd.DataFrame(data)
df.columns = column_labels

# Write the dictionary without the positional index column.
df.to_csv("../hebrew_dict/hebrew.csv", index=False)