In [2]:
# Import necessary libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots display inline
%matplotlib inline

In [3]:
# Sefaria API Base URL
SEFARIA_API_BASE = "https://www.sefaria.org/api/texts/"

In [13]:
def fetch_text(book, chapter):
    url = f"https://www.sefaria.org/api/texts/{book}.{chapter}?context=0&lang=he"
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data for {book} {chapter}")
    
    data = response.json()
    # Convert the list of verses to a dictionary with verse numbers as keys
    return {i + 1: verse for i, verse in enumerate(data['he'])}


In [6]:
# Example: Fetch the first chapter of Genesis
genesis_ch1 = fetch_text("Genesis", 1)
print(genesis_ch1)

['<big>בְּ</big>רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃', 'וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃', 'וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃', 'וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָא֖וֹר וּבֵ֥ין הַחֹֽשֶׁךְ׃', 'וַיִּקְרָ֨א אֱלֹהִ֤ים&thinsp;<small>׀</small>&thinsp;לָאוֹר֙ י֔וֹם וְלַחֹ֖שֶׁךְ קָ֣רָא לָ֑יְלָה וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם אֶחָֽד׃&nbsp;<span class="mam-spi-pe">{פ}</span><br>', 'וַיֹּ֣אמֶר אֱלֹהִ֔ים יְהִ֥י רָקִ֖יעַ בְּת֣וֹךְ הַמָּ֑יִם וִיהִ֣י מַבְדִּ֔יל בֵּ֥ין מַ֖יִם לָמָֽיִם׃', 'וַיַּ֣עַשׂ אֱלֹהִים֮ אֶת־הָרָקִ֒יעַ֒ וַיַּבְדֵּ֗ל בֵּ֤ין הַמַּ֙יִם֙ אֲשֶׁר֙ מִתַּ֣חַת לָרָקִ֔יעַ וּבֵ֣ין הַמַּ֔יִם אֲשֶׁ֖ר מֵעַ֣ל לָרָקִ֑יעַ וַֽיְהִי־כֵֽן׃', 'וַיִּקְרָ֧א אֱלֹהִ֛ים לָֽרָקִ֖יעַ שָׁמָ֑יִם וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם שֵׁנִֽי׃&nbsp;<span class="mam-spi-pe">{פ}</span><br>', 'וַיֹּ֣אמֶר אֱלֹהִ֗ים יִקָּו֨וּ הַמַּ֜יִם מִתַּ

In [None]:
import re

def clean_text(verse, keep_vowels=True, keep_cantillation=False):
    """
    Cleans a single verse of Hebrew text.
    
    Args:
        verse (str): The verse text in Hebrew.
        keep_vowels (bool): If True, preserve vowel points; otherwise, remove them.
        keep_cantillation (bool): If True, preserve cantillation marks; otherwise, remove them.
    
    Returns:
        str: The cleaned Hebrew text.
    """
    # Remove HTML-like tags
    cleaned_text = re.sub(r"<[^>]+>", "", verse)
    
    if not keep_cantillation:
        # Remove cantillation marks (range: \u0591-\u05AF)
        cleaned_text = re.sub(r"[\u0591-\u05AF]", "", cleaned_text)
    
    if not keep_vowels:
        # Remove vowel points (range: \u05B0-\u05C7, excluding maqaf \u05BE)
        cleaned_text = re.sub(r"[\u05B0-\u05C7&&[^\u05BE]]", "", cleaned_text)
    
    # Replace maqaf (־) with a space if vowels are not being kept
    if not keep_vowels:
        cleaned_text = re.sub(r"\u05BE", " ", cleaned_text)
    else:
        # Preserve maqaf as-is if vowels are kept
        cleaned_text = cleaned_text.replace("\u05BE", "־")
    
    # Clean up extra spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
    
    return cleaned_text


  cleaned_text = re.sub(r"[\u0591-\u05C7&&[^\u05BE]]", "", cleaned_text)  # Remove all cantillation marks and vowels except maqaf


In [49]:
print(genesis_text_with_vowels[1][1], genesis_text_with_vowels[1][2])
print(genesis_text_without_vowels[1][1], genesis_text_without_vowels[1][2])

בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִם וְאֵת הָאָֽרֶץ׃ וְהָאָרֶץ הָיְתָה תֹהוּ וָבֹהוּ וְחֹשֶׁךְ עַל־פְּנֵי תְהוֹם וְרוּחַ אֱלֹהִים מְרַחֶפֶת עַל־פְּנֵי הַמָּֽיִם׃
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל פְּנֵ֣י תְה֑וֹם וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל פְּנֵ֥י הַמָּֽיִם׃


In [7]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 24.8 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl (273 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1


In [30]:
genesis_text_no_vowels = fetch_book("Genesis")
print(genesis_text_no_vowels[1][1])  # Tokens without vowels


בְּרֵאשִית בָּרָא אֱלֹהִים אֵת הַשָּמַיִם וְאֵת הָאָֽרֶץ


In [31]:
genesis_text_with_vowels = fetch_book("Genesis", keep_vowels=True)
print(genesis_text_with_vowels[1][1])  # Tokens with vowels

בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִם וְאֵת הָאָֽרֶץ׃


In [None]:
import re

# Define a nested structure for Tanakh text
tanakh_structure = {
    "Genesis": 50,  # 50 chapters
    "Exodus": 40,
    "Leviticus": 27,
    "Numbers": 36,
    "Deuteronomy": 34,
    # Add the rest of the books...
}

# Cleaning and tokenization function
def clean_and_tokenize(verse):
    # Remove HTML-like tags
    cleaned_text = re.sub(r"<[^>]+>", "", verse)
    
    # Remove diacritics (cantillation marks and vowel points)
    cleaned_text = re.sub(r"[\u0591-\u05C7]", "", cleaned_text)
    
    # Tokenize based on Hebrew characters only
    tokens = re.findall(r"[\u05D0-\u05EA]+", cleaned_text)
    return tokens

# Function to fetch all chapters of a book with cleaning and tokenization
def fetch_book(book):
    chapters = {}
    for chapter in range(1, tanakh_structure[book] + 1):
        text = fetch_text(book, chapter)  # Using the `fetch_text` function
        
        # Clean and tokenize each verse in the chapter
        cleaned_chapter = {
            verse_num: clean_and_tokenize(verse) 
            for verse_num, verse in text.items()
        }
        chapters[chapter] = cleaned_chapter
    return chapters

# Example: Fetch all of Genesis
genesis_text = fetch_book("Genesis")



In [8]:
import nltk
from nltk.tokenize import wordpunct_tokenize

# Tokenize a verse
def tokenize_verse(verse):
    return wordpunct_tokenize(verse)

# Example
sample_verse = genesis_text[1][0]  # First verse of Genesis
tokens = tokenize_verse(sample_verse)
print(tokens)

['<', 'big', '>', 'ב', 'ְּ</', 'big', '>', 'ר', 'ֵ', 'אש', 'ִׁ֖', 'ית', 'ב', 'ָּ', 'ר', 'ָ֣', 'א', 'א', 'ֱ', 'ל', 'ֹ', 'ה', 'ִ֑', 'ים', 'א', 'ֵ֥', 'ת', 'ה', 'ַ', 'ש', 'ָּׁ', 'מ', 'ַ֖', 'י', 'ִ', 'ם', 'ו', 'ְ', 'א', 'ֵ֥', 'ת', 'ה', 'ָ', 'א', 'ָֽ', 'ר', 'ֶ', 'ץ', '׃']
