In [1]:
!pip install requests pandas matplotlib seaborn docx python-docx

Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml (from docx)
  Downloading lxml-5.3.0-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Collecting typing-extensions>=4.9.0 (from python-docx)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading lxml-5.3.0-cp313-cp313-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   -- ------------------------------------- 0.3/3.8 MB ? eta -:--:--
   -------- ------------------------------- 0.


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure plots display inline
%matplotlib inline

In [9]:
# Sefaria API Base URL
SEFARIA_API_BASE = "https://www.sefaria.org/api/texts/"

In [4]:
tanakh_structure = {
    # Torah
    "Genesis": 50,  # 50 chapters
    "Exodus": 40,
    "Leviticus": 27,
    "Numbers": 36,
    "Deuteronomy": 34,
    
    # Nevi'im (Prophets)
    "Joshua": 24,
    "Judges": 21,
    "1_Samuel": 31,
    "2_Samuel": 24,
    "1_Kings": 22,
    "2_Kings": 25,
    "Isaiah": 66,
    "Jeremiah": 52,
    "Ezekiel": 48,
    "Hosea": 14,
    "Joel": 4,
    "Amos": 9,
    "Obadiah": 1,
    "Jonah": 4,
    "Micah": 7,
    "Nahum": 3,
    "Habakkuk": 3,
    "Zephaniah": 3,
    "Haggai": 2,
    "Zechariah": 14,
    "Malachi": 3,

    # Ketuvim (Writings)
    "Psalms": 150,
    "Proverbs": 31,
    "Job": 42,
    "Song_of_Songs": 8,
    "Ruth": 4,
    "Lamentations": 5,
    "Ecclesiastes": 12,
    "Esther": 10,
    "Daniel": 12,
    "Ezra": 10,
    "Nehemiah": 13,
    "1_Chronicles": 29,
    "2_Chronicles": 36
}


In [3]:
import re
import html 
import requests 
from docx import Document

def clean_text(verse, keep_vowels=False, keep_cantillation=False):
    """
    Cleans a single verse of Hebrew text.
    
    Args:
        verse (str): The verse text in Hebrew.
        keep_vowels (bool): If True, preserve vowel points; otherwise, remove them.
        keep_cantillation (bool): If True, preserve cantillation marks; otherwise, remove them.
    
    Returns:
        str: The cleaned Hebrew text.
    """
    # Remove HTML-like tags 
    cleaned_text = re.sub(r"<[^>]+>", "", verse) 
    
    # Replace HTML entities with their corresponding characters 
    cleaned_text = html.unescape(cleaned_text) 
    
    # Remove any remaining curly braces and their contents 
    
    cleaned_text = re.sub(r"{[^}]+}", "", cleaned_text)
    
    # Adjusting the treatment of the maqaf
    if not keep_vowels or not keep_cantillation:
        cleaned_text = cleaned_text.replace("\u05BE", " ")  # Replace maqaf (־) with a space if either vowels or cantillation are removed
    else:
        cleaned_text = cleaned_text.replace("\u05BE", "־")  # Keep maqaf as-is otherwise


    if not keep_cantillation:
        # Remove cantillation marks (range: \u0591-\u05AF)
        cleaned_text = re.sub(r"[\u0591-\u05AF]", "", cleaned_text)
    
    if not keep_vowels:
        # Remove vowel points (range: \u05B0-\u05C7), except for maqaf (U+05BE)
        cleaned_text = re.sub(r"[\u05B0-\u05C7]", "", cleaned_text)  # Remove all vowels


    # Clean up any extra spaces created
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
    
    return cleaned_text

def fetch_text(book, chapter):
    url = f"https://www.sefaria.org/api/texts/{book}.{chapter}?context=0&lang=he"
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data for {book} {chapter}")
    
    data = response.json()
    # Convert the list of verses to a dictionary with verse numbers as keys
    return {i + 1: verse for i, verse in enumerate(data['he'])}
def fetch_book(book, keep_vowels=False, keep_cantillation=False):
    chapters = {}
    for chapter in range(1, tanakh_structure[book] + 1):
        text = fetch_text(book, chapter)
        cleaned_chapter = {
            verse_num: clean_text(verse, keep_vowels=keep_vowels, keep_cantillation=keep_cantillation)
            for verse_num, verse in text.items()
        }
        chapters[chapter] = cleaned_chapter
    return chapters



In [None]:
# Example: Fetch all of Genesis with different options
# Remove both vowels and cantillation
genesis_text_without_both = fetch_book("Genesis", keep_vowels=False, keep_cantillation=False)

# Preserve vowels but remove cantillation
genesis_text_with_vowels = fetch_book("Genesis", keep_vowels=True, keep_cantillation=False)

# Preserve both vowels and cantillation
genesis_text_full = fetch_book("Genesis", keep_vowels=True, keep_cantillation=True)

# Preserve cantillation but remove vowels
genesis_text_with_cantillation = fetch_book("Genesis", keep_vowels=False, keep_cantillation=True)

# Print the first two verses of Genesis for all versions
print("With vowels:")
print(genesis_text_with_vowels[1][1], genesis_text_with_vowels[1][2])

print("\nWithout both (vowels and cantillation):")
print(genesis_text_without_both[1][1], genesis_text_without_both[1][2])

print("\nWith cantillation (no vowels):")
print(genesis_text_with_cantillation[1][1], genesis_text_with_cantillation[1][2])

print("\nWith both vowels and cantillation:")
print(genesis_text_full[1][1], genesis_text_full[1][2])

In [None]:
def tokenize(text):
    """
    Tokenizes the cleaned text into individual words, ensuring the maqef is properly handled.
    
    Args:
        text (str): The cleaned Hebrew text.
    
    Returns:
        list: A list of words (tokens).
    """
    # Add a space after maqef (־)
    text = text.replace('־', '־ ')
    # Split text into words
    tokens = text.split()
    return tokens


def fetch_and_tokenize_book(book, keep_vowels=False, keep_cantillation=False):
    chapters = fetch_book(book, keep_vowels=keep_vowels, keep_cantillation=keep_cantillation)
    tokenized_chapters = {
        chapter: {verse_num: tokenize(verse) for verse_num, verse in verses.items()}
        for chapter, verses in chapters.items()
    }
    return tokenized_chapters

In [None]:
import sqlite3

def create_database():
    with sqlite3.connect('tanakh.db') as conn:
        cursor = conn.cursor()

        # Drop existing tables if they exist
        cursor.execute('DROP TABLE IF EXISTS Books')
        cursor.execute('DROP TABLE IF EXISTS Chapters')
        cursor.execute('DROP TABLE IF EXISTS Verses')
        cursor.execute('DROP TABLE IF EXISTS Words')

        # Create tables
        cursor.execute('''
        CREATE TABLE Books (
            id INTEGER PRIMARY KEY,
            name TEXT
        )
        ''')

        cursor.execute('''
        CREATE TABLE Chapters (
            id INTEGER PRIMARY KEY,
            book_id INTEGER,
            number INTEGER,
            parsha_name TEXT,
            FOREIGN KEY(book_id) REFERENCES Books(id)
        )
        ''')

        cursor.execute('''
        CREATE TABLE Verses (
            id INTEGER PRIMARY KEY,
            chapter_id INTEGER,
            number INTEGER,
            FOREIGN KEY(chapter_id) REFERENCES Chapters(id)
        )
        ''')

        cursor.execute('''
        CREATE TABLE Words (
            id INTEGER PRIMARY KEY,
            verse_id INTEGER,
            "order" INTEGER,
            word_without_both TEXT,
            word_with_vowels TEXT,
            word_with_cantillation TEXT,
            word_full TEXT,
            parsha_name TEXT,
            FOREIGN KEY(verse_id) REFERENCES Verses(id)
        )
        ''')

        print("Database created successfully!")



parsha_ranges = {
    "Bereshit": ("Genesis", (1, 1), (6, 8)),
    "Noach": ("Genesis", (6, 9), (11, 32)),
    "Lech-Lecha": ("Genesis", (12, 1), (17, 27)),
    "Vayeira": ("Genesis", (18, 1), (22, 24)),
    "Chayei Sarah": ("Genesis", (23, 1), (25, 18)),
    "Toldot": ("Genesis", (25, 19), (28, 9)),
    "Vayetzei": ("Genesis", (28, 10), (32, 3)),
    "Vayishlach": ("Genesis", (32, 4), (36, 43)),
    "Vayeshev": ("Genesis", (37, 1), (40, 23)),
    "Miketz": ("Genesis", (41, 1), (44, 17)),
    "Vayigash": ("Genesis", (44, 18), (47, 27)),
    "Vayechi": ("Genesis", (47, 28), (50, 26)),
    "Shemot": ("Exodus", (1, 1), (6, 1)),
    "Va'eira": ("Exodus", (6, 2), (9, 35)),
    "Bo": ("Exodus", (10, 1), (13, 16)),
    "Beshalach": ("Exodus", (13, 17), (17, 16)),
    "Yitro": ("Exodus", (18, 1), (20, 23)),
    "Mishpatim": ("Exodus", (21, 1), (24, 18)),
    "Terumah": ("Exodus", (25, 1), (27, 19)),
    "Tetzaveh": ("Exodus", (27, 20), (30, 10)),
    "Ki Tisa": ("Exodus", (30, 11), (34, 35)),
    "Vayakhel": ("Exodus", (35, 1), (38, 20)),
    "Pekudei": ("Exodus", (38, 21), (40, 38)),
    "Vayikra": ("Leviticus", (1, 1), (5, 26)),
    "Tzav": ("Leviticus", (6, 1), (8, 36)),
    "Shemini": ("Leviticus", (9, 1), (11, 47)),
    "Tazria": ("Leviticus", (12, 1), (13, 59)),
    "Metzora": ("Leviticus", (14, 1), (15, 33)),
    "Acharei Mot": ("Leviticus", (16, 1), (18, 30)),
    "Kedoshim": ("Leviticus", (19, 1), (20, 27)),
    "Emor": ("Leviticus", (21, 1), (24, 23)),
    "Behar": ("Leviticus", (25, 1), (26, 2)),
    "Bechukotai": ("Leviticus", (26, 3), (27, 34)),
    "Bamidbar": ("Numbers", (1, 1), (4, 20)),
    "Naso": ("Numbers", (4, 21), (7, 89)),
    "Beha'alotcha": ("Numbers", (8, 1), (12, 16)),
    "Shelach": ("Numbers", (13, 1), (15, 41)),
    "Korach": ("Numbers", (16, 1), (18, 32)),
    "Chukat": ("Numbers", (19, 1), (22, 1)),
    "Balak": ("Numbers", (22, 2), (25, 9)),
    "Pinchas": ("Numbers", (25, 10), (30, 1)),
    "Matot": ("Numbers", (30, 2), (32, 42)),
    "Masei": ("Numbers", (33, 1), (36, 13)),
    "Devarim": ("Deuteronomy", (1, 1), (3, 22)),
    "Va'etchanan": ("Deuteronomy", (3, 23), (7, 11)),
    "Eikev": ("Deuteronomy", (7, 12), (11, 25)),
    "Re'eh": ("Deuteronomy", (11, 26), (16, 17)),
    "Shoftim": ("Deuteronomy", (16, 18), (21, 9)),
    "Ki Teitzei": ("Deuteronomy", (21, 10), (25, 19)),
    "Ki Tavo": ("Deuteronomy", (26, 1), (29, 8)),
    "Nitzavim": ("Deuteronomy", (29, 9), (30, 20)),
    "Vayelech": ("Deuteronomy", (31, 1), (31, 30)),
    "Ha'azinu": ("Deuteronomy", (32, 1), (32, 52)),
    "V'Zot HaBerachah": ("Deuteronomy", (33, 1), (34, 12))
}

def get_parsha_chapters_verses(parsha_name):
    return parsha_ranges.get(parsha_name)

def populate_database(book_name):
    with sqlite3.connect('tanakh.db') as conn:
        cursor = conn.cursor()

        print("Inserting book...")
        # Insert book
        cursor.execute('INSERT INTO Books (name) VALUES (?)', (book_name,))
        book_id = cursor.lastrowid
        print(f"Book ID: {book_id}")

        print("Fetching and tokenizing book text...")
        # Fetch and tokenize book text
        tokenized_text_without_both = fetch_and_tokenize_book(book_name, keep_vowels=False, keep_cantillation=False)
        tokenized_text_with_vowels = fetch_and_tokenize_book(book_name, keep_vowels=True, keep_cantillation=False)
        tokenized_text_full = fetch_and_tokenize_book(book_name, keep_vowels=True, keep_cantillation=True)
        tokenized_text_with_cantillation = fetch_and_tokenize_book(book_name, keep_vowels=False, keep_cantillation=True)

        for chapter_num, verses in tokenized_text_without_both.items():
            print(f"Inserting chapter {chapter_num}...")
            cursor.execute('INSERT INTO Chapters (book_id, number) VALUES (?, ?)', (book_id, chapter_num))
            chapter_id = cursor.lastrowid

            for verse_num, tokens in verses.items():
                print(f"Inserting verse {verse_num} in chapter {chapter_num}...")
                cursor.execute('INSERT INTO Verses (chapter_id, number) VALUES (?, ?)', (chapter_id, verse_num))
                verse_id = cursor.lastrowid

                # Determine the parsha name for this chapter and verse
                current_parsha = None
                for parsha_name, (book, start, end) in parsha_ranges.items():
                    if book == book_name and (start[0] <= chapter_num <= end[0]) and \
                    ((chapter_num > start[0]) or (start[0] == chapter_num and verse_num >= start[1])) and \
                    ((chapter_num < end[0]) or (end[0] == chapter_num and verse_num <= end[1])):
                        current_parsha = parsha_name
                        print(f"Parsha for chapter {chapter_num}, verse {verse_num} is {current_parsha}")
                        break
                
                for order, word_without_both in enumerate(tokens, start=1):
                    try:
                        word_with_vowels = tokenized_text_with_vowels[chapter_num][verse_num][order-1]
                    except IndexError:
                        word_with_vowels = None  # Handle missing word

                    try:
                        word_full = tokenized_text_full[chapter_num][verse_num][order-1]
                    except IndexError:
                        word_full = None  # Handle missing word

                    try:
                        word_with_cantillation = tokenized_text_with_cantillation[chapter_num][verse_num][order-1]
                    except IndexError:
                        word_with_cantillation = None  # Handle missing word

                    cursor.execute('''
                    INSERT INTO Words (verse_id, "order", word_without_both, word_with_vowels, word_with_cantillation, word_full, parsha_name)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ''', (verse_id, order, word_without_both, word_with_vowels, word_with_cantillation, word_full, current_parsha))
                    print(f"Inserted word {order} in verse {verse_num}, chapter {chapter_num}, parsha {current_parsha}")

    print("Database populated successfully!")

Database created successfully!


In [5]:
import sqlite3

def check_database():
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()

    # Check the Books table
    cursor.execute('SELECT * FROM Books')
    books = cursor.fetchall()
    print("Books table:")
    for book in books:
        print(book)

    # Check the Chapters table
    cursor.execute('SELECT * FROM Chapters')
    chapters = cursor.fetchall()
    print("\nChapters table:")
    for chapter in chapters:
        print(chapter)

    # Check the Verses table
    cursor.execute('SELECT * FROM Verses')
    verses = cursor.fetchall()
    print("\nVerses table:")
    for verse in verses:
        print(verse)

    # Check the Words table
    cursor.execute('SELECT * FROM Words LIMIT 100')  # Limiting to 100 rows for brevity
    words = cursor.fetchall()
    print("\nWords table:")
    for word in words:
        print(word)

    conn.close()

# Run the check
check_database()

Books table:
(1, 'Genesis')
(2, 'Exodus')
(3, 'Leviticus')
(4, 'Numbers')
(5, 'Deuteronomy')

Chapters table:
(1, 1, 1, None)
(2, 1, 2, None)
(3, 1, 3, None)
(4, 1, 4, None)
(5, 1, 5, None)
(6, 1, 6, None)
(7, 1, 7, None)
(8, 1, 8, None)
(9, 1, 9, None)
(10, 1, 10, None)
(11, 1, 11, None)
(12, 1, 12, None)
(13, 1, 13, None)
(14, 1, 14, None)
(15, 1, 15, None)
(16, 1, 16, None)
(17, 1, 17, None)
(18, 1, 18, None)
(19, 1, 19, None)
(20, 1, 20, None)
(21, 1, 21, None)
(22, 1, 22, None)
(23, 1, 23, None)
(24, 1, 24, None)
(25, 1, 25, None)
(26, 1, 26, None)
(27, 1, 27, None)
(28, 1, 28, None)
(29, 1, 29, None)
(30, 1, 30, None)
(31, 1, 31, None)
(32, 1, 32, None)
(33, 1, 33, None)
(34, 1, 34, None)
(35, 1, 35, None)
(36, 1, 36, None)
(37, 1, 37, None)
(38, 1, 38, None)
(39, 1, 39, None)
(40, 1, 40, None)
(41, 1, 41, None)
(42, 1, 42, None)
(43, 1, 43, None)
(44, 1, 44, None)
(45, 1, 45, None)
(46, 1, 46, None)
(47, 1, 47, None)
(48, 1, 48, None)
(49, 1, 49, None)
(50, 1, 50, None)
(51, 2, 

In [15]:
import sqlite3

# Assume previous functions like create_database, fetch_and_tokenize_book, etc., are already defined

torah_books = ["Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy"]

def populate_torah_books():
    with sqlite3.connect('tanakh.db') as conn:
        cursor = conn.cursor()
        
        for book_name in torah_books:
            print(f"Populating database for the book of {book_name}...")
            populate_database(book_name)
            print(f"Database populated successfully for {book_name}!")

populate_torah_books()

# Check the database after populating the Torah books
check_database()


Populating database for the book of Genesis...
Inserting book...
Book ID: 1
Fetching and tokenizing book text...
Inserting chapter 1...
Inserting verse 1 in chapter 1...
Parsha for chapter 1, verse 1 is Bereshit
Inserted word 1 in verse 1, chapter 1, parsha Bereshit
Inserted word 2 in verse 1, chapter 1, parsha Bereshit
Inserted word 3 in verse 1, chapter 1, parsha Bereshit
Inserted word 4 in verse 1, chapter 1, parsha Bereshit
Inserted word 5 in verse 1, chapter 1, parsha Bereshit
Inserted word 6 in verse 1, chapter 1, parsha Bereshit
Inserted word 7 in verse 1, chapter 1, parsha Bereshit
Inserting verse 2 in chapter 1...
Parsha for chapter 1, verse 2 is Bereshit
Inserted word 1 in verse 2, chapter 1, parsha Bereshit
Inserted word 2 in verse 2, chapter 1, parsha Bereshit
Inserted word 3 in verse 2, chapter 1, parsha Bereshit
Inserted word 4 in verse 2, chapter 1, parsha Bereshit
Inserted word 5 in verse 2, chapter 1, parsha Bereshit
Inserted word 6 in verse 2, chapter 1, parsha Beresh

In [12]:
def get_verse(book_name, chapter, verse_range, version_type='plain', word_range=None):
    version_column = {
        'plain': 'word_without_both',
        'vowels': 'word_with_vowels',
        'cantillation': 'word_with_cantillation',
        'both': 'word_full'
    }.get(version_type, 'word_without_both')
    
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()
    
    # Get book ID
    cursor.execute('SELECT id FROM Books WHERE name = ?', (book_name,))
    book_id = cursor.fetchone()[0]

    # Query to get verses within the range
    query = f'''
    SELECT Verses.number AS verse,
           Words."order" AS word_order,
           Words.{version_column} AS word
    FROM Words
    JOIN Verses ON Words.verse_id = Verses.id
    JOIN Chapters ON Verses.chapter_id = Chapters.id
    WHERE Chapters.book_id = ?
      AND Chapters.number = ?
      AND Verses.number BETWEEN ? AND ?
    ORDER BY Verses.number, Words."order"
    '''
    cursor.execute(query, (book_id, chapter, verse_range[0], verse_range[1]))
    data = cursor.fetchall()
    
    conn.close()
    
    # Process data based on word range
    verses = {}
    for verse_num, word_order, word in data:
        if word_range is None or (word_range[0] <= word_order <= word_range[1]):
            if verse_num not in verses:
                verses[verse_num] = []
            verses[verse_num].append(word)
    
    return {verse: ' '.join(words) for verse, words in verses.items()}


In [7]:
import sqlite3

def get_parsha(parsha_name, version_type='plain'):
    version_column = {
        'plain': 'word_without_both',
        'vowels': 'word_with_vowels',
        'cantillation': 'word_with_cantillation',
        'both': 'word_full'
    }.get(version_type, 'word_without_both')
    
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()

    # Query to get words for the given parsha
    query = f'''
    SELECT Books.name AS book_name,
           Chapters.number AS chapter,
           Verses.number AS verse,
           Words."order" AS word_order,
           Words.{version_column} AS word
    FROM Words
    JOIN Verses ON Words.verse_id = Verses.id
    JOIN Chapters ON Verses.chapter_id = Chapters.id
    JOIN Books ON Chapters.book_id = Books.id
    WHERE Words.parsha_name = ?
    ORDER BY Books.name, Chapters.number, Verses.number, Words."order"
    '''
    cursor.execute(query, (parsha_name,))
    data = cursor.fetchall()
    
    if not data:
        print(f"No data found for parsha: {parsha_name}")
    
    conn.close()
    
    # Process data
    verses = {}
    for book_name, chapter, verse, word_order, word in data:
        verse_key = (book_name, chapter, verse)
        if verse_key not in verses:
            verses[verse_key] = []
        verses[verse_key].append(word)
    
    return {verse: ' '.join(words) for verse, words in verses.items()}

# Example usage
print(get_parsha("Bereshit", version_type='plain'))
print(get_parsha("Noach", version_type='vowels'))
print(get_parsha("Lech-Lecha", version_type='cantillation'))


{('Genesis', 1, 1): 'בראשית ברא אלהים את השמים ואת הארץ', ('Genesis', 1, 2): 'והארץ היתה תהו ובהו וחשך על פני תהום ורוח אלהים מרחפת על פני המים', ('Genesis', 1, 3): 'ויאמר אלהים יהי אור ויהי אור', ('Genesis', 1, 4): 'וירא אלהים את האור כי טוב ויבדל אלהים בין האור ובין החשך', ('Genesis', 1, 5): 'ויקרא אלהים לאור יום ולחשך קרא לילה ויהי ערב ויהי בקר יום אחד', ('Genesis', 1, 6): 'ויאמר אלהים יהי רקיע בתוך המים ויהי מבדיל בין מים למים', ('Genesis', 1, 7): 'ויעש אלהים את הרקיע ויבדל בין המים אשר מתחת לרקיע ובין המים אשר מעל לרקיע ויהי כן', ('Genesis', 1, 8): 'ויקרא אלהים לרקיע שמים ויהי ערב ויהי בקר יום שני', ('Genesis', 1, 9): 'ויאמר אלהים יקוו המים מתחת השמים אל מקום אחד ותראה היבשה ויהי כן', ('Genesis', 1, 10): 'ויקרא אלהים ליבשה ארץ ולמקוה המים קרא ימים וירא אלהים כי טוב', ('Genesis', 1, 11): 'ויאמר אלהים תדשא הארץ דשא עשב מזריע זרע עץ פרי עשה פרי למינו אשר זרעו בו על הארץ ויהי כן', ('Genesis', 1, 12): 'ותוצא הארץ דשא עשב מזריע זרע למינהו ועץ עשה פרי אשר זרעו בו למינהו וירא אלהים כי טוב

In [10]:
import sqlite3

def check_parsha_names():
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()

    query = '''
    SELECT DISTINCT parsha_name
    FROM Words
    '''
    cursor.execute(query)
    parshas = cursor.fetchall()
    
    conn.close()
    
    # Convert list of tuples to a list of strings
    parsha_list = [parsha[0] for parsha in parshas]
    
    return parsha_list

print(check_parsha_names())


['Bereshit', 'Noach', 'Lech-Lecha', 'Vayeira', 'Chayei Sarah', 'Toldot', 'Vayetzei', 'Vayishlach', 'Vayeshev', 'Miketz', 'Vayigash', 'Vayechi', 'Shemot', "Va'eira", 'Bo', 'Beshalach', 'Yitro', 'Mishpatim', 'Terumah', 'Tetzaveh', 'Ki Tisa', 'Vayakhel', 'Pekudei', 'Vayikra', 'Tzav', 'Shemini', 'Tazria', 'Metzora', 'Acharei Mot', 'Kedoshim', 'Emor', 'Behar', 'Bechukotai', 'Bamidbar', 'Naso', "Beha'alotcha", 'Shelach', 'Korach', 'Chukat', 'Balak', 'Pinchas', 'Matot', 'Masei', 'Devarim', "Va'etchanan", 'Eikev', "Re'eh", 'Shoftim', 'Ki Teitzei', 'Ki Tavo', 'Nitzavim', 'Vayelech', "Ha'azinu", "V'Zot HaBerachah"]


In [15]:
# Pulling a single verse, plain text
print(get_verse("Genesis", 1, (1, 1), version_type='plain'))

# Pulling a range of verses, with vowels
print(get_verse("Genesis", 1, (1, 5), version_type='vowels'))

# Pulling a verse range with word range, with cantillation
print(get_verse("Genesis", 1, (1, 1), version_type='cantillation', word_range=(1, 3)))



# Pulling a single verse from the parsha 'Bereshit', plain text
print(get_parsha("Bereshit", version_type='plain'))

# Pulling a range of verses from the parsha 'Noach', with vowels
print(get_parsha("Noach", version_type='vowels'))

# Pulling a verse range from the parsha 'Lech-Lecha' with cantillation, and word range
print(get_parsha("Lech-Lecha", version_type='cantillation'))



{1: 'בראשית ברא אלהים את השמים ואת הארץ'}
{1: 'בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִם וְאֵת הָאָֽרֶץ׃', 2: 'וְהָאָרֶץ הָיְתָה תֹהוּ וָבֹהוּ וְחֹשֶׁךְ עַל פְּנֵי תְהוֹם וְרוּחַ אֱלֹהִים מְרַחֶפֶת עַל פְּנֵי הַמָּֽיִם׃', 3: 'וַיֹּאמֶר אֱלֹהִים יְהִי אוֹר וַֽיְהִי אֽוֹר׃', 4: 'וַיַּרְא אֱלֹהִים אֶת הָאוֹר כִּי טוֹב וַיַּבְדֵּל אֱלֹהִים בֵּין הָאוֹר וּבֵין הַחֹֽשֶׁךְ׃', 5: 'וַיִּקְרָא אֱלֹהִים ׀ לָאוֹר יוֹם וְלַחֹשֶׁךְ קָרָא לָיְלָה וַֽיְהִי עֶרֶב וַֽיְהִי בֹקֶר יוֹם'}
{1: 'בראש֖ית בר֣א אלה֑ים'}
{('Genesis', 1, 1): 'בראשית ברא אלהים את השמים ואת הארץ', ('Genesis', 1, 2): 'והארץ היתה תהו ובהו וחשך על פני תהום ורוח אלהים מרחפת על פני המים', ('Genesis', 1, 3): 'ויאמר אלהים יהי אור ויהי אור', ('Genesis', 1, 4): 'וירא אלהים את האור כי טוב ויבדל אלהים בין האור ובין החשך', ('Genesis', 1, 5): 'ויקרא אלהים לאור יום ולחשך קרא לילה ויהי ערב ויהי בקר יום אחד', ('Genesis', 1, 6): 'ויאמר אלהים יהי רקיע בתוך המים ויהי מבדיל בין מים למים', ('Genesis', 1, 7): 'ויעש אלהים את הרקיע ויבדל בין המים אשר מת

In [16]:
import sqlite3

def get_parsha_words(parsha_name, version_type='plain'):
    version_column = {
        'plain': 'word_without_both',
        'vowels': 'word_with_vowels',
        'cantillation': 'word_with_cantillation',
        'both': 'word_full'
    }.get(version_type, 'word_without_both')
    
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()

    # Query to get words for the given parsha
    query = f'''
    SELECT Books.name AS book_name,
           Chapters.number AS chapter,
           Verses.number AS verse,
           Words."order" AS word_order,
           Words.{version_column} AS word
    FROM Words
    JOIN Verses ON Words.verse_id = Verses.id
    JOIN Chapters ON Verses.chapter_id = Chapters.id
    JOIN Books ON Chapters.book_id = Books.id
    WHERE Words.parsha_name = ?
    ORDER BY Books.name, Chapters.number, Verses.number, Words."order"
    '''
    cursor.execute(query, (parsha_name,))
    data = cursor.fetchall()
    
    if not data:
        print(f"No data found for parsha: {parsha_name}")
        return []
    
    conn.close()
    
    # Process data
    words = [record[-1] for record in data]  # Extract the word from each record
    
    return words

# Example usage
bereshit_words = get_parsha_words("Bereshit", version_type='plain')
print(bereshit_words)


['בראשית', 'ברא', 'אלהים', 'את', 'השמים', 'ואת', 'הארץ', 'והארץ', 'היתה', 'תהו', 'ובהו', 'וחשך', 'על', 'פני', 'תהום', 'ורוח', 'אלהים', 'מרחפת', 'על', 'פני', 'המים', 'ויאמר', 'אלהים', 'יהי', 'אור', 'ויהי', 'אור', 'וירא', 'אלהים', 'את', 'האור', 'כי', 'טוב', 'ויבדל', 'אלהים', 'בין', 'האור', 'ובין', 'החשך', 'ויקרא', 'אלהים', 'לאור', 'יום', 'ולחשך', 'קרא', 'לילה', 'ויהי', 'ערב', 'ויהי', 'בקר', 'יום', 'אחד', 'ויאמר', 'אלהים', 'יהי', 'רקיע', 'בתוך', 'המים', 'ויהי', 'מבדיל', 'בין', 'מים', 'למים', 'ויעש', 'אלהים', 'את', 'הרקיע', 'ויבדל', 'בין', 'המים', 'אשר', 'מתחת', 'לרקיע', 'ובין', 'המים', 'אשר', 'מעל', 'לרקיע', 'ויהי', 'כן', 'ויקרא', 'אלהים', 'לרקיע', 'שמים', 'ויהי', 'ערב', 'ויהי', 'בקר', 'יום', 'שני', 'ויאמר', 'אלהים', 'יקוו', 'המים', 'מתחת', 'השמים', 'אל', 'מקום', 'אחד', 'ותראה', 'היבשה', 'ויהי', 'כן', 'ויקרא', 'אלהים', 'ליבשה', 'ארץ', 'ולמקוה', 'המים', 'קרא', 'ימים', 'וירא', 'אלהים', 'כי', 'טוב', 'ויאמר', 'אלהים', 'תדשא', 'הארץ', 'דשא', 'עשב', 'מזריע', 'זרע', 'עץ', 'פרי', 'עשה', 'פרי', 'ל

In [20]:
import sqlite3
from tf.app import use

# Load the BHSA dataset
TF = use('etcbc/bhsa', hoist=globals())

def get_parsha_structure_with_bhsa(parsha_name, version_type='plain'):
    version_column = {
        'plain': 'word_without_both',
        'vowels': 'word_with_vowels',
        'cantillation': 'word_with_cantillation',
        'both': 'word_full'
    }.get(version_type, 'word_without_both')
    
    conn = sqlite3.connect('tanakh.db')
    cursor = conn.cursor()

    # Query to get words for the given parsha
    query = f'''
    SELECT Books.name AS book_name,
           Chapters.number AS chapter,
           Verses.number AS verse,
           Words."order" AS word_order,
           Words.{version_column} AS word
    FROM Words
    JOIN Verses ON Words.verse_id = Verses.id
    JOIN Chapters ON Verses.chapter_id = Chapters.id
    JOIN Books ON Chapters.book_id = Books.id
    WHERE Words.parsha_name = ?
    ORDER BY Books.name, Chapters.number, Verses.number, Words."order"
    '''
    cursor.execute(query, (parsha_name,))
    data = cursor.fetchall()
    
    conn.close()
    
    if not data:
        print(f"No data found for parsha: {parsha_name}")
        return []

    # Process data and create unique IDs
    parsha_structure = []
    for record in data:
        book_name, chapter, verse, word_order, word = record
        unique_id = f"{book_name}.{chapter}.{verse}.{word_order}"

        # Map to BHSA nodes
        node_id = None
        for node in N():
            if F.etcbcbook.v(node) == book_name and \
               F.chapter.v(node) == str(chapter) and \
               F.verse.v(node) == str(verse) and \
               F.seq.v(node) == str(word_order):
                node_id = node
                break

        if node_id:
            morph = F.voc_lex_utf8.v(node_id)
            gloss = F.gloss_utf8.v(node_id)
        else:
            morph = None
            gloss = None

        parsha_structure.append({
            'unique_id': unique_id,
            'book_name': book_name,
            'chapter': chapter,
            'verse': verse,
            'word_order': word_order,
            'word': word,
            'morphological_info': morph,
            'gloss': gloss
        })
    
    return parsha_structure

# Example usage
bereshit_structure = get_parsha_structure_with_bhsa("Bereshit", version_type='plain')
for entry in bereshit_structure:
    print(entry)


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


TypeError: 'Nodes' object is not callable

In [27]:
from tf.app import use

# Load the BHSA dataset
TF = use('etcbc/bhsa', hoist=globals())

# Example: Get the first word node in the text
first_word_node = F.otype.s('word')[0]

# Print available features for the first word node
print(f"Node ID: {first_word_node}")
print("Available features and their values:")
available_features = TF.api.Fall()  # Getting all available features

for feature in available_features:
    feature_obj = available_features[feature]
    value = feature_obj.v(first_word_node)
    if value is not None:
        print(f"  {feature}: {value}")

# Print an example structure of the first few word nodes
print("\nExample structure for the first few word nodes:")
for node in F.otype.s('word')[:10]:
    print(f"Node ID: {node}")
    for feature in available_features:
        value = available_features[feature].v(node)
        if value is not None:
            print(f"  {feature}: {value}")
    print("---")


**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


Node ID: 1
Available features and their values:


TypeError: list indices must be integers or slices, not str