In [None]:
#This is a list of links to books of the Torah. I ended up only using the Torah proper, though.

torah_links = ['https://www.jewishvirtuallibrary.org/bereishit-genesis-full-text', 
        'https://www.jewishvirtuallibrary.org/shemot-exodus-full-text',
        'https://www.jewishvirtuallibrary.org/vayikra-leviticus-full-text',
        'https://www.jewishvirtuallibrary.org/bamidbar-numbers-full-text',
        'https://www.jewishvirtuallibrary.org/d-varim-deuteronomy-full-text']
prophets_I_links =['https://www.jewishvirtuallibrary.org/yehoshua-joshua-full-text',
                'https://www.jewishvirtuallibrary.org/shoftim-judges-full-text',
                'https://www.jewishvirtuallibrary.org/shmuel-i-samuel-1-full-text',
                'https://www.jewishvirtuallibrary.org/shmuel-ii-samuel-2-full-text',
                'https://www.jewishvirtuallibrary.org/malachim-i-kings-1-full-text',
                'https://www.jewishvirtuallibrary.org/malachim-ii-kings-2-full-text',
                'https://www.jewishvirtuallibrary.org/yeshayahu-isaiah-full-text',
                'https://www.jewishvirtuallibrary.org/yirmeyahu-jeremiah-full-text',
                'https://www.jewishvirtuallibrary.org/yichezkel-ezekiel-full-text']
prophets_II_links=['https://www.jewishvirtuallibrary.org/book-of-hosea',
                  'https://www.jewishvirtuallibrary.org/book-of-joel',
                  'https://www.jewishvirtuallibrary.org/book-of-amos',
                  'https://www.jewishvirtuallibrary.org/book-of-obadiah',
                  'https://www.jewishvirtuallibrary.org/book-of-jonah',
                  'https://www.jewishvirtuallibrary.org/book-of-micah',
                  'https://www.jewishvirtuallibrary.org/book-of-nahum',
                  'https://www.jewishvirtuallibrary.org/book-of-habakkuk',
                  'https://www.jewishvirtuallibrary.org/book-of-zephaniah',
                  'https://www.jewishvirtuallibrary.org/book-of-haggai',
                  'https://www.jewishvirtuallibrary.org/book-of-zechariah',
                  'https://www.jewishvirtuallibrary.org/book-of-malachi']
writings_links = ['https://www.jewishvirtuallibrary.org/tehillim-psalms-full-text',
                 'https://www.jewishvirtuallibrary.org/mishlei-proverbs-full-text',
                 'https://www.jewishvirtuallibrary.org/iyov-job-full-text',
                 'https://www.jewishvirtuallibrary.org/shir-hashirim-song-of-songs-full-text',
                 'https://www.jewishvirtuallibrary.org/ruth-full-text',
                 'https://www.jewishvirtuallibrary.org/eichah-lamentations-full-text',
                 'https://www.jewishvirtuallibrary.org/kohelet-ecclesiastes-full-text',
                 'https://www.jewishvirtuallibrary.org/esther-full-text',
                 'https://www.jewishvirtuallibrary.org/daniel-full-text',
                 'https://www.jewishvirtuallibrary.org/ezra-full-text',
                 'https://www.jewishvirtuallibrary.org/nehemiah-full-text',
                 'https://www.jewishvirtuallibrary.org/divrei-hayamim-i-chronicles-1-full-text',
                 'https://www.jewishvirtuallibrary.org/divrei-hayamim-ii-chronicles-2-full-text']

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from nltk.tokenize import word_tokenize

In [3]:
def loadPage(url):
    '''Loads the text of a page.'''
    response = requests.get(url, headers={'User-agent': "Mozilla/5.0"})
    return response.text

In [4]:
#For each portion of the Tanakh,grab its text and convert it to BeautifulSoup. Then, split it into words. 

torah_pages = [loadPage(a) for a in torah_links]
p1_pages = [loadPage(a) for a in prophets_I_links]
p2_pages= [loadPage(a) for a in prophets_II_links]
writing_pages= [loadPage(a) for a in writings_links]

torah_pages_soup = [BeautifulSoup(a, 'lxml') for a in torah_pages]
p1_pages_soup = [BeautifulSoup(a, 'lxml') for a in p1_pages]
p2_pages_soup = [BeautifulSoup(a, 'lxml') for a in p2_pages]
writing_pages_soup = [BeautifulSoup(a, 'lxml') for a in writing_pages]

torah_books = [[p.text.split(" ") for p in a.find_all('p')] for a in torah_pages_soup]
p1_books = [[p.text.split(" ") for p in a.find_all('p')] for a in p1_pages_soup]
p2_books = [[p.text.split(" ") for p in a.find_all('p')] for a in p2_pages_soup]
writing_books = [[p.text.split(" ") for p in a.find_all('p')] for a in writing_pages_soup]

In [6]:
#Grab the chapters of each book. 
books = torah_books#+p1_books

indices = []
book_index = 0
for torah_book in books:
    book_index+=1
    for i in range(3):
        torah_book.pop(-1)
    new_book = []
    for verse in torah_book:
        while '' in verse:
            verse.remove('')
        verse[0] = str(book_index)+":"+verse[0]

In [7]:
from nltk.tokenize import RegexpTokenizer

verse_tokenizer = RegexpTokenizer("[0-9]+:[0-9]+:[0-9]+[^:]+[^0-9:]")

#Realized we ran into an issue- several verses were smushed into one on the page, going 
#(ex.) 1:18 words words words 1:19 words words words instead of having a new line for each verse. 
#In order to solve this, I joined then all into a single text blob, then split on the verses.

books_assembled = [" ".join(verse) for book in books for verse in book]
books_fused = " ".join(books_assembled)
verses = verse_tokenizer.tokenize(books_fused)
verses_minus_verses = [" ".join(verse.split(" ")[1:]) for verse in verses]
verse_indices = np.array([verse.split(" ")[0].split("\xa0")[0].split(":") for verse in verses])
verse_indices = np.array([[int(x) for x in line] for line in verse_indices])

In [8]:
from collections import defaultdict
def group_by_chapter(verse_indices, verses):
    #Groupby operation, essentially. 
    by_chap = defaultdict(str)
    for verse, indices in list(zip(verses, verse_indices)):
        by_chap[str(indices[0:2])] += verse + " "
    return by_chap
    
grouped_chapters = list(zip(*list(group_by_chapter(verse_indices, verses_minus_verses).items())))
chapters = list(grouped_chapters[1])
chapter_indices = list(grouped_chapters[0])

In [9]:
pd.DataFrame(chapters).to_csv('Torah_Chapters.csv')
pd.DataFrame(verses_minus_verses).to_csv('Torah_Verses.csv')

In [10]:
pd.DataFrame(chapter_indices).to_csv('Chapter Indices.csv')

In [37]:
#Text labeling. Labeling is defined based on https://en.wikipedia.org/wiki/Composition_of_the_Torah#Nature_and_extent_of_the_sources

book = verse_indices[:, 0]
chapter = verse_indices[:, 1]
verse = verse_indices[:, 2]

In [1]:
labels = np.array([""]*len(verse_indices))
labels[(book==1)&(chapter==1)] = "p"
labels[(book==1)&(chapter==2)&(verse<4)] = "p"
labels[(book==1)&(chapter==2)&(verse>=4)] = "y"
labels[(book==1)&(chapter>=3)&(chapter<=35)] = "y"
labels[(book==1)&(chapter>=36)]="p" #(unclear if this is accurate; there's ambiguity.)

labels[(book==2)] = "y"
labels[(book==2)&(chapter>=25)&(chapter<=31)] = "p"
labels[(book==2)&(chapter>=35)&(chapter<=40)] = "p"

labels[(book==3)]="p"

labels[(book==4)&(chapter>=1)&(chapter<=9)] = "p"
labels[(book==4)&(chapter==10)]='y'
labels[(book==4)&(chapter==10)&(verse<=28)] = "p"
labels[(book==4)&(chapter>=11)&(chapter<=14)]='y'
labels[(book==4)&(chapter>=15)&(chapter<=20)]='p'
labels[(book==4)&(chapter>=21)&(chapter<=24)]='y'
labels[(book==4)&(chapter>=25)&(chapter<=31)]='p'
labels[(book==4)&(chapter>=32)&(chapter<=32)]='y'
labels[(book==4)&(chapter>=33)&(chapter<=36)]='p'
labels[(book==5)]='d'

NameError: name 'np' is not defined

In [62]:
pd.DataFrame(labels).to_csv('Verse_Labels.csv')

In [58]:
from collections import Counter
Counter(labels)

Counter({'p': 2089, 'y': 1675, 'd': 661})