In [1]:
# Full-text pages on jewishvirtuallibrary.org, one URL per book.

# Genesis through Deuteronomy.
torah_links = [
    'https://www.jewishvirtuallibrary.org/bereishit-genesis-full-text',
    'https://www.jewishvirtuallibrary.org/shemot-exodus-full-text',
    'https://www.jewishvirtuallibrary.org/vayikra-leviticus-full-text',
    'https://www.jewishvirtuallibrary.org/bamidbar-numbers-full-text',
    'https://www.jewishvirtuallibrary.org/d-varim-deuteronomy-full-text',
]

# Joshua through Ezekiel (not referenced by the cells visible in this notebook).
prophets_I_links = [
    'https://www.jewishvirtuallibrary.org/yehoshua-joshua-full-text',
    'https://www.jewishvirtuallibrary.org/shoftim-judges-full-text',
    'https://www.jewishvirtuallibrary.org/shmuel-i-samuel-1-full-text',
    'https://www.jewishvirtuallibrary.org/shmuel-ii-samuel-2-full-text',
    'https://www.jewishvirtuallibrary.org/malachim-i-kings-1-full-text',
    'https://www.jewishvirtuallibrary.org/malachim-ii-kings-2-full-text',
    'https://www.jewishvirtuallibrary.org/yeshayahu-isaiah-full-text',
    'https://www.jewishvirtuallibrary.org/yirmeyahu-jeremiah-full-text',
    'https://www.jewishvirtuallibrary.org/yichezkel-ezekiel-full-text',
]

# Hosea through Malachi (not referenced by the cells visible in this notebook).
prophets_II_links = [
    'https://www.jewishvirtuallibrary.org/book-of-hosea',
    'https://www.jewishvirtuallibrary.org/book-of-joel',
    'https://www.jewishvirtuallibrary.org/book-of-amos',
    'https://www.jewishvirtuallibrary.org/book-of-obadiah',
    'https://www.jewishvirtuallibrary.org/book-of-jonah',
    'https://www.jewishvirtuallibrary.org/book-of-micah',
    'https://www.jewishvirtuallibrary.org/book-of-nahum',
    'https://www.jewishvirtuallibrary.org/book-of-habakkuk',
    'https://www.jewishvirtuallibrary.org/book-of-zephaniah',
    'https://www.jewishvirtuallibrary.org/book-of-haggai',
    'https://www.jewishvirtuallibrary.org/book-of-zechariah',
    'https://www.jewishvirtuallibrary.org/book-of-malachi',
]


In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

from nltk.tokenize import word_tokenize


In [3]:
def loadPage(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout ``requests`` may wait indefinitely. Defaults to 30.

    Returns
    -------
    str
        The decoded body of the response (``response.text``).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server replies with a 4xx or 5xx status code.
    requests.exceptions.RequestException
        For connection problems or when the timeout expires.
    """
    # The request carries a browser-like User-agent header.
    response = requests.get(
        url,
        headers={'User-agent': "Mozilla/5.0"},
        timeout=timeout,
    )
    # Fail loudly instead of handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text

In [4]:
# Fetch the raw HTML of every Torah page, in the order the links are listed.
torah_pages = list(map(loadPage, torah_links))

In [5]:
# Parse each downloaded page with the lxml parser.
torah_pages_soup = [
    BeautifulSoup(page_html, 'lxml')
    for page_html in torah_pages
]

In [6]:
# Number of trailing <p> elements discarded from every page.
# NOTE(review): presumably these are page boilerplate rather than verses — confirm.
TRAILING_PARAGRAPHS = 3

indices = []  # not used in this cell; left defined in case another cell reads it
book_index = 0

# books[b][v] is the token list of paragraph v of book b; the first token of
# every paragraph is prefixed with the 1-based book number ("<book>:<token>").
books = []
for book_index, page_soup in enumerate(torah_pages_soup, start=1):
    # One token list per <p> element, split on single spaces.
    paragraphs = [p.text.split(" ") for p in page_soup.find_all('p')]

    # Drop the trailing paragraphs; slicing also copes with a page that has
    # fewer paragraphs than TRAILING_PARAGRAPHS (repeated pop() would raise).
    keep = max(len(paragraphs) - TRAILING_PARAGRAPHS, 0)

    torah_book = []
    for paragraph in paragraphs[:keep]:
        # Runs of spaces produce empty tokens; remove them in a single pass.
        verse = [token for token in paragraph if token != '']
        if not verse:
            # An empty <p> has no first token to label, so it is skipped.
            continue
        verse[0] = str(book_index) + ":" + verse[0]
        torah_book.append(verse)
    books.append(torah_book)

In [7]:
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()  # not used in this cell

# A token starts at a "<book>:<chapter>:<verse>" index and extends up to the
# next ':' in the text, with any trailing digits given back so that the next
# index is not swallowed.
# NOTE(review): because the pattern stops at ':', a verse whose own text
# contains a colon is cut short at that colon — confirm the source has none.
verse_tokenizer = RegexpTokenizer("[0-9]+:[0-9]+:[0-9]+[^:]+[^0-9:]")

# The same leading index, with one group per component.
verse_index_pattern = re.compile("([0-9]+):([0-9]+):([0-9]+)")

# Flatten all books into one string so the tokenizer can cut it at the indices.
books_assembled = [" ".join(verse) for book in books for verse in book]
books_fused = " ".join(books_assembled)
verses = verse_tokenizer.tokenize(books_fused)

# Split each token into its numeric index and its text using the regex match,
# so both outputs agree on where the index ends. Splitting on " " alone drops
# the first word whenever a non-breaking space joins the index to the text.
verses_minus_verses = []
index_rows = []
for verse in verses:
    # Every token begins with an index by construction of verse_tokenizer.
    index_match = verse_index_pattern.match(verse)
    index_rows.append([int(part) for part in index_match.groups()])
    # Remove the separator (regular or non-breaking spaces) after the index.
    verses_minus_verses.append(verse[index_match.end():].lstrip(" \xa0"))

# One row of [book, chapter, verse] integers per entry of `verses`.
verse_indices = np.array(index_rows)

In [8]:
from collections import defaultdict
def group_by_chapter(verse_indices, verses):
    """Concatenate verse texts that share the first two index components.

    Parameters
    ----------
    verse_indices : sequence
        One sliceable index per verse; only ``index[0:2]`` is used.
    verses : sequence of str
        Verse texts, paired positionally with ``verse_indices``. Pairing
        stops at the shorter of the two inputs.

    Returns
    -------
    collections.defaultdict
        Maps ``str(index[0:2])`` to the texts of all verses with that key,
        each followed by a single space, in order of first appearance.
    """
    chapter_text = defaultdict(str)
    for text, index in zip(verses, verse_indices):
        # The key is the string form of the two leading index components.
        chapter_key = str(index[0:2])
        chapter_text[chapter_key] += text + " "
    return chapter_text
    
# Transpose the (key, text) pairs into two parallel tuples:
# grouped_chapters[0] holds the chapter keys, grouped_chapters[1] the texts.
chapter_text_by_key = group_by_chapter(verse_indices, verses_minus_verses)
grouped_chapters = list(zip(*chapter_text_by_key.items()))
chapter_indices = list(grouped_chapters[0])
chapters = list(grouped_chapters[1])

In [9]:
# Save the chapter texts, one row per chapter.
vectorized = pd.DataFrame(data=chapters)
vectorized.to_csv(path_or_buf='Torah_Chapters.csv')

In [10]:
# Save the verse texts (index prefix already removed), one row per verse.
verses_frame = pd.DataFrame(data=verses_minus_verses)
verses_frame.to_csv(path_or_buf='Torah_Verses.csv')