# Bible Study Python Code

In [1]:
from pyspark import SparkContext, SparkConf
import string
import re
import subprocess
import sys


In [2]:
# Configure the Spark application and obtain the SparkContext (`sc`) used by
# the cells below. getOrCreate() hands back a context that is already running,
# so this cell can be executed again without restarting the kernel;
# constructing a second SparkContext directly raises an error.
conf = SparkConf().setAppName('Bible Study App')
sc = SparkContext.getOrCreate(conf=conf)

## Counting words in a book

In [3]:
## Step-by-step example: count how often each word occurs in one book
text_file = '/user/thebible/rsv/47-Matthew-text.txt'
text = sc.textFile(text_file)

# Build the punctuation-stripping table once rather than once per line.
punctuation_table = str.maketrans('', '', string.punctuation)

# Strip punctuation from each line, then split it on single spaces.
flatmap = text.flatMap(lambda line: line.translate(punctuation_table).split(' '))

# Drop the empty tokens left by consecutive spaces and purely numeric tokens.
filtered = flatmap.filter(lambda word: word != '' and not word.isnumeric())

# Classic word count: pair every word with 1, then add the 1s per word.
filtered_map = filtered.map(lambda word: (word, 1))

reduced = filtered_map.reduceByKey(lambda a, b: a + b)

# Sort by word (the key) so the printed list is in a stable, readable order.
sorted_counts = reduced.sortByKey()

for count in sorted_counts.collect():
    print(count)

('A', 8)
('Abel', 1)
('Abijah', 2)
('Abiud', 2)
('Abraham', 7)
('According', 1)
('Achim', 2)
('After', 4)
('Afterward', 2)
('Again', 10)
('Ahaz', 2)
('All', 6)
('Alphaeus', 1)
('Am', 1)
('Amminadab', 2)
('Amos', 2)
('An', 4)
('And', 282)
('Andrew', 2)
('Another', 3)
('Archelaus', 1)
('Are', 8)
('Arimathea', 1)
('As', 17)
('Asa', 2)
('Ask', 1)
('At', 6)
('Azor', 2)
('Babylon', 4)
('Baptist', 7)
('BarJona', 1)
('Barabbas', 5)
('Barachiah', 1)
('Bartholomew', 1)
('Be', 2)
('Bear', 1)
('Because', 2)
('Beelzebul', 3)
('Before', 2)
('Begone', 1)
('Behold', 12)
('Besides', 1)
('Bethany', 2)
('Bethlehem', 5)
('Bethphage', 1)
('Bethsaida', 1)
('Beware', 4)
('Bind', 1)
('Blessed', 13)
('Blood', 1)
('Boaz', 2)
('Bread', 1)
('Bring', 2)
('Brother', 1)
('But', 99)
('By', 1)
('Caesar', 2)
('Caesarea', 1)
('Caesars', 2)
('Caiaphas', 2)
('Call', 1)
('Can', 2)
('Canaanite', 1)
('Cananaean', 1)
('Capernaum', 4)
('Certainly', 1)
('Chorazin', 1)
('Christ', 16)
('Christs', 1)
('Come', 6)
('Command', 1)
('C

## Count words in a text file

In [4]:
def get_unique_words(text_file):
    """Count how many times each distinct word occurs in a text file.

    Punctuation is removed from every line, the line is split on single
    spaces, and empty or purely numeric tokens are discarded. Words are
    compared case-sensitively.

    Args:
        text_file: Path of the text file, passed to ``sc.textFile``.

    Returns:
        An RDD of ``(word, count)`` pairs sorted by word.
    """
    # Build the translation table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def split_into_words(line):
        return line.translate(strip_punctuation).split(' ')

    def is_countable(word):
        return word != '' and not word.isnumeric()

    return (
        sc.textFile(text_file)
        .flatMap(split_into_words)
        .filter(is_countable)
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
        .sortByKey()
    )

def count_all_words(counted_words):
    """Return the total number of word occurrences in a word-count RDD.

    Args:
        counted_words: Pair RDD of ``(word, count)`` tuples, such as the
            result of ``get_unique_words``.

    Returns:
        The sum of all the counts; 0 for an empty RDD.
    """
    # Add the counts up on the executors so that a single number, rather
    # than every (word, count) pair, travels back to the driver.
    return counted_words.values().sum()

In [5]:
## Word counts for a single book
book_name = 'Matthew'
book_file_name = '/user/thebible/rsv/47-{}-text.txt'.format(book_name)
# Two Spark actions run on this RDD below; caching it means the file is read
# and counted once instead of once per action.
unique_words = get_unique_words(book_file_name).cache()

print('The book of {} has {} unique words'.format(book_name, unique_words.count()))
print('The book of {} has {} words'.format(book_name, count_all_words(unique_words)))

The book of Matthew has 2330 unique words
The book of Matthew has 22474 words


In [7]:

def get_file_names(dir_name):
    """List the base names of the entries in an HDFS directory.

    Runs ``hdfs dfs -ls <dir_name>`` and parses what it prints on stdout.

    Args:
        dir_name: HDFS directory to list.

    Returns:
        A list with the last path component of every listed entry, in the
        order the command printed them. The list is empty when the command
        prints no entry lines.
    """
    # Decode the raw bytes instead of parsing their repr(), so non-ASCII
    # names survive and real newlines separate the lines.
    listing = subprocess.run(
        ["hdfs", "dfs", "-ls", dir_name], stdout=subprocess.PIPE
    ).stdout.decode('utf-8', errors='replace')

    file_names = []
    for line in listing.splitlines():
        # The path is the 8th whitespace-separated field; limiting the split
        # to 7 cuts keeps a path that contains spaces in one piece.
        fields = line.split(None, 7)
        if len(fields) < 8:
            # Not an entry line (presumably the listing's summary line).
            continue
        file_names.append(fields[7].rsplit('/', 1)[-1])
    return file_names

class BookMetadata:
    """Per-book record of identifying details and word statistics.

    Every field is declared on the class with a default value; code that
    fills in a record assigns the real values on the instance.
    """

    # Text fields identifying the book and the files that belong to it.
    book_id = book_name = translation = ''
    file_name = unique_words_file_name = ''

    # Size and count statistics, all starting at zero.
    file_size = 0
    unique_words_count = total_words_count = 0
    num_chapters = num_verses = 0

    # Flags, all initially False. Presumably each marks the book holding the
    # corresponding extreme across a collection -- TODO confirm.
    most_unique_words = least_unique_words = False
    most_words = least_words = False
    
# Build one BookMetadata per file of the translation, keyed by book name.
translation = 'rsv'
translation_dir = '/user/thebible/rsv/'
file_name_ext = '-text.txt'
file_names = get_file_names(translation_dir)
file_names.sort()
book_metadatas_by_names = {}

for file_name in file_names:
    # File names look like '<2-character id>-<book name>-text.txt'.
    book_metadata = BookMetadata()
    book_metadata.translation = translation
    book_metadata.book_id = file_name[:2]
    book_metadata.book_name = file_name[3:-len(file_name_ext)]
    book_metadata.file_name = file_name

    book_file_name = translation_dir + file_name
    # Two actions run on this RDD; cache it so the book is processed once.
    unique_words = get_unique_words(book_file_name).cache()

    book_metadata.unique_words_count = unique_words.count()
    book_metadata.total_words_count = count_all_words(unique_words)
    unique_words.unpersist()

    # Report the stored numbers rather than launching the Spark jobs again.
    print('The book of {} has {} unique words'.format(book_metadata.book_name, book_metadata.unique_words_count))
    print('The book of {} has {} words'.format(book_metadata.book_name, book_metadata.total_words_count))
    book_metadatas_by_names[book_metadata.book_name] = book_metadata
    

The book of Genesis has 2812 unique words
The book of Genesis has 36559 words
The book of Exodus has 2247 unique words
The book of Exodus has 31068 words
The book of Leviticus has 1543 unique words
The book of Leviticus has 23440 words
The book of Numbers has 2315 unique words
The book of Numbers has 31405 words
The book of Deuteronomy has 2291 unique words
The book of Deuteronomy has 27746 words
The book of Joshua has 1790 unique words
The book of Joshua has 17784 words
The book of Judges has 1915 unique words
The book of Judges has 18276 words
The book of Ruth has 547 unique words
The book of Ruth has 2463 words
The book of 1-Samuel has 2121 unique words
The book of 1-Samuel has 24133 words
The book of 2-Samuel has 2093 unique words
The book of 2-Samuel has 19741 words
The book of 1-Kings has 2153 unique words
The book of 1-Kings has 23551 words
The book of 2-Kings has 2070 unique words
The book of 2-Kings has 22772 words
The book of 1-Chronicles has 2689 unique words
The book of 1-C

In [8]:

# Part 1: one CSV-style row per book, in the dictionary's iteration order.
print('ID, Name, File Name, Unique Words, All Words')

for book_name, book_metadata in book_metadatas_by_names.items():
    print('{}, {}, {}, {}, {}'.format(
        book_metadata.book_id,
        book_metadata.book_name,
        book_metadata.file_name,
        book_metadata.unique_words_count,
        book_metadata.total_words_count))

# Part 2: find the books holding each extreme.
# Each list pairs a count with its book name. A sentinel entry is placed in
# front of the list before searching: max()/min() return the first entry that
# reaches the extreme, so a book only replaces the sentinel (or an earlier
# book) when its count is strictly larger/smaller.
unique_counts = [(metadata.unique_words_count, metadata.book_name)
                 for metadata in book_metadatas_by_names.values()]
total_counts = [(metadata.total_words_count, metadata.book_name)
                for metadata in book_metadatas_by_names.values()]
count_of = lambda pair: pair[0]

most_unique_words, book_with_most_unique_words = \
    max([(-1, '')] + unique_counts, key=count_of)
least_unique_words, book_with_least_unique_words = \
    min([(sys.maxsize, '')] + unique_counts, key=count_of)
most_words, book_with_most_words = \
    max([(-1, '')] + total_counts, key=count_of)
least_words, book_with_least_words = \
    min([(sys.maxsize, '')] + total_counts, key=count_of)

print('\n,Statistic, Name, Count')
print(',most unique words, {}, {}'.format(book_with_most_unique_words, most_unique_words))
print(',least unique words, {}, {}'.format(book_with_least_unique_words, least_unique_words))
print(',most words, {}, {}'.format(book_with_most_words, most_words))
print(',least words, {}, {}'.format(book_with_least_words, least_words))


ID, Name, File Name, Unique Words, All Words
01, Genesis, 01-Genesis-text.txt, 2812, 36559
02, Exodus, 02-Exodus-text.txt, 2247, 31068
03, Leviticus, 03-Leviticus-text.txt, 1543, 23440
04, Numbers, 04-Numbers-text.txt, 2315, 31405
05, Deuteronomy, 05-Deuteronomy-text.txt, 2291, 27746
06, Joshua, 06-Joshua-text.txt, 1790, 17784
07, Judges, 07-Judges-text.txt, 1915, 18276
08, Ruth, 08-Ruth-text.txt, 547, 2463
09, 1-Samuel, 09-1-Samuel-text.txt, 2121, 24133
10, 2-Samuel, 10-2-Samuel-text.txt, 2093, 19741
11, 1-Kings, 11-1-Kings-text.txt, 2153, 23551
12, 2-Kings, 12-2-Kings-text.txt, 2070, 22772
13, 1-Chronicles, 13-1-Chronicles-text.txt, 2689, 18888
14, 2-Chronicles, 14-2-Chronicles-text.txt, 2336, 24970
15, Ezra, 15-Ezra-text.txt, 1217, 7101
16, Nehemiah, 16-Nehemiah-text.txt, 1615, 10069
17, Tobit, 17-Tobit-text.txt, 1197, 6954
18, Judith, 18-Judith-text.txt, 1687, 10613
19, Esther, 19-Esther-text.txt, 1383, 8214
20, 1-Maccabees, 20-1-Maccabees-text.txt, 2435, 21897
21, 2-Maccabees, 21-

## Counting Chapters

In [91]:
## Example: split one book into chapters and verses, step by step.
# The book text files are laid out like this:
#
# *** the book of Matthew
#
# *** Matthew 1 ***
#
# 1 The book of the genealogy of Jesus Christ, the son of David, the son of Abraham.

class Book:
    """A book together with the chapters parsed from its text.

    Attributes:
        book_name: Name of the book; empty string until set.
        book_id: Identifier of the book; empty string until set.
        metadata: Extra information about the book; None until set
            (presumably a BookMetadata -- TODO confirm).
        chapters: List of the book's chapters, owned by this instance.
    """

    def __init__(self):
        self.book_name = ''
        self.book_id = ''
        self.metadata = None
        # Created per instance: a list declared on the class would be shared
        # by every Book, so appending to one book would change all of them.
        self.chapters = []
    
class Chapter:
    """One chapter of a book.

    Attributes:
        chapter_no: Number of the chapter; 0 until set.
        verses: List of the chapter's verse lines, owned by this instance.
        total_words: Word count of the chapter; 0 until set.
    """

    def __init__(self):
        self.chapter_no = 0
        # Created per instance: a list declared on the class would be shared
        # by every Chapter that has not been given a list of its own.
        self.verses = []
        self.total_words = 0
    
text_file = '/user/thebible/rsv/47-Matthew-text.txt'
text = sc.textFile(text_file)
book = Book()
# Give this book a fresh list of its own so that running the cell again starts
# from scratch. Index 0 holds an empty placeholder chapter, which puts
# chapter N at book.chapters[N].
book.chapters = [Chapter()]
chapter = None
chapter_no = 0

for line in text.collect():
    if line.startswith('***') and line.endswith('***'):
        # Chapter heading such as '*** Matthew 1 ***': open the next chapter.
        chapter_no += 1
        chapter = Chapter()
        chapter.chapter_no = chapter_no
        chapter.verses = []
        book.chapters.append(chapter)
    elif chapter is not None and line[:1].isnumeric():
        # A line that starts with a number is a verse of the current chapter;
        # anything before the first chapter heading is ignored.
        chapter.verses.append(line)


In [92]:
# Number of chapters found: index 0 of book.chapters is an empty placeholder
# added before parsing, hence the -1.
len(book.chapters)-1

28

In [93]:
# Dump the parsed book: for every chapter, a "<chapter number> <verse count>"
# line followed by the chapter's verses, one per line.
for chapter in book.chapters:
    verses = chapter.verses
    print(chapter.chapter_no, len(verses))
    for verse in verses:
        print(verse)

0 0
1 25
1 The book of the genealogy of Jesus Christ, the son of David, the son of Abraham.
2 Abraham was the father of Isaac, and Isaac the father of Jacob, and Jacob the father of Judah and his brothers,
3 and Judah the father of Perez and Zerah by Tamar, and Perez the father of Hezron, and Hezron the father of Ram,
4 and Ram the father of Ammin'adab, and Ammin'adab the father of Nahshon, and Nahshon the father of Salmon,
5 and Salmon the father of Bo'az by Rahab, and Bo'az the father of Obed by Ruth, and Obed the father of Jesse,
6 and Jesse the father of David the king. And David was the father of Solomon by the wife of Uri'ah,
7 and Solomon the father of Rehobo'am, and Rehobo'am the father of Abi'jah, and Abi'jah the father of Asa,
8 and Asa the father of Jehosh'aphat, and Jehosh'aphat the father of Joram, and Joram the father of Uzzi'ah,
9 and Uzzi'ah the father of Jotham, and Jotham the father of Ahaz, and Ahaz the father of Hezeki'ah,
10 and Hezeki'ah the father of Manas'seh, a