# Bible Notebook
This is a Jupyter Notebook project for performing analysis on the Bible.
There are several translations of the Bible. The translations that I am using
for this project are RSV, NAB, DR and LV.

I am using the Apache Spark and Pandas frameworks. I am also using NLTK for the content analysis.


In [None]:
from pyspark import SparkContext, SparkConf
import string
import re
import subprocess
import sys

In [None]:
# Spark configuration for this notebook.
conf = SparkConf().setAppName('Bible Study App')
# getOrCreate() returns the already-running context when this cell is executed
# again; constructing SparkContext directly a second time raises an error.
# NOTE: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Book information: <HDFS root>/<translation>/<book file> identifies the text
# that the cells below analyse.
hdfs_dir, translation, book = '/user/thebible', 'dr', '01-Genesis-text.txt'
book_filename = f'{hdfs_dir}/{translation}/{book}'

## Counting words in a book

In [None]:
## Examples running it separately by lines ...
# Each intermediate RDD is kept in its own variable so it can be inspected.
raw_text = sc.textFile(book_filename)

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

# line -> tokens: punctuation removed, then split on single spaces
flatmap = raw_text.flatMap(
    lambda line: line.translate(punctuation_table).split(' ')
)
# discard empty tokens and purely numeric tokens
filtered = flatmap.filter(
    lambda word: not (word == '' or word.isnumeric())
)
# token -> (token, 1), then add up the ones per distinct token
filtered_map = filtered.map(lambda word: (word, 1))
reduced = filtered_map.reduceByKey(lambda left, right: left + right)

# order the (word, count) pairs by word
sorted_counts = reduced.sortByKey()

for count in sorted_counts.collect():
    print(count)

In [None]:
# How many chapters does this book have?
def get_chapter_headers(text, prefix="*** chapter"):
    """Return only the lines of ``text`` that start with ``prefix``.

    Args:
        text: A collection of lines exposing ``filter(predicate)``, such as
            the RDD returned by ``sc.textFile``.
        prefix: Leading text that marks a chapter header line.

    Returns:
        Whatever ``text.filter`` returns, restricted to the header lines.

    NOTE(review): the sample header shown later in this notebook is
    ``*** Matthew 1 ***``, so the default prefix may not match the actual
    files -- confirm against the data and pass ``prefix`` accordingly.
    """
    # str.startswith is the correct method name ("startWith" does not exist),
    # and filter keeps the matching lines instead of mapping them to booleans.
    return text.filter(lambda line: line.startswith(prefix))

In [None]:
# Show every line of the book that begins with the '***' marker.
marker_lines = raw_text.filter(lambda line: line.startswith('***'))
marker_lines.collect()

## Count words in a text file

In [None]:
def get_unique_words(text_file):
    """Count how often each word occurs in ``text_file``.

    The file is read through the notebook-level SparkContext ``sc``.
    Punctuation is removed from every line, the line is split on single
    spaces, and empty or purely numeric tokens are dropped.

    Returns:
        An RDD of ``(word, count)`` pairs sorted by word.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    def split_into_words(line):
        # Remove punctuation first so that "word," and "word" count together.
        return line.translate(punctuation_table).split(' ')

    def is_countable(word):
        return not (word == '' or word.isnumeric())

    lines = sc.textFile(text_file)
    words = lines.flatMap(split_into_words).filter(is_countable)
    pairs = words.map(lambda word: (word, 1))
    counts = pairs.reduceByKey(lambda left, right: left + right)
    return counts.sortByKey()

def count_all_words(counted_words):
    """Return the sum of the counts in ``counted_words``.

    ``counted_words`` must expose ``collect()`` returning entries whose
    second element (index 1) is the count, e.g. ``(word, count)`` pairs.
    An empty collection yields 0.
    """
    return sum(entry[1] for entry in counted_words.collect())

In [None]:
## Number of words by books
# Word statistics for one book of the RSV translation.
book_name = '01-Genesis'
book_file_name = f'/user/thebible/rsv/{book_name}-text.txt'
unique_words = get_unique_words(book_file_name)

# Number of distinct words = number of (word, count) pairs.
n_unique = unique_words.count()
print('The book of {} has {} unique words'.format(book_name, n_unique))

# Number of words overall = sum of the per-word counts.
n_total = count_all_words(unique_words)
print('The book of {} has {} words'.format(book_name, n_total))

In [None]:

def get_file_names(dir_name):
    """Return the base names of the entries listed by ``hdfs dfs -ls dir_name``.

    Args:
        dir_name: HDFS directory to list.

    Returns:
        A list with the last path component of every listed entry, in the
        order the command printed them. Lines with fewer than eight
        whitespace-separated fields (such as a summary line) are skipped.

    Raises:
        FileNotFoundError: if the ``hdfs`` executable cannot be found.
    """
    # Decode the command output instead of parsing the repr() of a bytes
    # object, which escapes non-ASCII characters and newlines.
    completed = subprocess.run(
        ["hdfs", "dfs", "-ls", dir_name],
        stdout=subprocess.PIPE,
    )
    listing = completed.stdout.decode("utf-8", errors="replace")

    file_names = []
    for line in listing.splitlines():
        # The path is taken from the eighth field, as before; limiting the
        # split to seven cuts keeps a path containing spaces in one piece.
        fields = line.split(None, 7)
        if len(fields) < 8:
            continue
        path = fields[7].rstrip()
        file_names.append(path.split('/')[-1])
    return file_names

class BookMetadata:
    """Plain record of the statistics gathered for one book.

    Every field is declared at class level with a neutral default; code
    that fills in a record assigns the attributes on the instance.
    """

    # Identification of the book and its source file.
    book_name: str = ''
    book_id: str = ''
    translation: str = ''
    file_name: str = ''
    file_size: int = 0

    # Measurements.
    unique_words_count: int = 0
    total_words_count: int = 0
    num_chapters: int = 0
    num_verses: int = 0
    unique_words_file_name: str = ''

    # Flags; all start out False.
    most_unique_words: bool = False
    least_unique_words: bool = False
    most_words: bool = False
    least_words: bool = False
    
# Collect word statistics for every file found in one translation directory.
translation = 'rsv'
translation_dir = f'/user/thebible/{translation}/'
file_name_ext = '-text.txt'
file_names = get_file_names(translation_dir)
file_names.sort()
book_metadatas_by_names = {}

for file_name in file_names:
    book_metadata = BookMetadata()
    book_metadata.translation = translation
    # File names look like '01-Genesis-text.txt': a two-character id, a dash,
    # the book name, then the '-text.txt' extension.
    book_metadata.book_name = file_name[3:-len(file_name_ext)]
    book_metadata.book_id = file_name[:2]
    book_metadata.file_name = file_name

    book_file_name = translation_dir + file_name
    unique_words = get_unique_words(book_file_name)

    # Each of these calls runs Spark jobs over the file, so compute them once
    # and reuse the stored values for the progress output below.
    book_metadata.unique_words_count = unique_words.count()
    book_metadata.total_words_count = count_all_words(unique_words)

    print('The book of {} has {} unique words'.format(book_metadata.book_name, book_metadata.unique_words_count))
    print('The book of {} has {} words'.format(book_metadata.book_name, book_metadata.total_words_count))
    book_metadatas_by_names[book_metadata.book_name] = book_metadata
    

In [None]:

print('ID, Name, File Name, Unique Words, All Words')

# Running extremes. The start values guarantee that the first book examined
# replaces them (-1 is below any count, sys.maxsize above any count).
book_with_most_unique_words = book_with_least_unique_words = ''
book_with_most_words = book_with_least_words = ''
most_unique_words = most_words = -1
least_unique_words = least_words = sys.maxsize

for book_name, book_metadata in book_metadatas_by_names.items():
    unique_total = book_metadata.unique_words_count
    word_total = book_metadata.total_words_count

    # Strict comparisons: on a tie the book seen first keeps the record.
    if unique_total > most_unique_words:
        most_unique_words, book_with_most_unique_words = unique_total, book_metadata.book_name

    if unique_total < least_unique_words:
        least_unique_words, book_with_least_unique_words = unique_total, book_metadata.book_name

    if word_total > most_words:
        most_words, book_with_most_words = word_total, book_metadata.book_name

    if word_total < least_words:
        least_words, book_with_least_words = word_total, book_metadata.book_name

    # One CSV-style row per book.
    print('{}, {}, {}, {}, {}'.format(
        book_metadata.book_id,
        book_metadata.book_name,
        book_metadata.file_name,
        unique_total,
        word_total,
    ))

# Summary table of the four extremes.
print('\n,Statistic, Name, Count')
summary_rows = (
    (',most unique words, {}, {}', book_with_most_unique_words, most_unique_words),
    (',least unique words, {}, {}', book_with_least_unique_words, least_unique_words),
    (',most words, {}, {}', book_with_most_words, most_words),
    (',least words, {}, {}', book_with_least_words, least_words),
)
for template, name, value in summary_rows:
    print(template.format(name, value))


## Counting Chapters

In [None]:
## Examples running it separately by lines ...
# *** the book of Matthew
#
# *** Matthew 1 ***
#
# 1 The book of the genealogy of Jesus Christ, the son of David, the son of Abraham.

class Book:
    """A book together with the chapters parsed from its text.

    Attributes:
        book_name: Name of the book (defaults to '').
        book_id: Identifier of the book (defaults to '').
        metadata: Optional metadata object for the book (defaults to None).
        chapters: List of chapter objects belonging to this book.
    """

    def __init__(self, book_name='', book_id='', metadata=None, chapters=None):
        self.book_name = book_name
        self.book_id = book_id
        self.metadata = metadata
        # A fresh list per instance: a class-level ``chapters = []`` is one
        # list shared by every Book, so appending to one book changed all.
        self.chapters = [] if chapters is None else chapters
    
class Chapter:
    """One chapter of a book and the verse lines collected for it.

    Attributes:
        chapter_no: Number of the chapter (defaults to 0).
        verses: List of the verse lines belonging to this chapter.
        total_words: Word total for the chapter (defaults to 0).
    """

    def __init__(self, chapter_no=0, verses=None, total_words=0):
        self.chapter_no = chapter_no
        # A fresh list per instance: a class-level ``verses = []`` is one
        # list shared by every Chapter that does not reassign it.
        self.verses = [] if verses is None else verses
        self.total_words = total_words
    
# text_file = '/user/thebible/rsv/01-Genesis-text.txt'
text_file = '/user/thebible/dr/01-Genesis-text.txt'
text = sc.textFile(text_file)
book = Book()
# Index 0 holds a placeholder so that book.chapters[n] is chapter number n.
# A new list is assigned rather than appending to the list the attribute
# already refers to, so re-running this cell starts from a clean chapter list.
book.chapters = [Chapter()]
chapter = None
chapter_no = 0

for line in text.collect():
    if line.startswith('***') and line.endswith('***'):
        # Chapter header such as "*** Matthew 1 ***": open a new chapter.
        chapter_no += 1
        chapter = Chapter()
        chapter.chapter_no = chapter_no
        chapter.verses = []
        book.chapters.append(chapter)
    elif chapter is not None and line[:1].isnumeric():
        # A line starting with a digit is taken as a verse of the current
        # chapter; lines before the first chapter header are ignored. A line
        # whose first character is a digit cannot be blank, so no separate
        # blank-line check is needed.
        chapter.verses.append(line)


In [None]:
# Number of chapters; one is subtracted for the placeholder Chapter that was
# added to book.chapters before parsing started.
chapter_total = len(book.chapters) - 1
chapter_total

In [None]:
# Print "<chapter number> <verse count>" followed by the chapter's verses,
# while totalling the verses over all chapters.
verse_count = 0
for chapter in book.chapters:
    chapter_verses = chapter.verses
    print(chapter.chapter_no, len(chapter_verses))
    for verse in chapter_verses:
        print(verse)
    verse_count += len(chapter_verses)

In [None]:
# Display the verse total accumulated by the loop over book.chapters.
verse_count