# Jupyter Notebook on the Bible
This is a Jupyter Notebook project for performing analysis on the bible.
There are several translations of the bible. The translations that I am using
for this project are RSV, NAB, DR and LV.

I am using the Apache Spark and Pandas frameworks. In addition, I am using the Natural Language Toolkit (NLTK) software package located here: `https://nltk.org`
        

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import nltk, re, pprint
import nltk.data
import subprocess
import json
import pandas as pandas
import pydoop.hdfs as hdfs
from nltk import word_tokenize
from nltk.probability import FreqDist

## Setting up Spark runtime
Getting two important handles.

- Get SparkSession
- Get SparkContext

In [None]:
# getOrCreate() hands back an already-running SparkSession when there is one;
# otherwise a new session is built with the application name given here.
spark = SparkSession.builder.appName('Reading the Bible').getOrCreate()
# The SparkContext of that session; read_file() below uses it to read text files.
sc    = spark.sparkContext

## Common functions for extracting the content of the bible
Note that the bible is stored by translation. There are four (4) translations
that this project is using:

- Douay-Rheims (DR)
- Revised Standard Version (RSV)
- Latin Vulgate (LV)
- New American Bible (NAB)

These are stored in separate directories.

In [None]:
# Constants related to the Bible

# Where the texts live: the HDFS base directory of the book files and a
# local scratch directory for the generated CSV/JSON files.
hdfs_dir = '/user/thebible/'
tmp_dir = '/tmp/'

# Every book file name ends with this suffix.
filename_suffix = '-text.txt'

# Short code of each translation; the code is also the name of the
# sub-directory that holds the books of that translation.
dr_translation = 'dr'
rsv_translation = 'rsv'
lv_translation = 'lv'
nab_translation = 'nab'
translations = [
    dr_translation,
    rsv_translation,
    lv_translation,
    nab_translation,
]

# Column headers of the verse table, in row order.
column_translation_name = 'Translation'
column_book_name = 'Book Name'
column_chapter_name = 'Chapter'
column_verse_name = 'Verse'
column_text_name = 'Text'
verse_column_names = [
    column_translation_name,
    column_book_name,
    column_chapter_name,
    column_verse_name,
    column_text_name,
]

In [None]:
# Fully-qualified file name of one book of one translation (dr, rsv, nab, lv)
def get_fq_filename(base_dir, translation, book_filename):
    """
    Return '<base_dir><translation>/<book_filename>'.

    No separator is inserted between base_dir and translation, so base_dir
    is expected to already end with '/'.
    """
    path_template = '{}{}/{}'
    return path_template.format(base_dir, translation, book_filename)


def get_csv_filename(dir_name, translation):
    """
    Return '<dir_name><translation>.csv'.

    dir_name is used as given; it is expected to already end with '/'.
    """
    name_template = '{}{}.csv'
    return name_template.format(dir_name, translation)

def get_json_filename(dir_name, translation):
    """
    Return '<dir_name><translation>.json'.

    dir_name is used as given; it is expected to already end with '/'.
    """
    name_template = '{}{}.json'
    return name_template.format(dir_name, translation)

In [None]:
def read_file(filename):
    """
    Read ``filename`` through the SparkContext ``sc`` and return an RDD of strings.

    From https://spark.apache.org/docs/2.1.0/api/python/pyspark.html#pyspark.RDD:

    A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
    Represents an immutable, partitioned collection of elements
    that can be operated on in parallel.
    """
    text_rdd = sc.textFile(filename)
    return text_rdd
              
def chapters_verses(rdd):
    """
    Return ``rdd`` without its empty lines and without the lines that
    start with '*** the book' (the title line of a book file).
    """
    def is_content_line(line):
        if not line:
            return False
        return not line.startswith('*** the book')

    return rdd.filter(is_content_line)

def get_filenames_for_books(base_dir, translation):
    """
    Get a list of file names for the books based on a given translation.

    Runs ``hdfs dfs -ls`` on '<base_dir>/<translation>/*<filename_suffix>'
    and returns the last path component of every listed entry, in the
    order in which the command printed them.

    base_dir    -- directory that holds one sub-directory per translation;
                   a trailing '/' is accepted and not doubled.
    translation -- name of the translation sub-directory (e.g. 'rsv').

    Returns a list of strings. The list is empty when the command printed
    no entry lines; the exit status of the command is not inspected.
    """
    pattern = f"{base_dir.rstrip('/')}/{translation}/*{filename_suffix}"
    with subprocess.Popen(["hdfs", "dfs", "-ls", pattern],
                          stdout=subprocess.PIPE) as proc:
        # Decode the bytes instead of parsing their repr(), so that line
        # breaks are real line breaks and non-ASCII names are not escaped.
        listing = proc.stdout.read().decode('utf-8', errors='replace')

    file_names = []
    for line in listing.splitlines():
        # The path is taken from the 8th whitespace-separated field, as
        # before; maxsplit=7 keeps a path that contains blanks in one piece.
        # Shorter lines (such as a summary line) carry no path and are skipped.
        fields = line.split(None, 7)
        if len(fields) < 8:
            continue
        file_names.append(fields[7].rsplit('/', 1)[-1])
    return file_names


In [None]:
# Row layout: translation | book_name | chapter_no | verse_no | text
def load_book(verses, translation, base_dir, book_filename):
    """
    Read one book of one translation and append its verses to ``verses``.

    The book file is read with read_file() and reduced with
    chapters_verses(). Every remaining line that starts with '***' opens a
    new chapter; every line whose first character is a digit is a verse.
    Chapters and verses are numbered by counting them (from 1), not by
    parsing the numbers in the text. The verse text is whatever follows
    the first blank of the line.

    verses        -- list that receives one
                     [translation, book name, chapter no, verse no, text]
                     entry per verse; it is modified in place.
    translation   -- translation code, also the sub-directory name.
    base_dir      -- base directory, passed on to get_fq_filename().
    book_filename -- file name of the book; the book name stored in each
                     row is this name without ``filename_suffix``.

    Returns None.
    """
    book_path = get_fq_filename(base_dir, translation, book_filename)
    content = chapters_verses(read_file(book_path))
    name_of_book = book_filename.replace(filename_suffix, '')

    chapter, verse = 0, 0
    for text_line in content.collect():
        if text_line.startswith('***'):
            # Chapter heading: start counting verses again.
            chapter += 1
            verse = 0
            continue
        if not text_line[0].isdigit():
            continue
        verse += 1
        # Keep what follows the first blank (the whole line if there is none).
        verse_text = text_line.split(' ', 1)[-1]
        verses.append([translation, name_of_book, chapter, verse, verse_text])


In [None]:
# load the entire bible by translation
def load_the_bible(base_dir, translation):
    """
    Load every book of one translation into a single pandas data frame.

    The book files are discovered with get_filenames_for_books() and read
    one after another with load_book(); each file name is printed as it is
    processed.

    base_dir    -- base directory that holds the translation directories.
    translation -- translation code (e.g. 'rsv').

    Returns a pandas DataFrame with the columns ``verse_column_names`` and
    one row per verse. When no book file is found the frame is empty.
    """
    verses = []
    for filename in get_filenames_for_books(base_dir, translation):
        print(filename)
        load_book(verses, translation, base_dir, filename)

    # The rows are plain Python lists, so build the frame directly: a detour
    # through a Spark DataFrame adds nothing and fails on an empty list.
    return pandas.DataFrame(verses, columns=verse_column_names)

In [None]:
# print out the info from the wc program
def print_wc(filename):
    """
    Run the ``wc`` program on a local file and print what it reports.

    Four lines are printed: the file name, followed by the first, second
    and third number of the ``wc`` output, labelled lines, words and
    characters.

    filename -- path of the file handed to ``wc``.

    Raises subprocess.CalledProcessError when ``wc`` exits with an error
    (for example a missing file) and ValueError when its output does not
    consist of three counts and a name.
    """
    completed = subprocess.run(["wc", filename], stdout=subprocess.PIPE, check=True)
    # Decode the raw bytes instead of parsing their repr(); maxsplit=3 keeps a
    # file name that contains blanks in one piece.
    report = completed.stdout.decode().strip()
    line_count, word_count, char_count, name = report.split(None, 3)
    print(f'file: {name}')
    print(f'  {line_count} lines')
    print(f'  {word_count} words')
    print(f'  {char_count} characters')

In [None]:
# Load the bible from a csv file into a pandas dataframe

def load_csv_into_pandas(dir_name, translation):
    """
    Read '<dir_name>/<translation>.csv' into a pandas DataFrame.

    The file is opened with ``hdfs.open`` (the pydoop hdfs module imported
    at the top of the notebook) and parsed by ``pandas.read_csv``.

    dir_name    -- directory that contains the CSV file.
    translation -- translation code; also the base name of the CSV file.

    Returns the parsed DataFrame.
    """
    csv_path = f'{dir_name}/{translation}.csv'
    with hdfs.open(csv_path) as csv_stream:
        return pandas.read_csv(csv_stream)

In [None]:
# Converting a pandas dataframe into a dictionary of book names.
# Each book is a dictionary of chapter numbers.
# Each chapter is a dictionary of verse numbers.
# Each verse number contains the text of the verse.

def pandas_as_dictionary(pd_bible):
    """
    Turn the verse table into nested dictionaries.

    pd_bible -- DataFrame with (at least) the columns named by
                column_book_name, column_chapter_name, column_verse_name
                and column_text_name; one row per verse.

    Returns {book name: {chapter: {verse: text}}}. When the same
    book/chapter/verse occurs more than once, the last row wins.
    """
    nested = {}
    for _, row in pd_bible.iterrows():
        chapters = nested.setdefault(row[column_book_name], {})
        verses_of_chapter = chapters.setdefault(row[column_chapter_name], {})
        verses_of_chapter[row[column_verse_name]] = row[column_text_name]

    return nested

In [None]:
# Converting csv file into a json file.
def convert_csv_to_json(hdfs_dir, local_dir, translation):
    """
    Convert the CSV file of one translation into a JSON file.

    The CSV is loaded with load_csv_into_pandas(), reshaped with
    pandas_as_dictionary(), serialised with a 4-space indent and written
    to the path given by get_json_filename(local_dir, translation).

    hdfs_dir    -- directory the CSV file is read from.
    local_dir   -- directory the JSON file is written to.
    translation -- translation code; base name of both files.

    Returns None.
    """
    nested = pandas_as_dictionary(load_csv_into_pandas(hdfs_dir, translation))
    json_text = json.dumps(nested, indent=4)
    json_path = get_json_filename(local_dir, translation)
    with open(json_path, 'w') as out_file:
        out_file.write(json_text)

## Running scenarios ...
- Try out a couple of translations individually to make sure the code
is working correctly.


In [None]:
book_name       = '01-Genesis'
book_filename   = f'{book_name}{filename_suffix}'

# Fully-qualified filenames. Built with get_fq_filename() because hdfs_dir
# already ends with '/'; adding another '/' by hand would double it.
dr_book_fq_filename  = get_fq_filename(hdfs_dir, dr_translation, book_filename)
rsv_book_fq_filename = get_fq_filename(hdfs_dir, rsv_translation, book_filename)
print(f'fully-qualified book file name: {dr_book_fq_filename}')
print(f'fully-qualified book file name: {rsv_book_fq_filename}')

## Loading books.
The bible has many translations. The translations I am using for the analytics
are based on the content I extracted from the Internet.

The bible is organized into books and stored in text files. The name of each
file is in this format: `{book-name}-text.txt`. The `{book-name}` starts with
a number from `01` to `73` and follows this format: `{book-number}-name`. For example,
the book of `Genesis` would be `01-Genesis`. The book of `Revelation` is named
as `73-Revelation`. The file name for the book of `Genesis` would be 
`01-Genesis-text.txt` and for the book of `Revelation` `73-Revelation-text.txt`.

The content of each book follows this format:

line 1: `*** the book of {book-name}`. The book-name is the actual name of the book. 
For example, `Genesis` or `Revelation`. There is no leading number as indicated
above in the file name.

line 2: empty

line 3: `*** {book-name} {chapter-no}`. This is true for RSV, NAB and LV. For the 
case of DR, `Chapter ` is in place of the `{book-name}`.

line 4: empty

line 5: first verse of the first chapter of the book. The verse is stored in a 
single line of text that begins with the verse number, starting from 1.

... 

Each subsequent chapter follows the same format as that of the first chapter 
indicated above, starting at line 3 through line 5.

### Example:


```
*** the book of Genesis

*** Genesis 1 ***

1 In the beginning God created the heavens and the earth.
2 The earth was without form and void, and darkness was upon the face of the deep; and the Spirit of God was moving over the face of the waters.
```
....

```
31 And God saw everything that he had made, and behold, it was very good. And there was evening and there was morning, a sixth day.

*** Genesis 2 ***

1 Thus the heavens and the earth were finished, and all the host of them.
2 And on the seventh day God finished his work which he had done, and he rested on the seventh day from all his work which he had done.
```


In [None]:
# Empty the array that contains all the verses.

verses = []

In [None]:
# Loading first book for 'DR' translation

book_name       = '01-Genesis'
book_filename   = f'{book_name}{filename_suffix}'
load_book(verses, dr_translation, hdfs_dir, book_filename)


In [None]:
# Making a Pandas data frame from the array of verses.
pd_df = spark.createDataFrame(verses).toPandas()
pd_df.columns = verse_column_names

# Display pandas dataframe on screen ...
pd_df

In [None]:
# Loading the second book ...

book_name       = '02-Exodus'
book_filename   = f'{book_name}{filename_suffix}'
load_book(verses, dr_translation, hdfs_dir, book_filename)


In [None]:
# Making a Pandas data frame from the array of verses.
pd_df = spark.createDataFrame(verses).toPandas()
pd_df.columns = verse_column_names

# Display pandas dataframe on screen ...
pd_df

## Storing Pandas data frame into CSV file:
We are converting a pandas dataframe, which is a set of rows, into a
CSV file. The CSV file has a header defined in the array of strings
`verse_column_names`. Initially it would be this:

```python
column_translation_name = 'Translation'
column_book_name        = 'Book Name'
column_chapter_name     = 'Chapter'
column_verse_name       = 'Verse'
column_text_name        = 'Text'
verse_column_names      = [column_translation_name, column_book_name, \
                           column_chapter_name, column_verse_name, \
                           column_text_name]
```

In [None]:
# Try to store the content in a temporary folder in csv format.
dr_csv_filename = get_csv_filename(tmp_dir, dr_translation)
pd_df.to_csv(dr_csv_filename, index=False)
print_wc(dr_csv_filename)

In [None]:
# Load the entire DR bible
pd_dr = load_the_bible(hdfs_dir, dr_translation)

In [None]:
# Try to store the content in a temporary folder in csv format.
dr_csv_filename = get_csv_filename(tmp_dir, dr_translation)
pd_dr.to_csv(dr_csv_filename, index=False)
print_wc(dr_csv_filename)

In [None]:
pd_dr

In [None]:
# Load the RSV bible
pd_rsv = load_the_bible(hdfs_dir, rsv_translation)

In [None]:
# Try to store the content in a temporary folder in csv format.
rsv_csv_filename = get_csv_filename(tmp_dir, rsv_translation)
pd_rsv.to_csv(rsv_csv_filename, index=False)
print_wc(rsv_csv_filename)

In [None]:
pd_rsv


In [None]:
# Load the four translations of the bible
for translation in translations:
    print(f'loading {translation} ...')
    pd = load_the_bible(hdfs_dir, translation)
    pd.to_csv(get_csv_filename(tmp_dir, translation), index=False)
    

In [None]:
# Collect the `wc` report (three counts and the file name) of every
# translation's CSV file and show the reports as one table.
info_array = []
for translation in translations:
    filename = get_csv_filename(tmp_dir, translation)
    completed = subprocess.run(["wc", filename], stdout=subprocess.PIPE)
    # Decode the raw bytes instead of parsing their repr(); maxsplit=3 keeps
    # a file name that contains blanks in one piece.
    file_info = completed.stdout.decode().strip().split(None, 3)
    info_array.append(file_info)

pd_info = spark.createDataFrame(info_array).toPandas()
pd_info.columns = ['Lines', 'Words', 'Characters', 'File Name']
pd_info

## Load the bible of a given translation from a file.
For each translation, we stored the entire bible in a CSV file
where each row is a verse in a book. The row consists of 
the name of the translation (rsv, lv, dr, nab), 
the name of the book (73 books in total), the chapter
number, the verse number and the actual text of the verse.

In [None]:
pd_rsv = load_csv_into_pandas(hdfs_dir, rsv_translation)
pd_rsv

In [None]:
rsv_books = pd_rsv.groupby([column_book_name])

In [None]:
rsv_chapters = pd_rsv.groupby([column_book_name, column_chapter_name])

In [None]:
rsv_books.count()

In [None]:
rsv_chapters.count()

## Organizing bible by books.
A first idea: `the_bible` is a dictionary keyed by book name, and each book
is an array of chapters. Since arrays are indexed from zero (0)
and the chapters are numbered from 1, we could try
to start at 1, but that leaves an extra slot at zero (0)
and complicates counting, because we have to remember
to subtract 1. This is too messy.

Let's use dictionaries all the way.
`the_bible` is a dictionary keyed by book name. Each book is a
dictionary of chapters (starting at 1). Each chapter is
a dictionary of verses (starting at 1).

In [None]:
the_bible = pandas_as_dictionary(pd_rsv)

In [None]:
the_bible

In [None]:
the_bible['01-Genesis'][1][1], the_bible['73-Revelation'][22][21]

In [None]:
len(the_bible), len(the_bible['01-Genesis']), len(the_bible['73-Revelation'][22])

In [None]:
the_bible_in_json = json.dumps(the_bible, indent=4)

In [None]:
with open('rsv.json', 'w') as writer:
    writer.write(the_bible_in_json)

In [None]:
## Converting the content of the bible from csv format to json format
for translation in translations:
    print(f'converting from csv to json for the translation: {translation} ...')
    convert_csv_to_json(hdfs_dir, tmp_dir, translation)

## Natural Language Toolkit (NLTK) starts here ...
In this section, we download the `punkt` and `stopwords` NLTK resources.

In [None]:
# download nltk modules.
nltk.download('punkt')
nltk.download('stopwords')
detector = nltk.data.load('tokenizers/punkt/english.pickle')

### Exercise 1:
In this section, we assume that the_bible object is a dictionary of books of dictionary 
of chapters of dictionary of verses. 
For each book, we would like to know:

- number of chapters
- number of verses
- number of sentences
- number of words
- number of words without stopwords (i.e., 'the', 'an', 'a', etc.)
- number of unique words without stopwords



In [None]:
def get_book_text(the_bible, the_book_name):
    """
    Return the verse texts of one book as a flat list.

    the_bible     -- nested dictionary book -> chapter -> verse -> text.
    the_book_name -- key of the wanted book; a missing key raises KeyError.

    The texts come out chapter by chapter, verse by verse, in the
    iteration order of the dictionaries.
    """
    chapters = the_bible[the_book_name]
    return [
        verse_text
        for verses_in_chapter in chapters.values()
        for verse_text in verses_in_chapter.values()
    ]

the_book_name = '01-Genesis'
the_book_text = get_book_text(the_bible, the_book_name)
the_book_text

In [None]:
# Merge all verses into a long string of text for the current book.
the_book_tokens = word_tokenize(' '.join(the_book_text))
len(the_book_tokens), the_book_tokens[43545]

In [None]:
sentences = detector.tokenize(' '.join(the_book_text).strip())
print(f'{the_book_name} has {len(sentences)} sentences')
sentences

In [None]:
# Load the RSV bible
pd_rsv = load_the_bible(hdfs_dir, rsv_translation)

In [None]:
## Make the bible into a long string
# Every verse is prefixed with one blank (so the string starts with a blank,
# as before). A single join avoids growing the string verse by verse.
the_text = ''.join(' ' + verse for verse in pd_rsv[column_text_name])

the_text

In [None]:
# Tokenize the string
the_tokens = word_tokenize(the_text)
the_tokens

In [None]:
# Converting tokens into NLTK Text object
nltk_text = nltk.Text(the_tokens)
nltk_text

In [None]:
# Try a few things with nltk_text.

nltk_text.collocations()

In [None]:
nltk_text.concordance('Adam')

In [None]:
nltk_text.concordance('Jesus')

In [None]:
vip = ['God', 'Jesus', 'Adam', 'Eve', 'Abraham', 'Moses', 'Saul', 'David', 'Joseph', 'Mary', 'Peter', 'Paul', 'John']
mention = []
for person in vip:
    mention.append([person, nltk_text.count(person)])
    
pd_mention = spark.createDataFrame(mention).toPandas()
pd_mention.columns = ['Person', 'Mentioned']
pd_mention
# nltk_text.count('Jesus'), nltk_text.count('Adam'), nltk_text.count('Saul'), \
# nltk_text.count('David'), nltk_text.count('Joseph'), nltk_text.count('Mary'), \
# nltk_text.count('John')

### Scratch notebook starts here ...

In [None]:
# load_book() appends its rows to a caller-supplied list and returns nothing,
# so collect the rows first and build the data frame from them.
rsv_rows = []
load_book(rsv_rows, rsv_translation, hdfs_dir, f'{book_name}{filename_suffix}')
rsv_df = pandas.DataFrame(rsv_rows, columns=verse_column_names)
rsv_df.groupby([column_chapter_name])[column_verse_name].count()

In [None]:
# load_book() appends its rows to a caller-supplied list and returns nothing,
# so collect the rows first and build the data frame from them.
nab_rows = []
load_book(nab_rows, nab_translation, hdfs_dir, f'{book_name}{filename_suffix}')
nab_df = pandas.DataFrame(nab_rows, columns=verse_column_names)
nab_df.groupby([column_chapter_name])[column_verse_name].count()

In [None]:
# load_book() appends its rows to a caller-supplied list and returns nothing,
# so collect the rows first and build the data frame from them.
lv_rows = []
load_book(lv_rows, lv_translation, hdfs_dir, f'{book_name}{filename_suffix}')
lv_df = pandas.DataFrame(lv_rows, columns=verse_column_names)
lv_df.groupby([column_chapter_name])[column_verse_name].count()

In [None]:
dr_df.count(), rsv_df.count(), nab_df.count(), lv_df.count()

In [None]:
def verses_only(raw):
    """
    Filter out metadata, like the first two non-empty lines of this text:

    *** the book of Genesis

    *** Genesis 1 ***

    1 In the beginning God created the heavens and the earth.

    Only lines whose first character is a digit are kept; empty lines and
    the '***' lines are dropped. The leading verse number stays on each
    kept line (words_only() removes it).
    """
    def starts_with_digit(line):
        return bool(line) and line[0].isdigit()

    return raw.filter(starts_with_digit)

def words_only(verses):
    """
    Remove the verse number on each line.

    Each line is cut after its first blank; a line without any blank is
    returned unchanged. The result of the cut is passed through str().
    """
    def drop_verse_number(line):
        text_start = line.find(' ') + 1
        return str(line[text_start:])

    return verses.map(drop_verse_number)

In [None]:
# Reading contents from files. The fully-qualified names were built in the
# "Running scenarios" cell as dr_book_fq_filename / rsv_book_fq_filename.
dr_rdd  = read_file(dr_book_fq_filename)
rsv_rdd  = read_file(rsv_book_fq_filename)


In [None]:
# Row layout built below: book_name | chapter_no | verse_no | text
# The book title line and empty lines are removed by chapters_verses().

dr_cv_rdd = chapters_verses(dr_rdd)

verses = []
chapter_no = 0
verse_no = 0
for line in dr_cv_rdd.collect():
    if line.startswith('***'):
        # A remaining '***' line opens the next chapter; restart verse counting.
        chapter_no += 1
        verse_no = 0
    else:
        if line[0].isdigit():
            # Verses are counted, not parsed; the text is what follows the
            # first blank of the line.
            verse_no += 1
            verses.append([book_name, chapter_no, verse_no, line[line.find(' ')+1:]])
    

In [None]:

# The rows in `verses` have four fields (book name, chapter, verse, text);
# name the columns of the Spark data frame that is being built here.
dr_df = spark.createDataFrame(
    verses,
    [column_book_name, column_chapter_name, column_verse_name, column_text_name],
)

In [None]:
dr_df.toPandas()

In [None]:
# Extract verses only (including the verse # at the beginning of each line). Each line is a verse.
dr_verses = verses_only(dr_rdd)
rsv_verses = verses_only(rsv_rdd)

In [None]:
dr_verses.collect()

In [None]:
# Extract words only (excluding the verse # at the beginning of each line)
dr_words_only = words_only(dr_verses)
rsv_words_only = words_only(rsv_verses)

In [None]:
print (f'There are {len(dr_words_only.collect())}/{len(rsv_words_only.collect())} verses')

In [None]:
len(dr_words_only.collect()), len(rsv_words_only.collect())

In [None]:
# Making the entire book into a string of words ... Then tokenize it using NLTK.
dr_words = '\n'.join(word for word in list(dr_words_only.collect()))
rsv_words = '\n'.join(word for word in list(rsv_words_only.collect()))

dr_tokens = word_tokenize(dr_words)
rsv_tokens = word_tokenize(rsv_words)

In [None]:
# Making into NLTK objects for processing.
dr_text = nltk.Text(dr_tokens)
rsv_text = nltk.Text(rsv_tokens)

In [None]:
print(dr_words)

In [None]:
len(dr_text), len(rsv_text)

In [None]:
# str.find returns the index of the first occurrence of the substring,
# or -1 when the substring is absent; it does not count occurrences.
dr_words.find('Eve'), rsv_words.find('Eve'), dr_words.find('heaven'), rsv_words.find('heaven')

In [None]:
dr_text.count('Adam')

In [None]:
dr_text.count('Eve'), dr_text.count('heaven')

In [None]:
dr_text.similar('living')

In [None]:
dr_text.common_contexts('cattle', 'of')

In [None]:
dr_text.generate()

In [None]:
sorted(set(dr_text))

In [None]:
# set is a list of vocabulary items (unique words). 
# How many unique/distinct words in this book?
len(set(dr_text))

In [None]:
# collocation is a sequence of words that occur together unusually often.
dr_text.collocations()

In [None]:
# concordance permits us to see words in context
dr_text.concordance('Eve')

In [None]:
# bigrams is a list of word pairs.
len(list(nltk.bigrams(dr_tokens)))

In [None]:
fdist = FreqDist(dr_text)
fdist

In [None]:
fdist.keys()

In [None]:
fdist

In [None]:
fdist.freq

In [None]:
dr_text.count('created')

In [None]:
len(dr_text) / len(set(dr_text))

In [None]:
100 * dr_text.count('Joseph')/len(dr_text)

In [None]:
dr_text.index('Adam')

In [None]:
V = set(dr_text)
long_words = [w for w in V if len(w) > 10]
sorted(long_words)