### This project demonstrates text file processing and analysis using Spark RDDs. The text file used in this example is the King James Version (KJV) Bible.

In [66]:
import re

In [67]:
# Source text, read line by line into an RDD; minPartitions=4 requests at
# least four partitions from Spark.
KJV_PATH = "/Users/binggangliu/Downloads/Bible/kjv/KJV_c.txt"
k = sc.textFile(KJV_PATH, minPartitions=4)
k.count()

32423

In [69]:
k.take(5)

['',
 'Genesis',
 '',
 'Gen.1:1 In the beginning God created the heaven and the earth.',
 'Gen.1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.']

#### Remove empty lines

In [70]:

# Keep only the lines that are not the empty string.
k1 = k.filter(lambda line: line != '')

In [71]:
k1.take(5)

['Genesis',
 'Gen.1:1 In the beginning God created the heaven and the earth.',
 'Gen.1:2 And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.',
 'Gen.1:3 And God said, Let there be light: and there was light.',
 'Gen.1:4 And God saw the light, that it was good: and God divided the light from the darkness.']

Total number of lines:

In [72]:
k1.count()

31168

#### Mapping with splitting

In [73]:
def split(lines):
    """Break one line of text into its whitespace-separated tokens.

    Returns the list produced by ``str.split()`` called with no arguments,
    so runs of whitespace count as one separator and an empty line gives
    an empty list.
    """
    return lines.split()

# map() applies split() to each line, producing one list of tokens per line.
k1_split = k1.map(split)

In [593]:
#k1_split.take(3)

In [75]:
k1_split.count()

31168

Since `map()` is a one-to-one transformation, the total number of lines is still the same.

In [76]:
# flatMap() applies split() to each line and flattens the resulting lists,
# so every token becomes its own record.
k1_split_flat = k1.flatMap(split)

In [594]:
#k1_split_flat.take(15)

`flatMap()` is a one-to-many transformation, so each word becomes an independent record.

In [381]:
k1_split_flat.count()

823800

#### Delete the extra words (the title of each section or each verse)

In [None]:
#pattern = re.compile(r'\.(\d|\d\d|\d\d\d):')
#kjv = k1_lower.filter(lambda x: pattern.search in x)

The above approach needs to be improved in order to work properly

In [545]:
# Compiled once and reused for every token: a '.', one to three digits, then
# a ':' -- the chapter/verse marker inside references such as 'Gen.1:1'.
_VERSE_REF = re.compile(r'\.\d{1,3}:')


def find_extra(word):
    """Return ``word`` if it contains a chapter/verse marker, else ``None``.

    Parameters
    ----------
    word : str
        A single whitespace-delimited token.

    Returns
    -------
    str or None
        The unchanged ``word`` when it contains ``.<1-3 digits>:``;
        ``None`` otherwise. Callers rely only on the truthiness of the result.
    """
    # search() stops at the first hit; only existence matters here.
    if _VERSE_REF.search(word):
        return word
    return None

In [546]:
# find_extra returns the token itself (truthy) for verse references and None
# otherwise, so it can serve directly as the filter predicate.
kjv_titles = k1_split_flat.filter(find_extra)

In [915]:
kjv_titles.take(5)

['Gen.1:1', 'Gen.1:2', 'Gen.1:3', 'Gen.1:4', 'Gen.1:5']

Total number of sections (verses):

In [548]:
kjv_titles.count()

31102

#### All the verse references (e.g. `Gen.1:1`) have been removed at this point; book titles such as `Genesis` are still present

In [386]:
# Keep every token that find_extra does not flag as a verse reference.
kjv = k1_split_flat.filter(lambda token: not find_extra(token))

In [954]:
kjv.take(5)

['Genesis', 'In', 'the', 'beginning', 'God']

In [388]:
kjv.count()

792698

In [335]:
#counts = kjv.map(lambda word: (word, 1)).reduceByKey(lambda a,b: a+b)

In [955]:
#counts.take(10)

In [389]:
# Sum the per-word 1s with reduceByKey, which combines counts inside each
# partition before the shuffle. groupByKey().mapValues(sum) would move every
# individual (word, 1) pair across partitions first and only then add them up.
kjv_counts = kjv.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Flip to (count, word) so sortByKey(False) orders by descending frequency.
word_frequency = kjv_counts.map(lambda pair: (pair[1], pair[0])).sortByKey(False)


In [957]:
word_frequency.take(5)

[(62058, 'the'), (38578, 'and'), (34391, 'of'), (13369, 'to'), (12736, 'And')]

In [591]:
# coalesce(1) reduces the RDD to a single partition before it is written out.
# NOTE(review): Spark documents saveAsTextFile as creating a directory at this
# path and refusing to overwrite an existing one -- confirm before re-running.
# NOTE(review): absolute, user-specific path; will not exist on another machine.
word_frequency.coalesce(1).saveAsTextFile("/Users/binggangliu/Downloads/Bible/kjv/wordFrequencyOutput.txt")

#### Remove the punctuations

Next, the punctuation needs to be removed: both the stand-alone punctuation tokens (already separated by the split) and the punctuation attached to words. Attached punctuation occurs at the end of some words, such as [ , | . | : | ; | ? | ! | ' | ) | 's ], and at the beginning of some words, such as [ ' | ( ].

In [958]:
# Working form of the trailing-punctuation helper. The earlier draft was
# disabled inside a string literal whose four-quote delimiters did not parse,
# returned from inside its loop, and returned None for unpunctuated words.

# One or more trailing marks: the possessive 's or any of . , ; : ? ! ) '
_TRAILING_PUNC = re.compile(r"(?:'s|[.,;:?!)'])+$")


def remove_punc(word):
    """Strip trailing punctuation (and a trailing possessive ``'s``) from ``word``.

    Parameters
    ----------
    word : str
        A single whitespace-delimited token.

    Returns
    -------
    str
        ``word`` without any run of ``. , ; : ? ! ) '`` or ``'s`` at its end.
        Tokens with no trailing punctuation are returned unchanged; a token
        made only of such marks becomes the empty string. Leading punctuation
        (``'`` or ``(``) is not touched.
    """
    return _TRAILING_PUNC.sub('', word)

In [979]:
#kjv_test = kjv.map(lambda x: remove_punc(x) if x.find(r'\w(\.|,|;|:|\?|\'|\'s|\)|!)') else x) 

The snippet above needs to be improved in order to work properly. In addition, another function would still need to be defined to remove punctuation at the start of words, namely ' and (.

In [981]:
#kjv_test.take(15)

In [735]:
#kjv_rp = kjv.map(lambda x: x.replace(r'\w(\.|,|;|:|\?|\'|\'s|\)|!)', '')) #replace() method does not work either

The approach below proved to be a more effective and easier way to do it, and it works!

In [886]:
# Drop the last character of any word ending in one of these marks.
# str.endswith accepts a tuple and is true if any member matches.
kjv_rp = kjv.map(
    lambda word: word[:-1]
    if word.endswith(('.', ',', ';', ':', '?', '!', ')', '\''))
    else word
)

In [887]:
# Probe: words that still end with an apostrophe after the first pass.
pcs_test1 = kjv_rp.filter(lambda word: word.endswith("'"))

In [982]:
#pcs_test1.take(3)

In [983]:
#pcs_test1.count()

In [890]:
# Second pass of the same rule: each pass removes at most one trailing
# character, so this catches words that still end in one of the marks.
kjv_rp2 = kjv_rp.map(
    lambda word: word[:-1]
    if word.endswith(('.', ',', ';', ':', '?', '!', ')', '\''))
    else word
)

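The same step is run a second time because each pass removes at most one trailing character, so words that end in two punctuation marks still have one left after the first pass. (Trailing whitespace cannot be the cause: `split()` already discards all whitespace.)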

In [891]:
# Probe: words that still end with a full stop after the second pass.
pcs_test2 = kjv_rp2.filter(lambda word: word.endswith('.'))

In [951]:
#pcs_test2.take(3)

In [940]:
# Probe: words that begin with an opening parenthesis.
pcs_find = kjv_rp2.filter(lambda word: word.startswith('('))

In [949]:
#pcs_find.count()

In [950]:
#pcs_find.take(1)

In [896]:
# Remove a trailing possessive "'s" (two characters); other words pass through.
kjv_rp3 = kjv_rp2.map(
    lambda word: word if not word.endswith("'s") else word[:-2]
)

In [897]:
# Probe: words that still end with "'s" after the possessive was stripped.
pcs_test3 = kjv_rp3.filter(lambda word: word.endswith("'s"))

In [952]:
#pcs_test3.count()

In [899]:
# Probe: words that begin with an apostrophe.
pcs_find = kjv_rp3.filter(lambda word: word.startswith("'"))

In [953]:
#pcs_find.count()

In [901]:
# Drop one leading apostrophe or opening parenthesis where present.
# str.startswith accepts a tuple and is true if any member matches.
kjv_rp4 = kjv_rp3.map(
    lambda word: word[1:] if word.startswith(("'", '(')) else word
)

In [902]:
# Probe: words that still end with an exclamation mark.
pcs_test4 = kjv_rp4.filter(lambda word: word.endswith('!'))

In [903]:
pcs_test4.count()

0

In [904]:
kjv_rp4.take(1)

['Genesis']

Remove empty strings generated during transformation esp during the punctuation removal process

In [907]:
# Discard tokens that the stripping steps reduced to the empty string.
kjv_f = kjv_rp4.filter(lambda word: word != '')

Total number of distinct words: 

In [920]:
kjv_f.distinct().count()

13540

Total number of words:

In [922]:
kjv_f.count()

789721

In [912]:
# Sum the per-word 1s with reduceByKey, which combines counts inside each
# partition before the shuffle. groupByKey().mapValues(sum) would move every
# individual (word, 1) pair across partitions first and only then add them up.
kjv_f_counts = kjv_f.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Flip to (count, word) so sortByKey(False) orders by descending frequency.
word_freq = kjv_f_counts.map(lambda pair: (pair[1], pair[0])).sortByKey(False)

In [924]:
word_freq.take(5)

[(62064, 'the'), (38850, 'and'), (34426, 'of'), (13382, 'to'), (12847, 'And')]

In [914]:
# coalesce(1) reduces the RDD to a single partition before it is written out.
# NOTE(review): Spark documents saveAsTextFile as creating a directory at this
# path and refusing to overwrite an existing one -- confirm before re-running.
# NOTE(review): absolute, user-specific path; will not exist on another machine.
word_freq.coalesce(1).saveAsTextFile("/Users/binggangliu/Downloads/Bible/kjv/wordFreq4Output.txt")

#### Convert all words to lower case except for 'God', 'Lord', and 'Jesus'.

In [925]:
# 'God', 'Lord' and 'Jesus' (exactly as spelled here) keep their
# capitalisation; every other word is lower-cased.
kjv_pl = kjv_f.map(
    lambda word: word if word in ('God', 'Lord', 'Jesus') else word.lower()
)

In [984]:
#kjv_pl.take(10)

In [942]:
# Sum the per-word 1s with reduceByKey, which combines counts inside each
# partition before the shuffle. groupByKey().mapValues(sum) would move every
# individual (word, 1) pair across partitions first and only then add them up.
kjv_pl_counts = kjv_pl.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Flip to (count, word) so sortByKey(False) orders by descending frequency.
word_freq = kjv_pl_counts.map(lambda pair: (pair[1], pair[0])).sortByKey(False)

In [943]:
# coalesce(1) reduces the RDD to a single partition before it is written out.
# NOTE(review): Spark documents saveAsTextFile as creating a directory at this
# path and refusing to overwrite an existing one -- confirm before re-running.
# NOTE(review): absolute, user-specific path; will not exist on another machine.
word_freq.coalesce(1).saveAsTextFile("/Users/binggangliu/Downloads/Bible/kjv/wordFreqPartialLower.txt")

In [985]:
# Per-word totals via reduceByKey, which combines counts inside each partition
# before the shuffle instead of moving every (word, 1) pair as groupByKey does.
kjv_pl_counts = kjv_pl.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# The pairs are already (word, count), so no re-mapping is needed before
# sorting alphabetically by word. Note that word_freq holds (word, count)
# here, unlike the (count, word) pairs bound to this name in other cells.
word_freq = kjv_pl_counts.sortByKey(True)

In [986]:
# coalesce(1) reduces the RDD to a single partition before it is written out.
# NOTE(review): Spark documents saveAsTextFile as creating a directory at this
# path and refusing to overwrite an existing one -- confirm before re-running.
# NOTE(review): absolute, user-specific path; will not exist on another machine.
word_freq.coalesce(1).saveAsTextFile("/Users/binggangliu/Downloads/Bible/kjv/wordFreqPartialLowerSortedByKey.txt")

#### All words in lower case

In [944]:
# Lower-case every word, with no exceptions.
kjv_l = kjv_f.map(lambda word: word.lower())

In [945]:
# Sum the per-word 1s with reduceByKey, which combines counts inside each
# partition before the shuffle. groupByKey().mapValues(sum) would move every
# individual (word, 1) pair across partitions first and only then add them up.
kjv_l_counts = kjv_l.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Flip to (count, word) so sortByKey(False) orders by descending frequency.
word_freq = kjv_l_counts.map(lambda pair: (pair[1], pair[0])).sortByKey(False)

The word occurrence counts are output and saved as a text file on the local drive.

In [946]:
# coalesce(1) reduces the RDD to a single partition before it is written out.
# NOTE(review): Spark documents saveAsTextFile as creating a directory at this
# path and refusing to overwrite an existing one -- confirm before re-running.
# NOTE(review): absolute, user-specific path; will not exist on another machine.
word_freq.coalesce(1).saveAsTextFile("/Users/binggangliu/Downloads/Bible/kjv/wordFreqAllLower.txt")