In [1]:
print sc

<pyspark.context.SparkContext object at 0x7ff7319cd150>


### Load Shakespeare dataset

In [2]:
shakespeareRawRDD = sc.textFile('hdfs:///user/gmedasani/data/exercises-data/shakespeare.txt',8)

In [3]:
shakespeareRawRDD.take(10)

[u'1609',
 u'',
 u'THE SONNETS',
 u'',
 u'by William Shakespeare',
 u'',
 u'',
 u'',
 u'                     1',
 u'  From fairest creatures we desire increase,']

### Remove Punctuation

In [4]:
import re
def removePunctuation(text):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        text (str): A string.

    Returns:
        str: The cleaned up string.
    """
    return ('').join(re.findall(r'[^\'?:._!,\(\)\[\];\"-\--\/\}\{]+',text.lower().strip()))
print removePunctuation('Hi, you!')
print removePunctuation(' No under_score!')

hi you
no underscore


In [5]:
from test_helper import Test
Test.assertEquals(removePunctuation(" The Elephant's 4 cats. "),
                  'the elephants 4 cats',
                  'incorrect definition for removePunctuation function')

1 test passed.


In [6]:
shakespeareRDD = (shakespeareRawRDD
                  .map(removePunctuation))
print '\n'.join(shakespeareRDD
                .zipWithIndex()  # to (line, lineNum)
                .map(lambda (l, num): '{0}: {1}'.format(num, l))  # to 'lineNum: line'
                .take(15))

0: 1609
1: 
2: the sonnets
3: 
4: by william shakespeare
5: 
6: 
7: 
8: 1
9: from fairest creatures we desire increase
10: that thereby beautys rose might never die
11: but as the riper should by time decease
12: his tender heir might bear his memory
13: but thou contracted to thine own bright eyes
14: feedst thy lights flame with selfsubstantial fuel


### Words from lines

In [7]:
shakespeareWordsRDD = shakespeareRDD.flatMap(lambda line: line.split(' '))
shakespeareWordCount = shakespeareWordsRDD.count()
print shakespeareWordsRDD.top(5)
print shakespeareWordCount

[u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds']
928908


In [8]:
Test.assertTrue(shakespeareWordCount == 927631 or shakespeareWordCount == 928908,
                'incorrect value for shakespeareWordCount')
Test.assertEquals(shakespeareWordsRDD.top(5),
                  [u'zwaggerd', u'zounds', u'zounds', u'zounds', u'zounds'],
                  'incorrect value for shakespeareWordsRDD')

1 test passed.
1 test passed.


### Remove empty elements

In [9]:
shakeWordsRDD = shakespeareWordsRDD.filter(lambda word: word != '')
shakeWordCount = shakeWordsRDD.count()
print shakeWordCount

882996


In [10]:
Test.assertEquals(shakeWordCount, 882996, 'incorrect value for shakeWordCount')

1 test passed.


### Calculate word counts and list top 15 words

In [17]:
top15WordsAndCounts = (shakeWordsRDD.map(lambda word: (word,1))
                       .reduceByKey(lambda v1,v2: v1+v2)
                       .takeOrdered(15,key = lambda (k,v):(-v,k))
                       )
print '\n'.join(map(lambda (w, c): '{0}: {1}'.format(w, c), top15WordsAndCounts))

the: 27361
and: 26028
i: 20681
to: 19150
of: 17463
a: 14593
you: 13615
my: 12481
in: 10956
that: 10890
is: 9134
not: 8497
with: 7771
me: 7769
it: 7678


In [18]:
Test.assertEquals(top15WordsAndCounts,
                  [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463),
                   (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890),
                   (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)],
                  'incorrect value for top15WordsAndCounts')

1 test passed.
