# Spark - Word Count Example in Jupyter Notebook

In [10]:
from pyspark import SparkContext
sc = SparkContext("local", "WordCount")
sc

In [11]:
# Read text data and create an RDD
lines = sc.textFile("..\data\Alices-Adventures-in-Wonderland-by-Lewis-Carroll.txt.bz2")

# First line is the header. 
lines.first()

'The Project Gutenberg EBook of Alice’s Adventures in Wonderland, by Lewis Carroll'

In [3]:
lines.take(5)

['The Project Gutenberg EBook of Alice’s Adventures in Wonderland, by Lewis Carroll',
 '',
 'This eBook is for the use of anyone anywhere in the United States and most',
 'other parts of the world at no cost and with almost no restrictions',
 'whatsoever.  You may copy it, give it away or re-use it under the terms of']

In [4]:
# Tokenize into words
words = lines.flatMap(lambda line: line.split(" "))

# Check the results 
words.take(10)

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland,',
 'by']

In [5]:
# Perform word mapping
word_counts = words.map(lambda word: (word, 1))    

# Check word mapping results
word_counts.take(20)

[('The', 1),
 ('Project', 1),
 ('Gutenberg', 1),
 ('EBook', 1),
 ('of', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland,', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1)]

In [6]:
# Perform word reducing
word_counts = word_counts.reduceByKey(lambda a, b: a + b)    

# Check word reducing results
word_counts.take(20)

[('The', 107),
 ('Project', 79),
 ('Gutenberg', 21),
 ('EBook', 1),
 ('of', 604),
 ('Alice’s', 14),
 ('Adventures', 5),
 ('in', 408),
 ('Wonderland,', 3),
 ('by', 81),
 ('Lewis', 4),
 ('Carroll', 4),
 ('', 1624),
 ('This', 20),
 ('eBook', 5),
 ('is', 86),
 ('for', 146),
 ('the', 1682),
 ('use', 24),
 ('anyone', 4)]

# Get top 20 words 

In [7]:
# Take the top 20 used words
top_20_word_counts = word_counts.top(20, lambda word: word[1])

top_20_word_counts

[('the', 1682),
 ('', 1624),
 ('and', 787),
 ('to', 778),
 ('a', 667),
 ('of', 604),
 ('she', 485),
 ('said', 416),
 ('in', 408),
 ('it', 357),
 ('was', 329),
 ('you', 305),
 ('I', 249),
 ('as', 246),
 ('that', 226),
 ('Alice', 221),
 ('with', 215),
 ('at', 209),
 ('her', 204),
 ('had', 176)]

# Get bottom 20 words 

In [8]:
# Take the bottom 20 used words
bottom_20_word_counts = word_counts.takeOrdered(20, lambda word: word[1])

bottom_20_word_counts

[('EBook', 1),
 ('Title:', 1),
 ('Author:', 1),
 ('Release', 1),
 ('Date:', 1),
 ('June', 1),
 ('25,', 1),
 ('2008', 1),
 ('[EBook', 1),
 ('#11]', 1),
 ('[Most', 1),
 ('recently', 1),
 ('updated:', 1),
 ('October', 1),
 ('12,', 1),
 ('2020]', 1),
 ('Language:', 1),
 ('Character', 1),
 ('encoding:', 1),
 ('UTF-8', 1)]

In [9]:
# Stop Spark context
sc.stop()