# Spark - Word Count Example in a Jupyter Notebook

In [1]:
from pyspark import SparkContext
# Reuse the SparkContext already running in this kernel, or start one if none
# exists. A bare SparkContext() raises ValueError when this cell is re-run,
# because only one context may be active per Python process.
sc = SparkContext.getOrCreate()
sc

In [2]:
# Load the compressed book into an RDD holding one string per line of text.
lines = sc.textFile(
    "./data/Alices-Adventures-in-Wonderland-by-Lewis-Carroll.txt.bz2"
)

# Peek at the first element, which is the Project Gutenberg title line.
lines.first()

'The Project Gutenberg EBook of Alice’s Adventures in Wonderland, by Lewis Carroll'

In [3]:
# Preview the first four lines of the file.
lines.take(4)

['The Project Gutenberg EBook of Alice’s Adventures in Wonderland, by Lewis Carroll',
 '',
 'This eBook is for the use of anyone anywhere in the United States and most',
 'other parts of the world at no cost and with almost no restrictions']

In [4]:
# Break every line on single spaces and flatten the pieces into one RDD of tokens.
# Blank lines and runs of consecutive spaces produce empty-string tokens,
# which are intentionally kept at this stage.
words = lines.flatMap(lambda line: line.split(' '))

# Look at the first two tokens to check what the RDD contains.
words.take(2)

['The', 'Project']

In [5]:
# Pair each token with a count of 1 so the counts can later be summed per word.
wordsAsTuples = words.map(lambda word: (word, 1))

# Inspect a single (word, 1) pair.
wordsAsTuples.take(1)

[('The', 1)]

In [6]:
# Add up the 1s of all pairs that share the same word to get its total count.
counts = wordsAsTuples.reduceByKey(lambda left, right: left + right)

# Inspect a single (word, total) pair.
counts.take(1)

[('The', 107)]

# Get the top 20 words, sorted by count

In [7]:
# The 20 (word, count) pairs with the largest counts, highest first.
# The empty-string token is still included here.
counts.top(20, key=lambda pair: pair[1])

[('the', 1682),
 ('', 1624),
 ('and', 787),
 ('to', 778),
 ('a', 667),
 ('of', 604),
 ('she', 485),
 ('said', 416),
 ('in', 408),
 ('it', 357),
 ('was', 329),
 ('you', 305),
 ('I', 249),
 ('as', 246),
 ('that', 226),
 ('Alice', 221),
 ('with', 215),
 ('at', 209),
 ('her', 204),
 ('had', 176)]

In [8]:
# Discard the empty-string token first, then rank the remaining words by count.
(
    counts
    .filter(lambda pair: pair[0] != '')
    .top(20, key=lambda pair: pair[1])
)

[('the', 1682),
 ('and', 787),
 ('to', 778),
 ('a', 667),
 ('of', 604),
 ('she', 485),
 ('said', 416),
 ('in', 408),
 ('it', 357),
 ('was', 329),
 ('you', 305),
 ('I', 249),
 ('as', 246),
 ('that', 226),
 ('Alice', 221),
 ('with', 215),
 ('at', 209),
 ('her', 204),
 ('had', 176),
 ('all', 169)]