# Word Count Example

In [1]:
# import Pyspark libraries 
import findspark 
findspark.init()
from pyspark.sql import SparkSession 
from pyspark.conf import SparkConf 

# Here is defined Spark begining point
spark = SparkSession.builder.master("local[4]")\
.appName("Wordcount_Rdd")\
.getOrCreate()

# Here is defined SparkContext which uses for accessing to cluster
sc = spark.sparkContext

# load txt data file
data = sc.textFile("data/HanselStory.txt")

# row number is counted from txt file
print("Story Row Number: ",data.count())
# 5 row is taken from story txt file
print("\n-- Take first 5 row ---\n",data.take(5))

Story Row Number:  197

-- Take first 5 row ---
 ['Once upon a time there dwelt on the outskirts of a', 'large forest a poor woodcutter with his wife and two', 'children; the boy was called Hansel and the girl Grettel.', 'He had always little enough to live on, and once, when', "there was a great famine in the land, he couldn't even"]


In [2]:
# Words are splitted
words = data.flatMap(lambda row: row.split(" "))
print("--- Splitted Words ---\n",     words.take(10))

--- Splitted Words ---
 ['Once', 'upon', 'a', 'time', 'there', 'dwelt', 'on', 'the', 'outskirts', 'of']


In [3]:
# Words are mapped and showed key and it length 
words_number = words.map(lambda word: (word,1))
print(words_number.take(5))

[('Once', 1), ('upon', 1), ('a', 1), ('time', 1), ('there', 1)]


In [4]:
# Mapped data is reduced by key and count for calculating word number
words_number_RBW = words_number.reduceByKey(lambda key,count: key+count)
words_number_RBW.take(10)

[('Once', 1),
 ('upon', 1),
 ('there', 4),
 ('dwelt', 1),
 ('outskirts', 1),
 ('of', 31),
 ('large', 1),
 ('poor', 4),
 ('his', 18),
 ('two', 3)]

In [5]:
# Reduced data is mapped again for sorting by descending word number
words_number_RBW2 = words_number_RBW.map(lambda key: (key[1], key[0]))
words_number_RBW2.take(5)
words_number_RBW2.sortByKey(False).take(20)

[(113, 'the'),
 (91, 'and'),
 (44, 'to'),
 (42, 'a'),
 (34, 'they'),
 (31, 'of'),
 (27, 'had'),
 (19, 'was'),
 (19, 'in'),
 (19, 'he'),
 (19, 'on'),
 (18, 'his'),
 (17, 'Hansel'),
 (17, 'their'),
 (16, 'for'),
 (15, 'them'),
 (15, 'you'),
 (13, 'it'),
 (12, 'when'),
 (12, '')]