<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/misc/word_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Count

# Setting up PySpark

In [None]:
# Install the PySpark package into the current notebook environment.
%pip install pyspark



In [31]:
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session running in local mode, with the web UI
# configured on port 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# SparkContext: the entry point for the RDD API.
sc = spark.sparkContext

In [4]:
!mkdir /content/files/
text = "these are words \
these are more words \
words in english"

text_file = open("/content/files/some_words.txt", "w")
text_file.write(text)
text_file.close()

In [12]:
# Load the text file as an RDD with one element per line of the file.
rdd1 = sc.textFile("/content/files/some_words.txt")

# check how many partitions the RDD was split into
rdd1.getNumPartitions()

1

In [23]:
# increase partitions: redistribute the lines of rdd1 across 3 partitions
# (repartition shuffles the data and returns a new RDD)
rdd2 = rdd1.repartition(3)
rdd2.getNumPartitions()

3

In [24]:
# flatMap - each input item can produce 0 or more output items
# (the function returns a list, and the lists are flattened into one RDD)
# "these are words" -> split by " " -> ["these", "are", "words"]
rdd3 = rdd2.flatMap(lambda line: line.split(" "))

# collect() brings every element back to the driver as a Python list
rdd3.collect()

['these',
 'are',
 'words',
 'these',
 'are',
 'more',
 'words',
 'words',
 'in',
 'english']

In [25]:
# map - each input item produces exactly one output item
# "these" -> ("these", 1)
rdd4 = rdd3.map(lambda word: (word, 1))
rdd4.collect()

[('these', 1),
 ('are', 1),
 ('words', 1),
 ('these', 1),
 ('are', 1),
 ('more', 1),
 ('words', 1),
 ('words', 1),
 ('in', 1),
 ('english', 1)]

In [37]:
# reduceByKey - when called on a dataset of (K, V) pairs, returns a dataset of
# (K, V) pairs where the values for each key are aggregated using the given
# reduce function, which must be of type (V, V) => V.
# Here the 1s attached to each word are added up, giving the word's count.

rdd5 = rdd4.reduceByKey(lambda a, b: a + b)
# sorted() orders the collected pairs on the driver, by word
sorted(rdd5.collect())

[('are', 2),
 ('english', 1),
 ('in', 1),
 ('more', 1),
 ('these', 2),
 ('words', 3)]

In [35]:
# another way to count the words - passing operator.add (equivalent to
# lambda a, b: a + b) as the reduce function
from operator import add
sorted(rdd4.reduceByKey(add).collect())

[('are', 2),
 ('english', 1),
 ('in', 1),
 ('more', 1),
 ('these', 2),
 ('words', 3)]

In [53]:
# putting all commands together: one chained pipeline from lines to sorted counts

counts = (
    rdd1.flatMap(lambda line: line.split(" "))  # line -> individual words
    .map(lambda word: (word, 1))  # word -> (word, 1)
    .reduceByKey(lambda a, b: a + b)  # sum the 1s for each word
    .sortByKey()  # order the (word, count) pairs by word
)

counts.collect()

[('are', 2),
 ('english', 1),
 ('in', 1),
 ('more', 1),
 ('these', 2),
 ('words', 3)]

In [60]:
def wordCount(file_path: str, num_partitions: int):
    """Count how often each word occurs in a text file.

    Relies on the module-level SparkContext ``sc``.

    Args:
        file_path: Path of the text file, read with ``sc.textFile``.
        num_partitions: Number of partitions passed to ``reduceByKey``.

    Returns:
        An RDD of ``(word, count)`` pairs, sorted by word.
    """
    lines = sc.textFile(file_path)
    counts = (
        # split() with no argument splits on any run of whitespace, so
        # repeated spaces, tabs and blank lines never produce "" as a word.
        lines.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b, num_partitions)
        .sortByKey()
    )
    return counts

In [61]:
# Run the word count on the sample file, passing 7 as num_partitions
output = wordCount("/content/files/some_words.txt", 7)

In [62]:
# Inspect the Python type of the object returned by wordCount
type(output) # pyspark.rdd.PipelinedRDD

In [63]:
# Bring the (word, count) pairs back to the driver as a list
output.collect()

[('are', 2),
 ('english', 1),
 ('in', 1),
 ('more', 1),
 ('these', 2),
 ('words', 3)]

In [64]:
# Number of partitions of the resulting RDD
output.getNumPartitions()

7