In [15]:
# map vs. flatMap

# The map transformation applies a function to each element (row) of an RDD/DataFrame/Dataset and returns a new, transformed dataset with exactly one output per input.
# 1 => 1
# The flatMap transformation applies a function that returns a collection for every element, then flattens those collections into a new, transformed dataset.
# The returned dataset usually has more rows than the input (it can also have fewer, because the function may return an empty collection). It is also referred to as a one-to-many transformation function.
# 1 => Many
# One of the use cases of flatMap() is to flatten a column that contains arrays, lists, or any other nested collection.

import pyspark

# Reuse the already-running SparkContext when there is one; otherwise start it.
sc = pyspark.SparkContext.getOrCreate()

# map: exactly one output element per input record, so every record turns
# into its own list of comma-separated values.
rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.map(lambda pair: pair[1].split(","))
# result.collect() -> [['joe', 'sarah', 'tom'], ['hyundai']]

# flatMap: same splitting function, but the per-record lists are flattened
# into a single RDD of individual values.
rdd = sc.parallelize([("name", "joe,sarah,tom"), ("car", "hyundai")])
result = rdd.flatMap(lambda pair: pair[1].split(","))
# result.collect() -> ['joe', 'sarah', 'tom', 'hyundai']


# Word-frequency count over a local text file, printed in ascending order of
# frequency as "<count>: <word>".
test_file = "file:///home/jovyan/work/sample/lorem_ipsum.txt"
lines = sc.textFile(test_file)
# str.split() with no arguments splits on runs of whitespace; flatMap merges
# the per-line word lists into one RDD of words.
words = lines.flatMap(lambda line: line.split())

# Alternative: countByValue() hands back the counts as a word -> count mapping.
# word_count = words.countByValue()
# for word, count in word_count.items():
#     print(f"{word}: {count}")

# To order by frequency, build (word, count) pairs, then swap each pair to
# (count, word) so that sortByKey() sorts on the count.
word_count = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
sorted_word_count = word_count.map(lambda pair: (pair[1], pair[0])).sortByKey()
# Each collected pair is (count, word); unpack in that order so the variable
# names match what they hold.
for count, word in sorted_word_count.collect():
    print(f"{count}: {word}")

1: is
1: printing
1: typesetting
1: industry's
1: ever
1: 1500s,
1: when
1: an
1: unknown
1: took
1: galley
1: make
1: book.
1: only
1: but
1: into
1: typesetting,
1: remaining
1: essentially
1: was
1: in
1: 1960s
1: passages,
1: more
1: desktop
1: like
1: Aldus
1: PageMaker
1: versions
1: simply
1: industry.
1: been
1: standard
1: since
1: printer
1: scrambled
1: it
1: to
1: specimen
1: survived
1: not
1: five
1: centuries,
1: also
1: leap
1: electronic
1: unchanged.
1: popularised
1: release
1: Letraset
1: sheets
1: containing
1: recently
1: publishing
1: software
1: including
1: Ipsum.
2: dummy
2: type
2: It
2: text
2: has
2: a
2: with
3: Ipsum
3: and
4: Lorem
4: of
6: the
