In [1]:
from pyspark.sql import SparkSession

# Obtain a SparkSession configured with the "local" master and the
# application name "UseMap".
spark = (
    SparkSession.builder
    .master("local")
    .appName("UseMap")
    .getOrCreate()
)

# Sample names that seed the RDD used by the rest of the notebook.
data = [
    "Josefina", "Jasmin", "Jorjeana", "Jorge", "Pastisa",
    "Rosefta", "Kumaqr", "Katherine", "Keizny", "Kiwran",
]
rdd = spark.sparkContext.parallelize(data)

# map() produces exactly one output element per input element; here each
# name gets the "_Moral" suffix appended.
rdd_map = rdd.map(lambda name: name + "_Moral")

# collect() brings the transformed elements back as a Python list.
result_map = rdd_map.collect()

print("Appending '_Moral' to each name using map()")
print(result_map)

Appending '_Moral' to each name using map()
['Josefina_Moral', 'Jasmin_Moral', 'Jorjeana_Moral', 'Jorge_Moral', 'Pastisa_Moral', 'Rosefta_Moral', 'Kumaqr_Moral', 'Katherine_Moral', 'Keizny_Moral', 'Kiwran_Moral']


In [2]:

def _starts_with_k(name):
    """Return True when ``name`` begins with an uppercase 'K'."""
    return name.startswith('K')


# filter() keeps only the elements for which the predicate is true. It runs
# on the suffixed names from rdd_map, so the survivors keep their "_Moral".
rdd_filter = rdd_map.filter(_starts_with_k)
result_filter = rdd_filter.collect()

print("filter() This keep only names that start with 'K'")
print("result:", result_filter)

filter() This keep only names that start with 'K'
result: ['Kumaqr_Moral', 'Katherine_Moral', 'Keizny_Moral', 'Kiwran_Moral']


In [3]:

# flatMap() flattens whatever iterable the function returns for each element.
# A string iterates character by character, so list(name) turns every
# filtered name into its individual characters (including '_').
rdd_flatmap = rdd_filter.flatMap(list)
result_flatmap = rdd_flatmap.collect()

print("Flatmap() Split the name into one char")
print("result:", result_flatmap)

Flatmap() Split the name into one char
result: ['K', 'u', 'm', 'a', 'q', 'r', '_', 'M', 'o', 'r', 'a', 'l', 'K', 'a', 't', 'h', 'e', 'r', 'i', 'n', 'e', '_', 'M', 'o', 'r', 'a', 'l', 'K', 'e', 'i', 'z', 'n', 'y', '_', 'M', 'o', 'r', 'a', 'l', 'K', 'i', 'w', 'r', 'a', 'n', '_', 'M', 'o', 'r', 'a', 'l']


In [4]:

# distinct() drops repeated elements, leaving one occurrence of each
# character produced by the flatMap step.
rdd_distinct = rdd_flatmap.distinct()
result_distinct = rdd_distinct.collect()

print("distinct() transformation: Remove duplicate characters")
print("result:", result_distinct)


distinct() transformation: Remove duplicate characters
result: ['K', 'u', 'm', 'a', 'q', 'r', '_', 'M', 'o', 'l', 't', 'h', 'e', 'i', 'n', 'z', 'y', 'w']


In [5]:

# sortBy() orders the elements by the key function; the key here is the
# character itself. Strings compare by code point, so uppercase letters and
# '_' come before the lowercase letters in the result.
rdd_sorted = rdd_distinct.sortBy(keyfunc=lambda ch: ch)
result_sorted = rdd_sorted.collect()

print("sortBy() transformation: Sort the distinct characters alphabetically")
print("result:", result_sorted)


sortBy() transformation: Sort the distinct characters alphabetically
result: ['K', 'M', '_', 'a', 'e', 'h', 'i', 'l', 'm', 'n', 'o', 'q', 'r', 't', 'u', 'w', 'y', 'z']
