In [0]:
# Example RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Transformations

## `Map` Function

In [0]:
# 1. map
print("### 1. map ###")
print("Description: Return a new RDD by applying a function to all elements of this RDD.")

# Example 1: Multiply each element by 2
simple_map = rdd.map(lambda x: x * 2).collect()
print("01 map example (multiply by 2):", simple_map)

# Example 2: Extract the length of each word in a list of sentences
sentences = ["Hello world", "Apache Spark", "RDD transformations Wide Vs Narrow Spark"]
# Hello World => split (" ") => [(0)-> Hello, (1) -> World]
sentence_rdd = sc.parallelize(sentences)
words_map = sentence_rdd.map(lambda sentence: len(sentence.split(" "))).collect()
print("example_map example (word count in sentences):", words_map)

### 1. map ###
Description: Return a new RDD by applying a function to all elements of this RDD.
01 map example (multiply by 2): [2, 4, 6, 8, 10]
example_map example (word count in sentences): [2, 2, 6]


## `Filter` Function

In [0]:
# 2. filter
print("\n### 2. filter ###")
print("Description: Return a new RDD containing only the elements that satisfy a predicate.")

# 01 Example: Filter out even numbers
simple_filter = rdd.filter(lambda x: x % 2 == 0).collect()
print("01 filter example (even numbers):", simple_filter)

# example_Example: Filter sentences containing the word 'Spark'
words_filter = sentence_rdd.filter(lambda sentence: "Spark" in sentence).collect()
print("example_ filter example (sentences with 'Spark'):", words_filter)


### 2. filter ###
Description: Return a new RDD containing only the elements that satisfy a predicate.
01 filter example (even numbers): [2, 4]
example_ filter example (sentences with 'Spark'): ['Apache Spark', 'RDD transformations Wide Vs Narrow Spark']


## `FlatMap` Function

In [0]:
# 3. flatMap
print("\n### 3. flatMap ###")
print("Description: Return a new RDD by applying a function to all elements of this RDD and then flattening the results.")

# 01 Example: Split sentences into words
sentences_mapped = sentence_rdd.map(lambda sentence: sentence.split(" ")).collect()
print("01 sentences_mapped:", sentences_mapped)

simple_flatMap = sentence_rdd.flatMap(lambda sentence: sentence.split(" ")).collect()
print("02 flatMap example (split sentences into words):", simple_flatMap)

# example_Example: Flatten a list of lists
nested_lists = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
nested_rdd = sc.parallelize(nested_lists)
flatten_list = nested_rdd.flatMap(lambda x: x).collect()
print("flatten_list  flatMap example (flatten list of lists):", flatten_list)


### 3. flatMap ###
Description: Return a new RDD by applying a function to all elements of this RDD and then flattening the results.
01 sentences_mapped: [['Hello', 'world'], ['Apache', 'Spark'], ['RDD', 'transformations', 'Wide', 'Vs', 'Narrow', 'Spark']]
02 flatMap example (split sentences into words): ['Hello', 'world', 'Apache', 'Spark', 'RDD', 'transformations', 'Wide', 'Vs', 'Narrow', 'Spark']
flatten_list  flatMap example (flatten list of lists): [1, 2, 3, 4, 5, 6, 7, 8, 9]


## `Reduce` Function

In [0]:
# 4. reduce
print("\n### 4. reduce ###")
print("Description: Reduces the elements of this RDD using the specified commutative and associative binary operator.")

# 01 Example: Sum of elements
simple_reduce = rdd.reduce(lambda x, y: x + y)
print("01 reduce example (sum of elements):", simple_reduce)

# example_Example: Find the longest word in a list of words
words = ["cat", "elephant", "rat", "hippopotamus"]
words_rdd = sc.parallelize(words)
words_rdd_reduced = words_rdd.reduce(lambda x, y: x if len(x) > len(y) else y)
print("reduce example (longest word):", words_rdd_reduced)


### 4. reduce ###
Description: Reduces the elements of this RDD using the specified commutative and associative binary operator.
01 reduce example (sum of elements): 15
reduce example (longest word): hippopotamus


## `groupByKey` Function

In [0]:
# 5. groupByKey
print("\n### 5. groupByKey ###")
print("Description: Group the values for each key in the RDD into a single sequence.")

# 01 Example: Group numbers by even and odd
pairs = [(1, 'a'),(1, 'ali'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')]
pairs_rdd = sc.parallelize(pairs)
simple_groupByKey = pairs_rdd.groupByKey().mapValues(list).collect()
print("01 groupByKey example (group numbers):", simple_groupByKey)

# example_Example: Group words by their starting letter
words_pairs = [("cat", 1), ("car", 2), ("dog", 3), ("deer", 4), ("elephant", 5),("elephant", 20)]
words_rdd = sc.parallelize(words_pairs)
# mapValues(list) converts the grouped values (which are iterable) into lists.
words_grouped = words_rdd.groupByKey().mapValues(list).collect()
print("words_grouped example (group words by starting letter):", words_grouped)


### 5. groupByKey ###
Description: Group the values for each key in the RDD into a single sequence.
01 groupByKey example (group numbers): [(1, ['a', 'ali']), (2, ['b']), (3, ['c']), (4, ['d']), (5, ['e'])]
words_grouped example (group words by starting letter): [('elephant', [5, 20]), ('dog', [3]), ('cat', [1]), ('car', [2]), ('deer', [4])]


## `reduceByKey` Function

In [0]:
# 6. reduceByKey
print("\n### 6. reduceByKey ###")
print("Description: Merge the values for each key using an associative and commutative reduce function.")
pairs = [(1, 'a'),(1, '_a'), (2, 'b'), (2, '_b'), (3, 'c'), (4, 'd'), (5, 'e')]
pairs_rdd = sc.parallelize(pairs)

# 01 Example: Sum values with the same key
simple_reduceByKey = pairs_rdd.reduceByKey(lambda x, y: x + y).collect()
print("01 reduceByKey example (sum values by key):", simple_reduceByKey)

# example_Example: Count the occurrences of each word in a list
word_list = ["cat", "cat", "dog", "elephant", "dog", "dog"]
word_pairs_rdd = sc.parallelize(word_list).map(lambda word: (word, 1))
example__reduceByKey = word_pairs_rdd.reduceByKey(lambda x, y: x + y).collect()
print("example_ reduceByKey example (word count):", example__reduceByKey)


### 6. reduceByKey ###
Description: Merge the values for each key using an associative and commutative reduce function.
01 reduceByKey example (sum values by key): [(1, 'a_a'), (2, 'b_b'), (3, 'c'), (4, 'd'), (5, 'e')]
example_ reduceByKey example (word count): [('elephant', 1), ('dog', 3), ('cat', 2)]


## `join` Function

In [0]:
# 7. join
print("\n### 7. join ###")
print("Description: Perform an inner join of this RDD and another one.")

# 01 Example: Join two RDDs by key
fruits = sc.parallelize([(1, "apple"), (2, "banana")])
colors = sc.parallelize([(1, "red"), (2, "yellow")])
fruits_color_join = fruits.join(colors).collect()
print("01 join fruits_color_join (join two RDDs):", fruits_color_join)

# example_Example: Join employee data with department data
employees = sc.parallelize([(1, "John"), (2, "Jane"), (3, "Joe")])
departments = sc.parallelize([(1, "HR"), (2, "Finance")])
employees_department_join = employees.join(departments).collect()
print("join example (employee-department join):", employees_department_join)


### 7. join ###
Description: Perform an inner join of this RDD and another one.
01 join fruits_color_join (join two RDDs): [(1, ('apple', 'red')), (2, ('banana', 'yellow'))]
join example (employee-department join): [(1, ('John', 'HR')), (2, ('Jane', 'Finance'))]


## `cogroup` Function

TableA:

| id | value  |
|----|--------|
|  1 | apple  |
|  2 | banana |
|  3 | orange |


TableB:

| id | color  |
|----|--------|
|  1 | red    |
|  2 | yellow |


Result of cogroup:

| id | value  | color  |
|----|--------|--------|
|  1 | apple  | red    |
|  2 | banana | yellow |
|  3 | orange | NULL   |



In [0]:
# 8. cogroup
# The cogroup function in PySpark is used to group data from two RDDs that share the same key. 
# It combines the values of matching keys from both RDDs into a tuple of lists.
print("\n### 8. cogroup ###")
print("Description: Group data from two RDDs sharing the same key.")

# 01 Example: Cogroup two RDDs
fruits_rdd = sc.parallelize([(1, "apple"), (2, "banana"), (3, "orange")])
colors_rdd = sc.parallelize([(1, "red"), (2, "yellow")])
cogrouped_fruits_colors = fruits_rdd.cogroup(colors_rdd).mapValues(lambda x: (list(x[0]), list(x[1]))).collect()
print("01 cogroup example (group two RDDs):", cogrouped_fruits_colors)



# example_Example: Cogroup sales data with target data
sales_rdd = sc.parallelize([("store1", 100), ("store2", 200)])
targets_rdd = sc.parallelize([("store1", 150), ("store3", 250)])
cogrouped_sales_targets = sales_rdd.cogroup(targets_rdd).mapValues(lambda x: (list(x[0]), list(x[1]))).collect()
print("example_cogroup example (sales-targets cogroup):", cogrouped_sales_targets)



### 8. cogroup ###
Description: Group data from two RDDs sharing the same key.
01 cogroup example (group two RDDs): [(1, (['apple'], ['red'])), (2, (['banana'], ['yellow'])), (3, (['orange'], []))]
example_cogroup example (sales-targets cogroup): [('store2', ([200], [])), ('store3', ([], [250])), ('store1', ([100], [150]))]


## `distinct` Function

In [0]:
# 9. distinct
print("\n### 9. distinct ###")
print("Description: Return a new RDD containing the distinct elements in this RDD.")

# example_Example: Unique words from a list of words
words = ["cat", "dog", "cat", "elephant", "dog"]
words_rdd = sc.parallelize(words)
example__distinct = words_rdd.distinct().collect()
print("example_distinct example (unique words):", example__distinct)


### 9. distinct ###
Description: Return a new RDD containing the distinct elements in this RDD.
example_distinct example (unique words): ['elephant', 'dog', 'cat']


## `repartition` Vs. `coalesce` Function

In [0]:
from pyspark.sql.functions import col, expr
import random
import string
from datetime import datetime, timedelta

# Function to generate random log entry
def generate_log_entry():
    user_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
    action = random.choice(["login", "logout", "purchase", "click", "view"])
    item_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
    timestamp = (datetime.now() - timedelta(seconds=random.randint(0, 2592000))).strftime("%Y-%m-%d %H:%M:%S")
    return (user_id, action, item_id, timestamp)

# Generate synthetic data
log_entries = [generate_log_entry() for _ in range(1000000)]

# Create DataFrame
columns = ["user_id", "action", "item_id", "timestamp"]
log_df = spark.createDataFrame(log_entries, columns)

# Show sample data
log_df.show(10, truncate=False)

# Save to a CSV file in the DBFS (Databricks File System)
log_df.write.csv("/tmp/user_logs", header=True, mode="overwrite")


+--------+--------+-------+-------------------+
|user_id |action  |item_id|timestamp          |
+--------+--------+-------+-------------------+
|gn6o7l44|logout  |KGLZK  |2024-05-16 21:02:40|
|wtyzg6kd|purchase|2TM09  |2024-05-25 07:59:17|
|gw874pba|logout  |7FKCH  |2024-05-23 15:05:04|
|q58k6f2j|logout  |W1PV8  |2024-05-08 04:35:12|
|hqdfhtzv|click   |0JX6B  |2024-05-17 15:43:08|
|s39ffvi4|login   |34JRH  |2024-05-30 14:53:57|
|r2mza25t|login   |CGOJ8  |2024-05-19 11:08:39|
|krq030ed|click   |JED4X  |2024-06-03 03:36:20|
|agk3bhc2|click   |6RPA2  |2024-05-18 05:56:00|
|e9f4x48y|logout  |2VFBK  |2024-06-02 08:09:44|
+--------+--------+-------+-------------------+
only showing top 10 rows



## `repartition` Function

In [0]:
# 10. repartition
#https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/RDD.scala#L480
print("\n### 10. repartition ###")
print("Description: Return a new RDD that has exactly numPartitions partitions.")

logs_rdd = sc.textFile("/tmp/user_logs")

# Initial number of partitions
initial_partitions = logs_rdd.getNumPartitions()
print(f"Initial Partitions: {initial_partitions}")

# Repartition to 200 partitions
repartitioned_rdd = logs_rdd.repartition(100)
new_partitions = repartitioned_rdd.getNumPartitions()
print(f"New Partitions after Repartition: {new_partitions}")



### 10. repartition ###
Description: Return a new RDD that has exactly numPartitions partitions.
Initial Partitions: 8
New Partitions after Repartition: 100


## `coalesce` Function

In [0]:
# 11. coalesce
print("\n### 11. coalesce ###")
print("Description: Return a new RDD that is reduced into numPartitions partitions.")
#https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/RDD.scala#L506
#https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala
# Initial number of partitions
initial_partitions = logs_rdd.getNumPartitions()
print(f"Initial Partitions: {initial_partitions}")

# Coalesce to 4 partitions
coalesced_rdd_1 = logs_rdd.coalesce(2)
new_partitions_1 = coalesced_rdd_1.getNumPartitions()
print(f"new_partitions_1 after Coalesce: {new_partitions_1}")

# Coalesce to 50 partitions
coalesced_rdd_2 = logs_rdd.coalesce(10)
new_partitions_2 = coalesced_rdd_2.getNumPartitions()
print(f"new_partitions_2 after Coalesce: {new_partitions_2}")



### 11. coalesce ###
Description: Return a new RDD that is reduced into numPartitions partitions.
Initial Partitions: 8
new_partitions_1 after Coalesce: 2
new_partitions_2 after Coalesce: 8


## `sample` Function

In [0]:
# 12. sample
print("\n### 12. sample ###")
print("Description: Return a sampled subset of this RDD.")

# 01 Example: Sample 50% of the elements without replacement
simple_sample = rdd.sample(False, 0.5).collect()
print("01 sample example (50% sample):", simple_sample)

# example_Example: Sample 30% of the elements with replacement
example__sample = large_data.sample(True, 0.3).collect()
print("example_ sample example (30% sample with replacement):", example__sample)


### 12. sample ###
Description: Return a sampled subset of this RDD.
01 sample example (50% sample): [2, 5]
Complex sample example (30% sample with replacement): [1, 3, 6, 8, 9, 16, 16, 17, 20, 27, 32, 38, 39, 42, 44, 44, 48, 49, 57, 60, 61, 63, 69, 73, 78, 82, 88, 90, 94, 102, 106, 107, 112, 115, 118, 124, 128, 129, 131, 132, 134, 137, 144, 146, 153, 156, 161, 163, 163, 171, 176, 179, 181, 182, 188, 189, 195, 197, 199, 201, 202, 202, 209, 209, 218, 219, 224, 225, 240, 254, 258, 260, 261, 261, 261, 263, 267, 272, 276, 280, 281, 286, 292, 296, 297, 297, 297, 299, 303, 304, 305, 306, 306, 308, 309, 315, 318, 324, 325, 327, 327, 328, 335, 336, 337, 338, 346, 348, 355, 356, 359, 359, 363, 364, 370, 373, 374, 384, 389, 389, 396, 398, 400, 400, 404, 411, 412, 413, 414, 419, 419, 422, 423, 423, 426, 430, 431, 434, 434, 434, 439, 442, 443, 446, 454, 455, 457, 458, 461, 462, 468, 476, 478, 481, 482, 483, 483, 485, 485, 487, 487, 490, 491, 491, 494, 494, 496, 497, 500, 503, 507, 507, 509, 512, 

## `randomSplit` Function

In [0]:
# 13. randomSplit
print("\n### 13. randomSplit ###")
print("Description: Randomly splits this RDD with the provided weights.")

# 01 Example: Split into two parts with weights 0.7 and 0.3
simple_randomSplit = rdd.randomSplit([0.7, 0.3])
print("01 randomSplit example (70% and 30%):", [part.collect() for part in simple_randomSplit])

# example_Example: Split a large dataset into three parts with weights 0.5, 0.3, and 0.2
example__randomSplit = large_data.randomSplit([0.5, 0.3, 0.2])
print("example_ randomSplit example (50%, 30%, and 20%):", [part.count() for part in example__randomSplit])


### 13. randomSplit ###
Description: Randomly splits this RDD with the provided weights.
01 randomSplit example (70% and 30%): [[1, 2, 3, 5], [4]]
Complex randomSplit example (50%, 30%, and 20%): [477, 326, 197]


## `union` Function

In [0]:
# 14. union
print("\n### 14. union ###")
print("Description: Return the union of this RDD and another one.")

# 01 Example: Union two RDDs
rdd2 = sc.parallelize([6, 7, 8, 9, 10])
simple_union = rdd.union(rdd2).collect()
print("01 union example (union of two RDDs):", simple_union)

# example_Example: Union multiple RDDs
rdd3 = sc.parallelize([11, 12, 13])
example__union = rdd.union(rdd2).union(rdd3).collect()
print("example_ union example (union of multiple RDDs):", example__union)


### 14. union ###
Description: Return the union of this RDD and another one.
01 union example (union of two RDDs): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Complex union example (union of multiple RDDs): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


## `intersection` Function

In [0]:
# 15. intersection
print("\n### 15. intersection ###")
print("Description: Return the intersection of this RDD and another one.")

# 01 Example: Intersection of two RDDs
simple_intersection = rdd.intersection(rdd2).collect()
print("01 intersection example (intersection of two RDDs):", simple_intersection)

# example_Example: Intersection of large datasets
large_rdd1 = sc.parallelize(range(100, 200))
large_rdd2 = sc.parallelize(range(150, 250))
example__intersection = large_rdd1.intersection(large_rdd2).collect()
print("example_ intersection example (intersection of large datasets):", example__intersection)


### 15. intersection ###
Description: Return the intersection of this RDD and another one.
01 intersection example (intersection of two RDDs): []
Complex intersection example (intersection of large datasets): [160, 176, 192, 161, 177, 193, 162, 178, 194, 163, 179, 195, 164, 180, 196, 165, 181, 197, 150, 166, 182, 198, 151, 167, 183, 199, 152, 168, 184, 153, 169, 185, 154, 170, 186, 155, 171, 187, 156, 172, 188, 157, 173, 189, 158, 174, 190, 159, 175, 191]


## `subtract` Function

In [0]:
# 16. subtract
print("\n### 16. subtract ###")
print("Description: Return an RDD with the elements from this that are not in other.")

# 01 Example: Subtract elements of another RDD
simple_subtract = rdd.subtract(sc.parallelize([1, 2])).collect()
print("01 subtract example (subtract elements):", simple_subtract)

# example_Example: Subtract elements from a large dataset
example__subtract = large_data.subtract(sc.parallelize(range(10))).collect()
print("example_ subtract example (subtract elements from large dataset):", example__subtract)


### 16. subtract ###
Description: Return an RDD with the elements from this that are not in other.
01 subtract example (subtract elements): [3, 4, 5]
Complex subtract example (subtract elements from large dataset): [18, 36, 54, 72, 90, 108, 126, 144, 162, 180, 198, 216, 234, 252, 270, 288, 306, 324, 342, 360, 378, 396, 414, 432, 450, 468, 486, 504, 522, 540, 558, 576, 594, 612, 630, 648, 666, 684, 702, 720, 738, 756, 774, 792, 810, 828, 846, 864, 882, 900, 918, 936, 954, 972, 990, 19, 37, 55, 73, 91, 109, 127, 145, 163, 181, 199, 217, 235, 253, 271, 289, 307, 325, 343, 361, 379, 397, 415, 433, 451, 469, 487, 505, 523, 541, 559, 577, 595, 613, 631, 649, 667, 685, 703, 721, 739, 757, 775, 793, 811, 829, 847, 865, 883, 901, 919, 937, 955, 973, 991, 20, 38, 56, 74, 92, 110, 128, 146, 164, 182, 200, 218, 236, 254, 272, 290, 308, 326, 344, 362, 380, 398, 416, 434, 452, 470, 488, 506, 524, 542, 560, 578, 596, 614, 632, 650, 668, 686, 704, 722, 740, 758, 776, 794, 812, 830, 848, 866, 884, 902

## `groupBy` Function

In [0]:
# 17. groupBy
print("\n### 17. groupBy ###")
print("Description: Return an RDD of grouped items. Each group consists of a key and a sequence of elements mapping to that key.")

# 01 Example: Group numbers by even and odd
simple_groupBy = rdd.groupBy(lambda x: x % 2).mapValues(list).collect()
print("01 groupBy example (group by even and odd):", simple_groupBy)

# example_Example: Group words by their first letter
example__groupBy = words_rdd.groupBy(lambda x: x[0]).mapValues(list).collect()
print("example_ groupBy example (group words by first letter):", example__groupBy)


### 17. groupBy ###
Description: Return an RDD of grouped items. Each group consists of a key and a sequence of elements mapping to that key.
01 groupBy example (group by even and odd): [(0, [2, 4]), (1, [1, 3, 5])]
Complex groupBy example (group words by first letter): [('e', ['elephant']), ('c', ['cat', 'cat']), ('d', ['dog', 'dog'])]


## `cartesian` Function

In [0]:
# 18. cartesian
print("\n### 18. cartesian ###")
print("Description: Return the Cartesian product of this RDD and another one.")

# 01 Example: Cartesian product of two RDDs
simple_cartesian = rdd.cartesian(rdd2).collect()
print("01 cartesian example (Cartesian product):", simple_cartesian)

# example_Example: Cartesian product of large datasets
large_rdd1 = sc.parallelize(range(100, 105))
large_rdd2 = sc.parallelize(range(200, 205))
example__cartesian = large_rdd1.cartesian(large_rdd2).collect()
print("example_ cartesian example (Cartesian product of large datasets):", example__cartesian)


### 18. cartesian ###
Description: Return the Cartesian product of this RDD and another one.
01 cartesian example (Cartesian product): [(1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10)]
Complex cartesian example (Cartesian product of large datasets): [(100, 200), (100, 201), (100, 202), (100, 203), (100, 204), (101, 200), (101, 201), (101, 202), (101, 203), (101, 204), (102, 200), (102, 201), (102, 202), (102, 203), (102, 204), (103, 200), (103, 201), (103, 202), (103, 203), (103, 204), (104, 200), (104, 201), (104, 202), (104, 203), (104, 204)]


## `pipe` Function

In [0]:
# 19. pipe
print("\n### 19. pipe ###")
print("Description: Return an RDD created by piping elements to a forked external process.")

# 01 Example: Pipe elements to 'cat' command
simple_pipe = rdd.pipe('cat').collect()
print("01 pipe example (pipe to 'cat' command):", simple_pipe)

# example_Example: Pipe elements through a shell script
# Note: For demonstration purposes, assuming a script 'echo.sh' that echoes input
#example__pipe = rdd.pipe('./echo.sh').collect()
#print("example_ pipe example (pipe through 'echo.sh' script):", example__pipe)


### 19. pipe ###
Description: Return an RDD created by piping elements to a forked external process.
01 pipe example (pipe to 'cat' command): ['1', '2', '3', '4', '5']


## `zip` Function

In [0]:
# 20. zip
print("\n### 20. zip ###")
print("Description: Zips this RDD with another one, returning key-value pairs with the first element in each RDD, second element in each RDD, etc.")

# 01 Example: Zip two RDDs
simple_zip = rdd.zip(rdd2).collect()
print("01 zip example (zip two RDDs):", simple_zip)

# example_Example: Zip large datasets
large_rdd3 = sc.parallelize(range(1000, 1005))
large_rdd4 = sc.parallelize(range(2000, 2005))
example__zip = large_rdd3.zip(large_rdd4).collect()
print("example_ zip example (zip large datasets):", example__zip)


### 20. zip ###
Description: Zips this RDD with another one, returning key-value pairs with the first element in each RDD, second element in each RDD, etc.
01 zip example (zip two RDDs): [(1, 6), (2, 7), (3, 8), (4, 9), (5, 10)]
Complex zip example (zip large datasets): [(1000, 2000), (1001, 2001), (1002, 2002), (1003, 2003), (1004, 2004)]


## `zipPartitions` Function

In [0]:
# 21. zipPartitions
print("\n### 21. zipPartitions ###")
print("Description: Zip this RDD's partitions with one (or more) RDD(s) and return a new RDD by applying a function to the zipped partitions.")

# 01 Example: Zip partitions of two RDDs and sum their elements
simple_zipPartitions = rdd.zipPartitions(rdd2, lambda x, y: (a + b for a, b in zip(x, y))).collect()
print("01 zipPartitions example (sum elements in partitions):", simple_zipPartitions)

# example_Example: Zip partitions of large datasets and multiply their elements
example__zipPartitions = large_rdd3.zipPartitions(large_rdd4, lambda x, y: (a * b for a, b in zip(x, y))).collect()
print("example_ zipPartitions example (multiply elements in partitions):", example__zipPartitions)


### 21. zipPartitions ###
Description: Zip this RDD's partitions with one (or more) RDD(s) and return a new RDD by applying a function to the zipped partitions.


[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-721154795914227>:6[0m
[1;32m      3[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mDescription: Zip this RDD[39m[38;5;124m'[39m[38;5;124ms partitions with one (or more) RDD(s) and return a new RDD by applying a function to the zipped partitions.[39m[38;5;124m"[39m)
[1;32m      5[0m [38;5;66;03m# 01 Example: Zip partitions of two RDDs and sum their elements[39;00m
[0;32m----> 6[0m simple_zipPartitions [38;5;241m=[39m rdd[38;5;241m.[39mzipPartitions(rdd2, [38;5;28;01mlambda[39;00m x, y: (a [38;5;241m+[39m b [38;5;28;01mfor[39;00m a, b [38;5;129;01min[39;00m [38;5;28mzip[39m(x, y)))[38;5;241m.[39mcollect()
[1;32m      7[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124m01 zipPartitions example (sum elements in partitions):[39m[38;5;124m"[39m, simple_zipP

## `mapPartitions` Function

In [0]:
# 22. mapPartitions
print("\n### 22. mapPartitions ###")
print("Description: Return a new RDD by applying a function to each partition of this RDD.")

# 01 Example: Sum elements within each partition
simple_mapPartitions = rdd.mapPartitions(lambda x: [sum(x)]).collect()
print("01 mapPartitions example (sum of elements in each partition):", simple_mapPartitions)

# example_Example: Find the maximum element within each partition
example__mapPartitions = large_data.mapPartitions(lambda x: [max(x)]).collect()
print("example_ mapPartitions example (max element in each partition):", example__mapPartitions)



## `mapPartitionsWithIndex` Function

In [0]:
# 23. mapPartitionsWithIndex
print("\n### 23. mapPartitionsWithIndex ###")
print("Description: Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.")

# 01 Example: Add partition index to elements
simple_mapPartitionsWithIndex = rdd.mapPartitionsWithIndex(lambda idx, x: [(idx, e) for e in x]).collect()
print("01 mapPartitionsWithIndex example (add partition index):", simple_mapPartitionsWithIndex)

# example_Example: Find the partition index and max element within each partition
example__mapPartitionsWithIndex = large_data.mapPartitionsWithIndex(lambda idx, x: [(idx, max(x))]).collect()
print("example_ mapPartitionsWithIndex example (partition index and max element):", example__mapPartitionsWithIndex)



# Actions

## `collect` Function

In [0]:
# 24. collect
print("\n### 24. collect ###")
print("Description: Return an array that contains all of the elements in this RDD.")

# 01 Example: Collect elements of the RDD
simple_collect = rdd.collect()
print("01 collect example (collect elements):", simple_collect)

# example_Example: Collect elements of a large dataset
example__collect = large_data.collect()
print("example_ collect example (collect large dataset):", example__collect)



## `count` Function

In [0]:
# 25. count
print("\n### 25. count ###")
print("Description: Return the number of elements in the RDD.")

# 01 Example: Count the number of elements
simple_count = rdd.count()
print("01 count example (number of elements):", simple_count)

# example_Example: Count the number of distinct words
example__count = words_rdd.distinct().count()
print("example_ count example (number of unique words):", example__count)



## `Take` Function

In [0]:
# 26. take
print("\n### 26. take ###")
print("Description: Take the first num elements of the RDD.")

# 01 Example: Take the first 3 elements
simple_take = rdd.take(3)
print("01 take example (first 3 elements):", simple_take)

# example_Example: Take the first 10 elements from a large dataset
example__take = large_data.take(10)
print("example_ take example (first 10 elements):", example__take)



## `takeSample` Function

In [0]:
# 27. takeSample
print("\n### 27. takeSample ###")
print("Description: Return a fixed-size sampled subset of this RDD.")

# 01 Example: Take 3 samples without replacement
simple_takeSample = rdd.takeSample(False, 3)
print("01 takeSample example (3 samples):", simple_takeSample)

# example_Example: Take 5 samples with replacement
example__takeSample = large_data.takeSample(True, 5)
print("example_ takeSample example (5 samples with replacement):", example__takeSample)



## `foreach` Function

In [0]:
# 28. foreach
print("\n### 28. foreach ###")
print("Description: Applies a function f to all elements of this RDD.")

# 01 Example: Print each element
print("01 foreach example (print each element):")
rdd.foreach(lambda x: print(x))

# example_Example: Save each element to a file
# Note: For demonstration purposes, assuming a function save_to_file is defined
def save_to_file(x):
    with open('/tmp/output.txt', 'a') as f:
        f.write(f"{x}\n")

rdd.foreach(save_to_file)
print("example_ foreach example (save each element to a file): Check /tmp/output.txt")



## `foreachPartition` Function

In [0]:
# 29. foreachPartition
print("\n### 29. foreachPartition ###")
print("Description: Applies a function f to each partition of this RDD.")

# 01 Example: Print each partition
print("01 foreachPartition example (print each partition):")
rdd.foreachPartition(lambda x: print(list(x)))

# example_Example: Save each partition to a file
# Note: For demonstration purposes, assuming a function save_partition_to_file is defined
def save_partition_to_file(iterator):
    with open('/tmp/partition_output.txt', 'a') as f:
        for record in iterator:
            f.write(f"{record}\n")

rdd.foreachPartition(save_partition_to_file)
print("example_ foreachPartition example (save each partition to a file): Check /tmp/partition_output.txt")



## `aggregate` Function

In [0]:
# 30. aggregate
print("\n### 30. aggregate ###")
print("Description: Aggregate the elements of each partition, and then the results for all the partitions, using given combine functions and a neutral 'zero value'.")

# 01 Example: Aggregate sum of elements
simple_aggregate = rdd.aggregate(0, lambda x, y: x + y, lambda x, y: x + y)
print("01 aggregate example (sum of elements):", simple_aggregate)

# example_Example: Aggregate min and max of elements
example__aggregate = large_data.aggregate((float('inf'), float('-inf')),
                                         lambda acc, val: (min(acc[0], val), max(acc[1], val)),
                                         lambda acc1, acc2: (min(acc1[0], acc2[0]), max(acc1[1], acc2[1])))
print("example_ aggregate example (min and max of elements):", example__aggregate)



## `fold` Function

In [0]:
# 31. fold
print("\n### 31. fold ###")
print("Description: Aggregate the elements of each partition, and then the results for all the partitions, using a given associative function and a neutral 'zero value'.")

# 01 Example: Fold sum of elements
simple_fold = rdd.fold(0, lambda x, y: x + y)
print("01 fold example (sum of elements):", simple_fold)

# example_Example: Fold min and max of elements
example__fold = large_data.fold((float('inf'), float('-inf')),
                               lambda acc, val: (min(acc[0], val), max(acc[1], val)))
print("example_ fold example (min and max of elements):", example__fold)

