In [1]:
import glob
import os
import shutil

# Directory that contains the unpacked Spark distribution. Replace "../../../../"
# with the absolute path to your home directory. A relative value is resolved
# against the current working directory right here, so SPARK_HOME stays valid
# even if the working directory changes later in the session.
home_directory = os.path.abspath("../../../../")
os.environ["SPARK_HOME"] = os.path.join(home_directory, "spark-3.3.2-bin-hadoop3")
spark_python = os.path.join(os.environ["SPARK_HOME"], "python")

# The py4j archive name carries a version number, so look it up with a glob
# instead of keeping the literal "py4j-*.zip" pattern as if it were a real path.
py4j_pattern = os.path.join(spark_python, "lib", "py4j-*.zip")
py4j_matches = sorted(glob.glob(py4j_pattern))
# Fall back to the unexpanded pattern when nothing matches (for example when
# Spark is not unpacked at that location), so py4j_path is always a string.
py4j_path = py4j_matches[-1] if py4j_matches else py4j_pattern


In [2]:
import findspark

# findspark.init() has to run before the pyspark import below: it is what
# makes the pyspark package importable in this kernel, so the order matters.
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new session
# for the application named "spark_with_RDDs".
spark = (
    SparkSession.builder
    .appName("spark_with_RDDs")
    .getOrCreate()
)
# Bare expression: the notebook displays the session summary.
spark

24/02/23 11:49:02 WARN Utils: Your hostname, Endiesworld resolves to a loopback address: 127.0.1.1; using 172.22.195.180 instead (on interface eth0)
24/02/23 11:49:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/23 11:49:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## How to create RDDs

In [3]:
# Turn a small local list of integers (1 through 5) into an RDD.
numbers = list(range(1, 6))
rdd = spark.sparkContext.parallelize(numbers)

In [4]:
# Collect action: return every element of the RDD as a Python list.
# The bare expression on the last line makes the notebook display the result.
rdd.collect()

                                                                                

[1, 2, 3, 4, 5]

In [5]:
# Build an RDD whose elements are (name, age) tuples; "Alice" appears twice.
# This rebinds `rdd`, replacing the RDD of integers created earlier.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
    ("Alice", 40),
]
rdd = spark.sparkContext.parallelize(data)

In [6]:
# Action: collect() returns every element of the RDD as a Python list,
# which is printed after the label.
print(
    "All elements of the rdd: ",
    rdd.collect(),
)

All elements of the rdd:  [('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]


## RDD Operations: Actions

In [7]:
# Action: count() returns how many elements the RDD holds.
count = rdd.count()
print(
    "The total number of elements in rdd: ",
    count,
)

[Stage 2:>                                                          (0 + 8) / 8]

The total number of elements in rdd:  4


                                                                                

In [8]:
# Action: first() returns only the first element of the RDD.
first_element = rdd.first()
print(
    "The first element of the rdd: ",
    first_element,
)

The first element of the rdd:  ('Alice', 25)


In [9]:
# Action: take(n) returns the first n elements of the RDD as a list (n = 2 here).
taken_elements = rdd.take(2)
print(
    "The first two elements of the rdd: ",
    taken_elements,
)

The first two elements of the rdd:  [('Alice', 25), ('Bob', 30)]


In [10]:
# Action: foreach() calls the given function once per element, for its side
# effect only. `print` is passed directly; no wrapping lambda is needed.
# The elements are not necessarily printed in the order they were defined.
rdd.foreach(print)

('Bob', 30)
('Alice', 25)
('Alice', 40)
('Charlie', 35)


## RDD Operations: Transformations

In [11]:
# Transformation: map() builds a new RDD in which each (name, age) tuple has
# its name upper-cased and its age left unchanged.
mapped_rdd = rdd.map(
    lambda person: (person[0].upper(), person[1])
)

In [12]:
# Collect the mapped RDD into a list and print it; the names are upper-case.
result = mapped_rdd.collect()
print("rdd with uppercase name: ", result)

rdd with uppercease name:  [('ALICE', 25), ('BOB', 30), ('CHARLIE', 35), ('ALICE', 40)]


In [13]:
# Transformation: filter() keeps only the (name, age) tuples whose age
# (index 1) is strictly greater than 30.
filtered_rdd = rdd.filter(
    lambda person: person[1] > 30
)
filtered_rdd.collect()

[('Charlie', 35), ('Alice', 40)]

In [14]:
# Transformation: reduceByKey() merges the values that share a key (the name)
# by adding them together, giving the total age per name.
reduced_rdd = rdd.reduceByKey(
    lambda total, age: total + age
)
reduced_rdd.collect()

                                                                                

[('Charlie', 35), ('Bob', 30), ('Alice', 65)]

In [15]:
# Transformation: sortBy() orders the tuples by age (index 1);
# ascending=False puts the largest age first.
sorted_rdd = rdd.sortBy(
    lambda person: person[1],
    ascending=False,
)
sorted_rdd.collect()

[('Alice', 40), ('Charlie', 35), ('Bob', 30), ('Alice', 25)]

## Save RDDs to a text file and read RDDs from a text file

In [16]:
# Save action: write the RDD out as text.
# saveAsTextFile() creates a *directory* named "output.txt" and fails if that
# path already exists, so re-running this cell used to raise. Remove any
# previous output first so the notebook survives Restart & Run All.
# NOTE(review): this cleanup assumes the path is on the local filesystem
# (Spark running in local mode); confirm before using it against HDFS or
# another default filesystem.
if os.path.isdir("output.txt"):
    shutil.rmtree("output.txt")
rdd.saveAsTextFile("output.txt")

In [17]:
# Create an RDD from the text output written by the previous cell.
# Each element is one line of text, so the records come back as plain
# strings such as "('Alice', 25)" rather than as (name, age) tuples.
rdd_text = spark.sparkContext.textFile("output.txt")
rdd_text.collect()

["('Charlie', 35)", "('Alice', 40)", "('Alice', 25)", "('Bob', 30)"]

In [18]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()