# RDD - Resilient Distributed Dataset

In [21]:
import os
# Environment variables read by PySpark, set in a single call.
# NOTE(review): SPARK_HOME is an absolute path specific to one machine — adjust it
# when running this notebook elsewhere.
os.environ.update({
    'SPARK_HOME': r"C:\Users\Dani\Documents\Python Scripts\Spark",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [22]:
from pyspark.sql import SparkSession

In [24]:
# Start (or reuse) the Spark session that the rest of the notebook works with
spark = (
    SparkSession.builder
    .appName("My4App")
    .getOrCreate()
)

In [25]:
# Turn a small local list (1..5) into an RDD
numbers = list(range(1, 6))
rdd = spark.sparkContext.parallelize(numbers)

In [26]:
# Evaluating the RDD object displays a description of the RDD, not its elements
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:287

In [27]:
# Collect action: return all elements of the RDD as a Python list
rdd.collect()

[1, 2, 3, 4, 5]

In [28]:
# Build an RDD of car records; each tuple is (name, year, engine size in litres).
# Note: this rebinds `rdd`, replacing the numbers RDD created earlier.
data = [
    ("BMW", 2024, 4.5),
    ("Audi", 2023, 2.0),
    ("BMW", 2021, 3.0),
    ("Tesla", 2022, 0.0),
]
rdd = spark.sparkContext.parallelize(data)


In [29]:
# Collect action: retrieve all elements of the RDD as a list and print them
print("All elements of the rdd: ", rdd.collect())

All elements of the rdd:  [('BMW', 2024, 4.5), ('Audi', 2023, 2.0), ('BMW', 2021, 3.0), ('Tesla', 2022, 0.0)]


## RDD Operations: Actions

In [30]:
# Count action: count the number of elements in the RDD
# (one element per car tuple, so 4 for the data defined above)
count = rdd.count()
print("The total number of elements in rdd: ", count)

The total number of elements in rdd:  4


In [31]:
# First action: retrieve only the first element of the RDD (a single tuple, not a list)
first_element = rdd.first()
print("The first element of the rdd: ", first_element)

The first element of the rdd:  ('BMW', 2024, 4.5)


In [32]:
# Take action: retrieve the first n elements of the RDD as a list (here n = 2)
taken_elements = rdd.take(2)
print("The first two elements of the rdd: ", taken_elements)

The first two elements of the rdd:  [('BMW', 2024, 4.5), ('Audi', 2023, 2.0)]


In [34]:
# Foreach action: apply a function to every element of the RDD.
# The function runs inside Spark's worker processes, so whatever it prints goes to the
# workers' stdout (the console or executor logs) and does not appear under this cell.
rdd.foreach(print)

# To see the elements in the notebook, bring them to the driver and print them there.
# collect() is fine for this tiny RDD; prefer take(n) when the data is large.
print(*rdd.collect(), sep="\n")

## RDD Operations: Transformations

In [43]:
# Map transformation: upper-case the name and turn year / engine size into
# labelled strings ("<year> year", "<engine>l")
mapped_rdd = rdd.map(
    lambda car: (car[0].upper(), f"{car[1]} year", f"{car[2]}l")
)

In [44]:
# Inspect the first two mapped records
mapped_rdd.take(2)

[('BMW', '2024 year', '4.5l'), ('AUDI', '2023 year', '2.0l')]

In [None]:
# Filter transformation: keep only the records whose engine size is greater than 2.5 l
filtered_rdd = rdd.filter(lambda car: car[2] > 2.5)
filtered_rdd.collect()

[('BMW', 2024, 4.5), ('BMW', 2021, 3.0)]

In [51]:
# ReduceByKey transformation: total the engine sizes per car name.
# Each record is first projected to a (name, engine) pair, then the values
# sharing the same name are added together.
reduced_rdd = (
    rdd
    .map(lambda car: (car[0], car[2]))
    .reduceByKey(lambda total, engine: total + engine)
)
reduced_rdd.collect()

[('Tesla', 0.0), ('Audi', 2.0), ('BMW', 7.5)]

In [47]:
# SortBy transformation: order the records by year (second field), newest first
sorted_rdd = rdd.sortBy(lambda car: car[1], ascending=False)
sorted_rdd.collect()

[('BMW', 2024, 4.5),
 ('Audi', 2023, 2.0),
 ('Tesla', 2022, 0.0),
 ('BMW', 2021, 3.0)]

## Save RDDs

In [None]:
# Save action: write the RDD out as text.
# Despite the ".txt" name, Spark creates a *directory* called "output.txt" that
# holds one part-file per partition.
import shutil

# saveAsTextFile refuses to write to a path that already exists, so re-running this
# cell would fail. Remove any previous output first (does nothing if it is absent).
# NOTE(review): assumes Spark writes to the local filesystem relative to the
# notebook's working directory — confirm if a different default filesystem is configured.
shutil.rmtree("output.txt", ignore_errors=True)
rdd.saveAsTextFile("output.txt")

In [None]:
# Create an RDD from the text output saved above.
# Each element read back is a line of text (a string), not the original tuple.
rdd_text = spark.sparkContext.textFile("output.txt")
rdd_text.collect()

In [None]:
# Stop the Spark session; a new session must be created before running Spark cells again
spark.stop()