In [1]:
from pyspark import SparkConf, SparkContext

# Initialize SparkContext.
# getOrCreate() returns the context that is already active in this kernel, or
# builds a new one from the given conf when none exists. Constructing
# SparkContext(...) directly would raise "Cannot run multiple SparkContexts at
# once" if this cell is re-run while a context is still alive.
# NOTE: when an existing context is reused, its own master/app name stay in
# effect and the conf passed here is not applied to it.
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local").setAppName("RDD Pipeline Example")
)

In [2]:
# Load dataset: five comma-separated records, one per line of the literal.
# Later cells read field 1 as the customer id and field 2 as the amount; the
# remaining fields look like a transaction id, a category and a date
# (assumed from the values -- confirm against the real data source).
data = """\
1,101,200,Electronics,2025-01-01
2,102,50,Groceries,2025-01-01
3,101,300,Electronics,2025-01-02
4,103,20,Clothing,2025-01-03
5,104,150,Electronics,2025-01-04
""".splitlines()

# Distribute the in-memory list as an RDD of raw text lines.
rdd = sc.parallelize(data)

In [3]:
# Transformation 1: turn every raw CSV line into a list of string fields.
# No type conversion happens here; every field is still a str.
rdd_split = rdd.map(
    lambda record: record.split(",")
)

In [4]:
# Transformation 2: keep only the transactions whose amount (field 2, parsed
# as an integer) is strictly greater than 100.
rdd_filtered = rdd_split.filter(
    lambda fields: int(fields[2]) > 100
)

In [5]:
# Transformation 3: project each record to a (customer_id, amount) pair.
# The customer id (field 1) stays a string; the amount (field 2) becomes an int.
rdd_mapped = rdd_filtered.map(
    lambda fields: (fields[1], int(fields[2]))
)

In [6]:
# Transformation 4: sum the amounts per customer id.
# The input comes from rdd_filtered, so each total only includes transactions
# that passed the "amount > 100" filter, not every purchase by that customer.
rdd_reduced = rdd_mapped.reduceByKey(
    lambda running_total, amount: running_total + amount
)

In [7]:
# Transformation 5: order the (customer_id, total) pairs by total, largest first.
rdd_sorted = rdd_reduced.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [8]:
# Action: Collect the results.
# collect() is the first action in the pipeline: it triggers evaluation of the
# transformations above and returns the sorted (customer_id, total) pairs to
# the driver as a Python list.
results = rdd_sorted.collect()

In [9]:
# Display results: one line per (customer_id, total) pair, in the order
# returned by collect().
for result in results:
    customer_id, total_spent = result
    print(f"Customer ID: {customer_id}, Total Spending: {total_spent}")

# Shut the SparkContext down now that the results are on the driver; RDDs
# created from `sc` cannot be used after this point.
sc.stop()

Customer ID: 101, Total Spending: 500
Customer ID: 104, Total Spending: 150
