In [1]:
from pyspark.sql import SparkSession
# SparkSession.builder follows the builder pattern: each call (appName, master, config)
# records one setting and returns the builder itself, which is why the calls can be chained.
# getOrCreate() finally hands back the session.
# This cell is slow because Spark has to set up the JVM (and other steps).
spark = (SparkSession.builder.appName("cs544")
         .master("spark://boss:7077")
         .config("spark.executor.memory", "512M")
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/02 02:55:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# The SparkContext gives access to the lower-level RDD API (used below via sc.parallelize)
sc = spark.sparkContext

In [3]:
# Build an ordinary Python list locally, then hand it to Spark to turn into an RDD.
# No partition count is passed here, so Spark picks one itself.
nums = list(range(1_000_000))
rdd = sc.parallelize(nums)

In [6]:
# Two chained TRANSFORMATIONS; each one applies its function to every row of its source RDD
inverses = (
    rdd
    .filter(lambda n: n > 0)    # drop 0 first so the division below cannot divide by zero
    .map(lambda n: 1 / n)       # replace each remaining number with its inverse
)
inverses    # result is still an RDD

PythonRDD[2] at RDD at PythonRDD.scala:53

In [5]:
# ACTION: runs the transformations and returns the first 10 results as a Python list
inverses.take(10)

23/11/02 02:55:50 WARN TaskSetManager: Stage 0 contains a task of very large size (2332 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

[1.0,
 0.5,
 0.3333333333333333,
 0.25,
 0.2,
 0.16666666666666666,
 0.14285714285714285,
 0.125,
 0.1111111111111111,
 0.1]

In [7]:
# ACTION: computes the mean of all elements of the RDD and returns a single number
inverses.mean()

23/11/02 02:56:50 WARN TaskSetManager: Stage 1 contains a task of very large size (2332 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1.4392740115605892e-05

In [8]:
# We're getting the 'large task' warning because the partitions are very big:
# 1 task works on 1 partition, so a small number of partitions means a few very large tasks
rdd.getNumPartitions()

2

In [9]:
# Spark picked 2 partitions on its own; numSlices lets us choose the partition count ourselves
rdd = sc.parallelize(nums, numSlices=10)
rdd.getNumPartitions()

10

In [10]:
# Smaller partitions mean smaller tasks, so the 'large task' warning no longer appears
inverses = (
    rdd
    .filter(lambda n: n > 0)
    .map(lambda n: 1 / n)
)
inverses.mean()

                                                                                

1.4392740115605814e-05

In [None]:
# When Spark displays progress as: (4 + 2) / 10
# 4 tasks have completed
# 2 tasks are currently running (limited by the number of available cores)
# 10 tasks in total for the stage (one task per partition)

## Caching

In [13]:
# sample() is a TRANSFORMATION.
# The seed alone does not make the result fully deterministic: it also depends on the partitioning.
sample = rdd.sample(
    withReplacement=True,
    fraction=0.1,
    seed=544,
)

In [14]:
import time

In [15]:
# Baseline timing of the mean, before any caching is requested.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# differences of time.time() can be distorted if the system clock is adjusted mid-run.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0    # elapsed seconds



498504.761576394


                                                                                

3.7317185401916504

In [16]:
# Mark the RDD for caching; this is lazy, so the data is only stored when the next action computes it
sample.cache()

PythonRDD[7] at RDD at PythonRDD.scala:53

In [17]:
# The first time, the caching actually makes things slower (extra work to fill the cache),
# so if you're only computing something once, caching just adds overhead and is not worth it.
# perf_counter() is used because it is a monotonic clock intended for measuring intervals.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0    # elapsed seconds



498504.761576394


                                                                                

6.3785624504089355

In [18]:
# After caching, the mean calculation reads from the cache and is faster (no need to resample).
# perf_counter() is used because it is a monotonic clock intended for measuring intervals.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0    # elapsed seconds



498504.761576394


                                                                                

3.223809242248535

In [19]:
# Not much faster though... that's because the sample is still spread across 10 partitions
# (Spark uses a narrow transformation for sampling).
# To get truly better performance, tell Spark to put the sample into just 1 partition (therefore 1 task).
sample = (
    rdd
    .sample(withReplacement=True, fraction=0.1, seed=544)
    .repartition(1)
    .cache()
)

In [20]:
# Still slow on the first run: the repartition and the cache fill both happen here.
# perf_counter() is used because it is a monotonic clock intended for measuring intervals.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0    # elapsed seconds

[Stage 7:>                                                          (0 + 1) / 1]

498504.76157639234


                                                                                

5.6407365798950195

In [22]:
# But now, after repartitioning and caching, the mean calculation is much faster! (6 seconds -> 1 second)
# perf_counter() is used because it is a monotonic clock intended for measuring intervals.
t0 = time.perf_counter()
print(sample.mean())
t1 = time.perf_counter()
t1 - t0    # elapsed seconds

[Stage 11:>                                                         (0 + 1) / 1]

498504.76157639234


                                                                                

1.106959342956543