In [1]:
import pyspark
# getOrCreate() returns the kernel's active SparkContext when one already exists,
# so re-running this cell no longer fails with
# "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel it starts a new context with master 'local[*]', as before.
# Note: if a context is already active, the conf passed here is not applied to it.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

## RDDs

In [2]:
# Distribute four integers over four partitions, then add one to every element.
rdd = (
    sc.parallelize([1, 2, 3, 4], numSlices=4)
    .map(lambda n: n + 1)
)

In [3]:
# Run the pipeline and bring every element back as a Python list
# (shown below as the cell's output).
rdd.collect()

[2, 3, 4, 5]

## Named RDDs

In [4]:
# Same shape of pipeline as the unnamed example, but each RDD in the chain is
# given a name via setName(): 'createRDD' for the source, 'add 1' for the map.
named_rdd = (
    sc.parallelize([3, 4, 5, 6], numSlices=4)
    .setName('createRDD')
    .map(lambda n: n + 1)
    .setName('add 1')
)

In [5]:
# Run the named pipeline and return its elements as a Python list.
named_rdd.collect()

[4, 5, 6, 7]

## Two Stages

In [6]:
# Turn each number into a (number, 1) pair, then sum the 1s per key.
# reduceByKey groups values by key, which is what splits the job into the
# two stages this section is named after.
two_stages = (
    sc.parallelize([9, 10, 11, 12], numSlices=4)
    .setName('createRDD')
    .map(lambda n: (n, 1))
    .setName('make key value')
    .reduceByKey(lambda left, right: left + right)
    .setName('reduceSum')
)

In [7]:
# Run the map + reduceByKey pipeline; every key is unique here, so each count is 1.
# The order of the returned pairs is not the input order (see the output below).
two_stages.collect()

[(12, 1), (9, 1), (10, 1), (11, 1)]

## Repeated Computation - No Cache

In [8]:
def slowdown(value, delay_seconds=5):
    """Return ``value`` unchanged after sleeping for ``delay_seconds`` seconds.

    Used as an artificially slow map function so that repeated computation
    is noticeable when the pipeline is executed.

    Args:
        value: Any object; it is passed through untouched.
        delay_seconds: How long to sleep before returning. Defaults to 5,
            so ``rdd.map(slowdown)`` keeps the original 5-second delay.

    Returns:
        The same ``value`` that was passed in.
    """
    # Local import keeps the function self-contained: the notebook has no
    # top-level ``import time`` for it to rely on.
    import time
    time.sleep(delay_seconds)
    return value

# Uncached lineage: a single slow map ('slowComp') feeds two separate branches,
# which are then joined on their keys.
initialRDD = (
    sc.parallelize([9, 10, 11, 12], numSlices=4)
    .setName('createRDD')
)
slowComp = (
    initialRDD
    .map(slowdown)
    .setName('slowComp')
)
# Branch 1 keys on the value itself; branch 2 keys on value + 1, so only the
# overlapping keys survive the join.
remap1 = (
    slowComp
    .map(lambda item: (item, 1))
    .setName('remap 1')
)
remap2 = (
    slowComp
    .map(lambda item: (item + 1, 1))
    .setName('remap 2')
)
joined = (
    remap1
    .join(remap2)
    .setName('joined')
)

In [9]:
# Run the uncached join. slowComp feeds both remap1 and remap2 and is not cached,
# so the slow map is expected to be evaluated once per branch.
joined.collect()

[(10, (1, 1)), (11, (1, 1)), (12, (1, 1))]

## Repeated Computation - Cache

In [10]:
# Same two-branch lineage as the uncached example, except that the slow map is
# marked with cache() before the branches split off from it.
initialRDD = (
    sc.parallelize([21, 22, 23, 24], numSlices=4)
    .setName('createRDD')
)
slowComp = (
    initialRDD
    .map(slowdown)
    .cache()
    .setName('slowComp')
)
# Branch 1 keys on the value itself; branch 2 keys on value + 1, so only the
# overlapping keys survive the join.
remap1 = (
    slowComp
    .map(lambda item: (item, 1))
    .setName('remap 1')
)
remap2 = (
    slowComp
    .map(lambda item: (item + 1, 1))
    .setName('remap 2')
)
joined = (
    remap1
    .join(remap2)
    .setName('joined')
)

In [11]:
# Run the join again, this time with slowComp marked cache(); the intent is that
# both branches reuse the slow map's results instead of recomputing them.
joined.collect()

[(24, (1, 1)), (22, (1, 1)), (23, (1, 1))]

In [12]:
# Shut down the SparkContext created in the first cell; `sc` cannot be used
# for further jobs after this.
sc.stop()