In [1]:
%%capture
!pip install pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) a local Spark session running on one worker thread;
# every RDD in this notebook is built from its SparkContext.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName("SparkTestRDD")
    .getOrCreate()
)

## Basic Information

In [4]:
# Sample numeric dataset: the consecutive integers 1 through 12.
data = list(range(1, 13))

In [5]:
# Turn the local Python list into an RDD via the session's SparkContext.
rdd=spark.sparkContext.parallelize(data)

In [6]:
# Materialize the whole RDD as a Python list (fine here: only 12 elements).
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [7]:
# Descriptive statistics of the numeric RDD. Each call below is its own
# Spark action over `rdd`.
print(f'Count RDD instances - {rdd.count()}')
print(f'Sum of RDD elements - {rdd.sum()}')
print(f'Maximum value of RDD elements - {rdd.max()}')
print(f'Minimum value of RDD elements - {rdd.min()}')
print(f'Mean value of RDD elements - {rdd.mean()}')
# Label corrected from "Standard value": stdev() is the (population)
# standard deviation, i.e. the square root of variance() printed next.
print(f'Standard deviation of RDD elements - {rdd.stdev()}')
print(f'Compute variance of RDD elements - {rdd.variance()}')
# histogram(2) splits [min, max] into 2 equal-width buckets and returns
# (bucket boundaries, element count per bucket).
print(f'Compute histogram by bins - {rdd.histogram(2)}')
# stats() bundles count, mean, stdev, max and min into one summary object.
print(f'Summary statistics - {rdd.stats()}')

Count RDD instances - 12
Sum of RDD elements - 78
Maximum value of RDD elements - 12
Minimum value of RDD elements - 1
Mean value of RDD elements - 6.5
Standard value of RDD elements - 3.452052529534663
Compute variance of RDD elements - 11.916666666666666
Compute histogram by bins - ([1.0, 6.5, 12], [6, 6])
Summary statistics - (count: 12, mean: 6.5, stdev: 3.452052529534663, max: 12.0, min: 1.0)


In [8]:
# Sample (key, value) dataset; keys repeat so the by-key operations have
# something to aggregate.
data2 = [
    ('a', 7),
    ('a', 2),
    ('b', 2),
    ('b', 4),
    ('c', 1),
    ('c', 2),
    ('c', 3),
    ('c', 4),
]

In [9]:
# Pair RDD built from the (key, value) tuples in data2.
rdd2=spark.sparkContext.parallelize(data2)

In [10]:
# Show every (key, value) pair of the pair RDD as a Python list.
rdd2.collect()

[('a', 7),
 ('a', 2),
 ('b', 2),
 ('b', 4),
 ('c', 1),
 ('c', 2),
 ('c', 3),
 ('c', 4)]

In [11]:
# countByKey(): how many elements carry each key.
print(f'Count RDD instances by key - {rdd2.countByKey()}')
# countByValue(): occurrences of each distinct element; for a pair RDD the
# "value" being counted is the whole (key, value) tuple.
print(f'Count RDD instances by value - {rdd2.countByValue()}')
# collectAsMap(): builds a plain dict, so duplicate keys collapse to a single
# entry (the output keeps only one value per key, e.g. 'a' -> 2).
print(f'Return (key, value) pairs as a dictionary - {rdd2.collectAsMap()}')

Count RDD instances by key - defaultdict(<class 'int'>, {'a': 2, 'b': 2, 'c': 4})
Count RDD instances by value - defaultdict(<class 'int'>, {('a', 7): 1, ('a', 2): 1, ('b', 2): 1, ('b', 4): 1, ('c', 1): 1, ('c', 2): 1, ('c', 3): 1, ('c', 4): 1})
Return (key, value) pairs as a dictionary - {'a': 2, 'b': 4, 'c': 4}


## Selecting Data

In [12]:
# Common ways of pulling elements out of an RDD.
print(f'Take first 2 RDD elements - {rdd.take(2)}')
print(f'Take first RDD element - {rdd.first()}')
# top(2) returns the two largest elements, largest first.
print(f'Take top 2 RDD elements - {rdd.top(2)}')
print(f'Filter the RDD - {rdd.filter(lambda x: x>=5).collect()}')
# keys() keeps one key per element, so repeated keys appear repeatedly.
print(f'Return RDD keys - {rdd2.keys().collect()}')
# Added the missing " - " separator so this line matches the others.
# distinct() compares whole elements, i.e. complete (key, value) tuples here.
print(f'Return distinct RDD values - {rdd2.distinct().collect()}')

Take first 2 RDD elements - [1, 2]
Take first RDD element - 1
Take top 2 RDD elements - [12, 11]
Filter the RDD - [5, 6, 7, 8, 9, 10, 11, 12]
Return RDD keys - ['a', 'a', 'b', 'b', 'c', 'c', 'c', 'c']
Return distinct RDD values[('a', 7), ('a', 2), ('b', 2), ('b', 4), ('c', 1), ('c', 2), ('c', 3), ('c', 4)]


## Applying Functions

In [13]:
# map() transforms every element independently; here each number is squared.
rdd.map(lambda n: n ** 2).collect()

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144]

In [14]:
# Extend each (key, value) tuple with two derived fields:
# the squared value and the key repeated twice.
rdd2.map(lambda pair: (*pair, pair[1] ** 2, pair[0] * 2)).collect()

[('a', 7, 49, 'aa'),
 ('a', 2, 4, 'aa'),
 ('b', 2, 4, 'bb'),
 ('b', 4, 16, 'bb'),
 ('c', 1, 1, 'cc'),
 ('c', 2, 4, 'cc'),
 ('c', 3, 9, 'cc'),
 ('c', 4, 16, 'cc')]

In [15]:
# Each key maps to a list of values, used to demonstrate flatMapValues.
data3 = [
    ("a", ["x", "y", "z"]),
    ("b", ["p", "r"]),
]

In [16]:
# Pair RDD whose values are lists (see data3).
rdd3 = spark.sparkContext.parallelize(data3)

In [17]:
# Apply a flatMap function to the value of each (key, value) pair, keeping the
# key unchanged: every item of the value list becomes its own (key, item) pair.
rdd3.flatMapValues(lambda items: items).collect()

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

## Sort

In [18]:
# Sort the RDD using a key function: here, by the value (second field) of each pair.
rdd2.sortBy(lambda pair: pair[1]).collect()

[('c', 1),
 ('a', 2),
 ('b', 2),
 ('c', 2),
 ('c', 3),
 ('b', 4),
 ('c', 4),
 ('a', 7)]

In [19]:
# Sort the (key, value) RDD by key; ascending=False gives descending key order.
rdd2.sortByKey(ascending=False).collect()

[('c', 1),
 ('c', 2),
 ('c', 3),
 ('c', 4),
 ('b', 2),
 ('b', 4),
 ('a', 7),
 ('a', 2)]

## Reshaping Data

In [20]:
# Combine all values that share a key with the given binary function
# (addition here, so the result is a per-key sum).
rdd2.reduceByKey(lambda left, right: left + right).collect()

[('a', 9), ('b', 6), ('c', 10)]

In [21]:
# Fold every element of the RDD into a single value with a binary function
# (addition here, so the result is the total).
rdd.reduce(lambda left, right: left + right)

78

In [22]:
# Group elements by the result of a function (remainder mod 2, i.e. odd/even);
# mapValues(list) converts each group into a plain list for display.
rdd.groupBy(lambda n: n % 2).mapValues(list).collect()

[(1, [1, 3, 5, 7, 9, 11]), (0, [2, 4, 6, 8, 10, 12])]

In [23]:
# Group the pair RDD by key, collecting all values of a key together;
# mapValues(list) converts each group into a plain list for display.
rdd2.groupByKey().mapValues(list).collect()

[('a', [7, 2]), ('b', [2, 4]), ('c', [1, 2, 3, 4])]

## Mathematical Operations

In [24]:
# Integers 1 through 6: a subset of `data`, used for the set-style operations.
data4 = list(range(1, 7))

In [25]:
# RDD holding the values of data4.
rdd4 = spark.sparkContext.parallelize(data4)

In [26]:
# Elements of rdd that do not occur in rdd4 (the result is not sorted).
rdd.subtract(rdd4).collect()

[8, 10, 12, 7, 9, 11]

In [27]:
# Elements present in both rdd and rdd4 (the result is not sorted).
rdd.intersection(rdd4).collect()

[2, 4, 6, 1, 3, 5]

In [28]:
# Concatenate both RDDs; duplicates are kept, so 1..6 appear twice in the result.
rdd.union(rdd4).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6]

## Join

In [29]:
# Two pair datasets sharing the keys 'a', 'b' and 'c'; note that 'a' occurs
# twice in data5, which matters for the join below.
data5 = [
    ('a', 10),
    ('a', 11),
    ('b', 20),
    ('c', 30),
]
data6 = [
    ('a', 12),
    ('b', 21),
    ('c', 31),
]

In [30]:
# Build one pair RDD per dataset, in the same order as the source lists.
rdd5, rdd6 = map(spark.sparkContext.parallelize, (data5, data6))

In [31]:
# Join on key: each result is (key, (value_from_rdd5, value_from_rdd6)).
# A key occurring twice on one side ('a' in rdd5) produces two joined rows.
rdd5.join(rdd6).collect()

[('b', (20, 21)), ('c', (30, 31)), ('a', (10, 12)), ('a', (11, 12))]