In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session for this notebook, or reuse one that already exists
# (getOrCreate), under the application name "My-App".
spark = (
    SparkSession.builder
    .appName("My-App")
    .getOrCreate()
)

In [3]:
# Keep a handle on the session's SparkContext; the RDD examples below
# create their RDDs through sc.parallelize(...).
sc = spark.sparkContext

## Basics: Adding Values by Key

In [4]:
# Local (driver-side) sequence of the integers 0 through 9.
nums1 = range(0, 10)

In [5]:
# Create an RDD from the local `nums1` sequence via the SparkContext.
nums1Rdd = sc.parallelize(nums1)

In [6]:
# Round every number down to the nearest multiple of 3 (0, 3, 6, 9) so that
# several inputs share a key in the next steps.
# NOTE: despite its name, `evens` does not hold even numbers; the name is
# kept because the following cell refers to it.
evens = nums1Rdd.map(lambda n: n - n % 3)

In [7]:
# Pair each value with a constant 10, giving (key, 10) tuples that can be
# aggregated per key afterwards.
nums1Tuple = evens.map(lambda key: (key, 10))

In [8]:
# Materialize all (key, 10) pairs as a local list so the cell displays them.
nums1Tuple.collect()

[(0, 10),
 (0, 10),
 (0, 10),
 (3, 10),
 (3, 10),
 (3, 10),
 (6, 10),
 (6, 10),
 (6, 10),
 (9, 10)]

In [9]:
# Sum the values of all pairs that share a key, then collect the per-key
# totals to the driver for display.
(
    nums1Tuple
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[(0, 30), (9, 10), (6, 30), (3, 30)]

## Word Counts

In [10]:
# The whole song is stored as ONE multi-line string inside a one-element
# list; it is split into individual words later with flatMap.
lyrics = [
    """Look at the stars
Look how they shine for you
And everything you do
Yeah they were all yellow
I came along
I wrote a song for you
And all the things you do
And it was called "Yellow"
So then I took my turn
Oh what a thing to have done
And it was all yellow
Your skin
Oh yeah, your skin and bones
Turn into something beautiful
You know, you know I love you so
You know I love you so
I swam across
I jumped across for you
Oh what a thing to do
'Cause you were all yellow
I drew a line
I drew a line for you
Oh what a thing to do
And it was all yellow
Your skin
Oh yeah your skin and bones
Turn into something beautiful
And you know
For you I'd bleed myself dry
For you I'd bleed myself dry
It's true
Look how they shine for you
Look how they shine for you
Look how they shine for
Look how they shine for you
Look how they shine for you
Look how they shine
Look at the stars
Look how they shine for you
And all the things that you do"""
]

In [11]:
# Create an RDD from the one-element `lyrics` list, so the RDD holds the
# entire song as a single record.
lyricsRdd = sc.parallelize(lyrics)

In [12]:
import re

# Lower-case the song and split it into one RDD element per word.
# Separators: "; ", ", ", a literal "*", a newline, or a single space.
# The pattern is a raw string on purpose: in a normal string literal "\*" is
# an invalid escape sequence, which newer Python versions warn about.
# The regex itself is unchanged, so the resulting tokens are the same.
# NOTE: quote and apostrophe characters are not separators, so tokens such as
# '"yellow"' and "'cause" keep their punctuation and are counted separately.
lyricsFlat = lyricsRdd.flatMap(lambda x: re.split(r'; |, |\*|\n| ', x.lower()))

In [13]:
# Emit a (word, 1) pair for every word occurrence so the counts can be
# summed per word.
lyricsTuple = lyricsFlat.map(lambda word: (word, 1))

In [14]:
# Add up the 1s of identical words, yielding one (word, count) pair per
# distinct word.
WordCounts = lyricsTuple.reduceByKey(lambda total, one: total + one)

In [15]:
# Sort by negated count (i.e. most frequent first) and show the first ten
# (word, count) pairs.
(
    WordCounts
    .sortBy(lambda pair: -pair[1])
    .take(10)
)

[('you', 21),
 ('for', 12),
 ('look', 10),
 ('i', 9),
 ('and', 9),
 ('they', 9),
 ('how', 8),
 ('shine', 8),
 ('a', 6),
 ('all', 6)]

From the Spark website:
> **Note**: If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using reduceByKey or aggregateByKey will yield much better performance.

## Reference

  - <https://spark.apache.org/docs/2.1.1/programming-guide.html#rdd-operations>