In [1]:
import pyspark


In [2]:
# Create the Spark entry points: `sc` (SparkContext) and `spark` (SparkSession).
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Stop a context left over from an earlier run of this cell so that a fresh
# one can be created below. On a fresh kernel `sc` is not defined yet; that
# NameError is the only failure deliberately ignored. A bare `except:` would
# also hide KeyboardInterrupt and genuine shutdown errors.
try:
    sc.stop()
except NameError:
    pass

sc = SparkContext()
spark = SparkSession(sparkContext=sc)

## map

In [3]:
# Load the CSV as an RDD of raw text lines (one string per line of the file).
rdd = sc.textFile('mtcars.csv')

# Preview the first three lines, header included.
rdd.take(3)

['model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4']

In [6]:
# The header is the first line of the file. `first()` fetches just that one
# element; `collect()[0]` would ship the entire RDD to the driver only to
# throw away everything but the first line.
header = rdd.first()
header

'model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb'

In [12]:
# Keep every line except the header row.
data_rdd = rdd.filter(lambda line: line != header)

# Preview the first two data lines.
data_rdd.take(2)

['Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4']

In [13]:
# Split each CSV line into fields and key the row by the model name:
#   'Mazda RX4,21,6,...'  ->  ('Mazda RX4', ['21', '6', ...])
#
# The pipeline is rebuilt from `rdd` and `header` instead of re-mapping
# `data_rdd` onto itself. The self-referential form only works once: on a
# second run the elements are already tuples and `.split(',')` fails.
data_rdd = (
    rdd.filter(lambda line: line != header)
    .map(lambda line: line.split(','))
    .map(lambda fields: (fields[0], fields[1:]))
)
data_rdd.take(3)

[('Mazda RX4',
  ['21', '6', '160', '110', '3.9', '2.62', '16.46', '0', '1', '4', '4']),
 ('Mazda RX4 Wag',
  ['21', '6', '160', '110', '3.9', '2.875', '17.02', '0', '1', '4', '4']),
 ('Datsun 710',
  ['22.8', '4', '108', '93', '3.85', '2.32', '18.61', '1', '1', '4', '1'])]

In [14]:
# Convert the string fields of every row to floats; the model name (the key)
# is left untouched.
data_rdd_2 = data_rdd.map(
    lambda row: (row[0], [float(field) for field in row[1]])
)
data_rdd_2.take(2)

[('Mazda RX4',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]),
 ('Mazda RX4 Wag',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.875, 17.02, 0.0, 1.0, 4.0, 4.0])]

## mapValues

In [16]:
import numpy as np

# `mapValues` applies the function to the value of each (key, value) pair and
# leaves the key as is; here each row's list of numbers becomes its mean.
rdd_mapValues = data_rdd_2.mapValues(lambda values: np.mean(values))
rdd_mapValues.take(3)

[('Mazda RX4', 29.90727272727273),
 ('Mazda RX4 Wag', 29.98136363636364),
 ('Datsun 710', 23.59818181818182)]

## flatMap

In [18]:
# Example data: an RDD whose elements are tuples of different lengths.
x = [
    ('a', 'b', 'c'),
    ('a', 'a'),
    ('c', 'c', 'c', 'd'),
]
exp_rdd = sc.parallelize(x)
exp_rdd.collect()

[('a', 'b', 'c'), ('a', 'a'), ('c', 'c', 'c', 'd')]

In [20]:
# `flatMap` with the identity function flattens one level: every item of
# every tuple becomes its own element of the resulting RDD.
flatmap_exp_rdd = exp_rdd.flatMap(lambda items: items)
flatmap_exp_rdd.collect()

['a', 'b', 'c', 'a', 'a', 'c', 'c', 'c', 'd']

## flatMapValues

In [22]:
# Example data: [key, (three values)] pairs.
my_data = [[1, (23, 28, 32)], [2, (18, 29, 31)], [3, (34, 21, 18)]]

exp2_rdd = sc.parallelize(my_data)
exp2_rdd.collect()

[[1, (23, 28, 32)], [2, (18, 29, 31)], [3, (34, 21, 18)]]

In [25]:
# Label the values of each key with 'A', 'B', 'C' and emit one
# (key, (label, value)) pair per labelled value.
flatMapValues_exp_rdd = exp2_rdd.flatMapValues(
    lambda values: [(label, value) for label, value in zip('ABC', values)]
)
flatMapValues_exp_rdd.collect()

[(1, ('A', 23)),
 (1, ('B', 28)),
 (1, ('C', 32)),
 (2, ('A', 18)),
 (2, ('B', 29)),
 (2, ('C', 31)),
 (3, ('A', 34)),
 (3, ('B', 21)),
 (3, ('C', 18))]