In [1]:
# findspark locates the local Spark installation and makes pyspark importable,
# so it runs before the first `import pyspark` below.
import findspark
findspark.init()

In [130]:
# Stop a session left over from an earlier run so the next cell starts from a
# clean SparkSession. Guarded because on a fresh kernel (Restart & Run All)
# `spark` is not defined yet and a bare spark.stop() would raise NameError.
if "spark" in globals():
    spark.stop()

In [5]:
from pyspark.sql import SparkSession
# master("local[4]") runs Spark locally with 4 worker threads, which is why
# parallelize() produces 4 partitions further down.
# NOTE(review): the earlier remark about "12 executors by default" presumably
# described local[*] on a 12-core machine — confirm.
spark=SparkSession.builder.appName("RDD").master("local[4]").getOrCreate()
sc=spark.sparkContext

# Loading data

### Parallelized Collections

In [119]:
# Pair RDD of (key, value) tuples; key 'a' appears twice.
rdd=sc.parallelize([("a",7), ("a",2), ("b",2)])
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

In [78]:
# Second pair RDD; it contains the element ('b', 1) twice.
rdd2=sc.parallelize([('a',2), ('d',1), ('b',1), ('b',1)])
rdd2.collect()

[('a', 2), ('d', 1), ('b', 1), ('b', 1)]

In [10]:
# RDD of the integers 0..99; take(3) returns only the first three elements.
rdd3=sc.parallelize(range(100))
rdd3.take(3)

[0, 1, 2]

In [11]:
# Pair RDD whose values are lists (used for flatMapValues later on).
rdd4=sc.parallelize([('a',['x','y','z']),
                     ('b',['p','r'])])
rdd4.collect()

[('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]

### External data

In [23]:
# Read every .txt file matching the glob into one RDD; each line becomes one element.
textFile=sc.textFile("data/textFiles/*.txt")
# Alternative kept by the author: point at the directory instead of a glob.
# textFile=sc.textFile("data/textFiles")
textFile.collect()

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? Duo Reges: constructio interrete. Quid, si etiam iucunda memoria est praeteritorum malorum? Si quidem, inquit, tollerem, sed relinquo. An nisi populari fama?',
 '',
 'Quamquam id quidem licebit iis existimare, qui legerint. Summum a vobis bonum voluptas dicitur. At hoc in eo M. Refert tamen, quo modo. Quid sequatur, quid repugnet, vident. Iam id ipsum absurdum, maximum malum neglegi.',
 'Aeque enim contingit omnibus fidibus, ut incontentae sint.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quae cum ita sint, effectum est nihil esse malum, quod turpe non sit. Itaque nostrum est-quod nostrum dico, artis est-ad ea principia, quae accepimus. Quod totum contra est. Duo Reges: constructio interrete. Atqui iste locus est, Piso, tibi etiam atque e

In [24]:
# glom() gathers the elements of each partition into a list, so the result
# shows how the lines are distributed over the partitions.
textFile.glom().collect()

[['Utilitatis causa amicitia est quaesita.',
  'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? Duo Reges: constructio interrete. Quid, si etiam iucunda memoria est praeteritorum malorum? Si quidem, inquit, tollerem, sed relinquo. An nisi populari fama?',
  '',
  'Quamquam id quidem licebit iis existimare, qui legerint. Summum a vobis bonum voluptas dicitur. At hoc in eo M. Refert tamen, quo modo. Quid sequatur, quid repugnet, vident. Iam id ipsum absurdum, maximum malum neglegi.'],
 ['Aeque enim contingit omnibus fidibus, ut incontentae sint.',
  'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quae cum ita sint, effectum est nihil esse malum, quod turpe non sit. Itaque nostrum est-quod nostrum dico, artis est-ad ea principia, quae accepimus. Quod totum contra est. Duo Reges: constructio interrete. Atqui iste locus est, Piso, tibi etiam 

In [26]:
# Number of partitions the text files were split into.
textFile.getNumPartitions()

3

In [14]:
# Read a single file; each line of the file becomes one RDD element.
textFile=sc.textFile("data/textFiles/sample1.txt")
textFile.collect()

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? Duo Reges: constructio interrete. Quid, si etiam iucunda memoria est praeteritorum malorum? Si quidem, inquit, tollerem, sed relinquo. An nisi populari fama?',
 '',
 'Quamquam id quidem licebit iis existimare, qui legerint. Summum a vobis bonum voluptas dicitur. At hoc in eo M. Refert tamen, quo modo. Quid sequatur, quid repugnet, vident. Iam id ipsum absurdum, maximum malum neglegi.']

In [19]:
# Read every file in the directory as one (path, content) pair: the key is the
# file path and the value is the whole file text, line breaks ('\n') included.
textFile=sc.wholeTextFiles("data/textFiles")
textFile.take(2)

[('file:/D:/Projects/Python/pyspark/Topic-wise/data/textFiles/sample1.txt',
  'Utilitatis causa amicitia est quaesita.\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? Duo Reges: constructio interrete. Quid, si etiam iucunda memoria est praeteritorum malorum? Si quidem, inquit, tollerem, sed relinquo. An nisi populari fama?\n\nQuamquam id quidem licebit iis existimare, qui legerint. Summum a vobis bonum voluptas dicitur. At hoc in eo M. Refert tamen, quo modo. Quid sequatur, quid repugnet, vident. Iam id ipsum absurdum, maximum malum neglegi.'),
 ('file:/D:/Projects/Python/pyspark/Topic-wise/data/textFiles/sample2.txt',
  'Aeque enim contingit omnibus fidibus, ut incontentae sint.\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Quae cum ita sint, effectum est nihil esse malum, quod turpe non sit. Itaque nostrum est-quod nostrum dico, art

# Retrieving RDD info

In [20]:
rdd.getNumPartitions() # 4 here: matches the 4 worker threads requested with local[4], not the machine's core count

4

In [32]:
rdd.count()  # Number of elements in the RDD
# rdd = [('a', 7), ('a', 2), ('b', 2)]

3

In [28]:
rdd.countByKey() # Number of elements per key; the values are ignored
# rdd = [('a', 7), ('a', 2), ('b', 2)]

defaultdict(int, {'a': 2, 'b': 1})

In [29]:
rdd.countByValue() # Occurrences of each distinct element; for a pair RDD the element is the whole (key, value) tuple
# rdd = [('a', 7), ('a', 2), ('b', 2)]

defaultdict(int, {('a', 7): 1, ('a', 2): 1, ('b', 2): 1})

In [38]:
print(rdd.glom().collect())  # elements grouped per partition
# Return the (key, value) pairs as a dictionary. A dict holds one value per key,
# so for the duplicated key 'a' only one value survives (2 in the output below).
rdd.collectAsMap()
# rdd = [('a', 7), ('a', 2), ('b', 2)]

[[], [('a', 7)], [('a', 2)], [('b', 2)]]


{'a': 2, 'b': 2}

In [41]:
rdd3.sum()  # Sum of all elements
# rdd3 = range(100) -> 0..99

4950

In [39]:
# Check whether an RDD is empty
sc.parallelize([]).isEmpty()

True

In [42]:
# Largest element; rdd3 = range(100) -> 0..99
rdd3.max()

99

In [43]:
rdd3.min()  # Smallest element

0

In [44]:
rdd3.mean() # Arithmetic mean (average) of the elements

49.5

In [46]:
'''A standard deviation is a statistic that measures the dispersion of a 
dataset relative to its mean. The standard deviation is calculated as the 
square root of variance by determining each data point's deviation relative 
to the mean. If the data points are further from the mean, there is a 
higher deviation within the data set; thus, the more spread out the data, 
the higher the standard deviation.'''
# Population standard deviation, i.e. the square root of variance():
# sqrt(833.25) ≈ 28.866 for 0..99.
rdd3.stdev()

28.86607004772212

In [47]:
'''variance is the expectation of the squared deviation of a random variable 
from its mean. Variance is a measure of dispersion, meaning it is a measure 
of how far a set of numbers is spread out from their average value.'''
# Population variance (divides by n, not n-1): 833.25 for 0..99.
rdd3.variance()

833.25

In [49]:
# Histogram with 3 evenly spaced buckets between min and max.
# Returns (bucket boundaries, count per bucket); the last bucket also includes
# its upper boundary, which is why it holds 34 values here.
rdd3.histogram(3)

([0, 33, 66, 99], [33, 33, 34])

In [50]:
rdd3.stats()  # count, mean, stdev, max and min in a single summary object

(count: 100, mean: 49.5, stdev: 28.86607004772212, max: 99.0, min: 0.0)

# Selecting data (Getting)

In [51]:
rdd.collect() # Return a list with all RDD elements

[('a', 7), ('a', 2), ('b', 2)]

In [61]:
rdd.take(5) # Take up to 5 elements; rdd holds only 3, so all of them are returned

[('a', 7), ('a', 2), ('b', 2)]

In [55]:
rdd.first()  # First element of the RDD

('a', 7)

In [59]:
rdd.top(3) # The 3 largest elements in descending order (tuples compare field by field)

[('b', 2), ('a', 7), ('a', 2)]

# Selecting data (Sampling)

In [72]:
rdd3.sample(False,0.10,81).collect()
# withReplacement=False, fraction=0.10, seed=81
# fraction is the expected share of elements, not an exact size:
# this run returned 11 of the 100 elements.

[4, 26, 39, 41, 42, 52, 63, 76, 80, 86, 97]

# Selecting data (Filtering)

In [76]:
# `in` tests membership in the whole (key, value) tuple, so the predicate
# matches when either the key or the value equals the searched item.
print(rdd.filter(lambda x: "a" in x).collect())
print(rdd.filter(lambda x: 7 in x).collect())

[('a', 7), ('a', 2)]
[('a', 7)]


In [80]:
rdd2.distinct().collect()  # Distinct elements; the duplicated ('b', 1) is kept once
# rdd2 = [('a', 2), ('d', 1), ('b', 1), ('b', 1)]

[('a', 2), ('b', 1), ('d', 1)]

In [81]:
rdd.keys().collect() # Keys of the (key, value) pairs; duplicates are kept

['a', 'a', 'b']

# Iterating (foreach output does not appear in the notebook)

In [82]:
def g(x):
    """Print a single element to standard output."""
    print(x)

# foreach() runs g inside the executor (worker) processes, so whatever g prints
# goes to the workers' stdout and nothing is displayed under this cell.
rdd.foreach(g)

# To see the elements here, iterate on the driver instead.
for element in rdd.toLocalIterator():
    g(element)

# Applying functions

In [83]:
# Apply a function to each RDD element: every (key, value) tuple is extended
# with (value, key), giving one 4-tuple per input element.
rdd.map(lambda x: x+(x[1],x[0])).collect()

[('a', 7, 7, 'a'), ('a', 2, 2, 'a'), ('b', 2, 2, 'b')]

In [84]:
# Apply the same function, then flatten: the items of every resulting tuple
# become individual elements of one flat RDD.
rdd.flatMap(lambda x: x+(x[1],x[0])).collect()

['a', 7, 7, 'a', 'a', 2, 2, 'a', 'b', 2, 2, 'b']

In [87]:
# Flatten the value of each (key, value) pair of rdd4 while keeping the key:
# every item of the value list yields its own (key, item) pair.
rdd4.flatMapValues(lambda x: x).collect()
# rdd4 = [('a', ['x', 'y', 'z']), ('b', ['p', 'r'])]

[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]

# Sort and Order By

In [88]:
rdd2.sortBy(lambda x: x[1]).collect()  # Sort by the value (second tuple field), ascending

[('d', 1), ('b', 1), ('b', 1), ('a', 2)]

In [89]:
rdd2.sortByKey().collect()  # Sort by key, ascending

[('a', 2), ('b', 1), ('b', 1), ('d', 1)]

In [91]:
# orderBy exists on DataFrames only; for RDDs use sortBy / sortByKey as above

# Mathematical Operations

In [92]:
# Elements of rdd that do not occur in rdd2 (whole tuples are compared),
# so only ('a', 2) is removed.
rdd.subtract(rdd2).collect()
# rdd  = [('a', 7), ('a', 2), ('b', 2)]
# rdd2 = [('a', 2), ('d', 1), ('b', 1), ('b', 1)]

[('b', 2), ('a', 7)]

In [93]:
# Pairs of rdd2 whose key does not occur in rdd; only key 'd' is absent from rdd.
rdd2.subtractByKey(rdd).collect()
# rdd2 = [('a', 2), ('d', 1), ('b', 1), ('b', 1)]
# rdd  = [('a', 7), ('a', 2), ('b', 2)]

[('d', 1)]

In [94]:
rdd.cartesian(rdd2).collect() # Cartesian product: every element of rdd paired with every element of rdd2 (3 x 4 = 12 pairs)
# rdd  = [('a', 7), ('a', 2), ('b', 2)]
# rdd2 = [('a', 2), ('d', 1), ('b', 1), ('b', 1)]

[(('a', 7), ('a', 2)),
 (('a', 7), ('d', 1)),
 (('a', 7), ('b', 1)),
 (('a', 7), ('b', 1)),
 (('a', 2), ('a', 2)),
 (('a', 2), ('d', 1)),
 (('a', 2), ('b', 1)),
 (('a', 2), ('b', 1)),
 (('b', 2), ('a', 2)),
 (('b', 2), ('d', 1)),
 (('b', 2), ('b', 1)),
 (('b', 2), ('b', 1))]

# Reshaping data (Reducing)

In [95]:
# Merge the values of each key with the given function (here: sum per key).
rdd.reduceByKey(lambda x,y: x+y).collect()
# rdd = [('a', 7), ('a', 2), ('b', 2)]

[('b', 2), ('a', 9)]

In [97]:
# Reduce all elements with +. The elements are tuples, so + concatenates them
# into one flat tuple rather than adding numbers.
rdd.reduce(lambda x,y: x+y)
# rdd = [('a', 7), ('a', 2), ('b', 2)]

('a', 7, 'a', 2, 'b', 2)

# Reshaping data (Grouping By)

In [100]:
# Group by the remainder modulo 2 (key 0 = even numbers, key 1 = odd numbers);
# mapValues(list) turns each group into a plain list so it can be displayed.
rdd3.groupBy(lambda x: x%2).mapValues(list).collect()
# rdd3 = range(100)

[(0,
  [0,
   2,
   4,
   6,
   8,
   10,
   12,
   14,
   16,
   18,
   20,
   22,
   24,
   26,
   28,
   30,
   32,
   34,
   36,
   38,
   40,
   42,
   44,
   46,
   48,
   50,
   52,
   54,
   56,
   58,
   60,
   62,
   64,
   66,
   68,
   70,
   72,
   74,
   76,
   78,
   80,
   82,
   84,
   86,
   88,
   90,
   92,
   94,
   96,
   98])]

In [101]:
# Collect the values of each key into one group; mapValues(list) turns each
# group into a plain list so it can be displayed.
rdd.groupByKey().mapValues(list).collect()
# rdd = [('a', 7), ('a', 2), ('b', 2)]

[('b', [2]), ('a', [7, 2])]

# Reshaping data (Aggregating)

In [102]:
def seqOp(x, y):
    """Fold one value y into the running (sum, count) accumulator x."""
    return (x[0] + y, x[1] + 1)


def combOp(x, y):
    """Merge two (sum, count) accumulators field by field."""
    return (x[0] + y[0], x[1] + y[1])
# Aggregate the elements of each partition with seqOp, starting from the zero
# value (0,0), then merge the per-partition results with combOp -> (sum, count).
rdd3.aggregate((0,0),seqOp,combOp)
# rdd3 = range(100)

(4950, 100)

In [104]:
# Same (sum, count) aggregation, computed separately over the values of each key.
rdd.aggregateByKey((0,0),seqOp,combOp).collect()

[('b', (2, 1)), ('a', (9, 2))]

In [108]:
#Aggregate the elements of each partition, and then the results
def add(a, b):
    """Return the result of a + b."""
    total = a + b
    return total
rdd3.fold(0,add)  # 0 is the zero value the folding starts from

4950

In [114]:
# Merge the values of each key with add, starting from the zero value 0.
rdd.foldByKey(0, add).collect()

[('b', 2), ('a', 9)]

In [110]:
# Build (key, element) tuples where the key is the function applied to the
# element (here x+x, i.e. twice the element).
rdd3.keyBy(lambda x: x+x).collect()

[(0, 0),
 (2, 1),
 (4, 2),
 (6, 3),
 (8, 4),
 (10, 5),
 (12, 6),
 (14, 7),
 (16, 8),
 (18, 9),
 (20, 10),
 (22, 11),
 (24, 12),
 (26, 13),
 (28, 14),
 (30, 15),
 (32, 16),
 (34, 17),
 (36, 18),
 (38, 19),
 (40, 20),
 (42, 21),
 (44, 22),
 (46, 23),
 (48, 24),
 (50, 25),
 (52, 26),
 (54, 27),
 (56, 28),
 (58, 29),
 (60, 30),
 (62, 31),
 (64, 32),
 (66, 33),
 (68, 34),
 (70, 35),
 (72, 36),
 (74, 37),
 (76, 38),
 (78, 39),
 (80, 40),
 (82, 41),
 (84, 42),
 (86, 43),
 (88, 44),
 (90, 45),
 (92, 46),
 (94, 47),
 (96, 48),
 (98, 49),
 (100, 50),
 (102, 51),
 (104, 52),
 (106, 53),
 (108, 54),
 (110, 55),
 (112, 56),
 (114, 57),
 (116, 58),
 (118, 59),
 (120, 60),
 (122, 61),
 (124, 62),
 (126, 63),
 (128, 64),
 (130, 65),
 (132, 66),
 (134, 67),
 (136, 68),
 (138, 69),
 (140, 70),
 (142, 71),
 (144, 72),
 (146, 73),
 (148, 74),
 (150, 75),
 (152, 76),
 (154, 77),
 (156, 78),
 (158, 79),
 (160, 80),
 (162, 81),
 (164, 82),
 (166, 83),
 (168, 84),
 (170, 85),
 (172, 86),
 (174, 87),
 (176, 88

# Repartitioning / coalesce

In [125]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the import cell at the top.
from pyspark.sql.functions import col
# Convert to a DataFrame and repartition into 5 partitions by the "count" column:
# rows with the same "count" land in the same partition. glom() shows the layout.
rdd.toDF(["val","count"]).repartition(5,col("count")).rdd.glom().collect()

[[],
 [Row(val='a', count=7)],
 [],
 [Row(val='a', count=2), Row(val='b', count=2)],
 []]

In [126]:
# Same as above but partitioned by "val": both 'a' rows share one partition.
rdd.toDF(["val","count"]).repartition(5,col("val")).rdd.glom().collect()

[[Row(val='a', count=7), Row(val='a', count=2)],
 [Row(val='b', count=2)],
 [],
 [],
 []]

In [127]:
rdd.coalesce(1).collect()  # Reduce to a single partition; the elements are unchanged

[('a', 7), ('a', 2), ('b', 2)]

# Saving

In [128]:
# Write the RDD as text under the directory save/rdddemo1.
# NOTE(review): Spark normally refuses to write into an output directory that
# already exists, so re-running this cell likely requires deleting it first — confirm.
rdd.saveAsTextFile('save/rdddemo1')

In [129]:
# Collapse to one partition before writing.
# NOTE(review): presumably done to get a single output part file — confirm.
rdd.coalesce(1).saveAsTextFile('save/rdddemo2')