https://www.datacamp.com/cheat-sheet/pyspark-cheat-sheet-spark-in-python

Initializing Spark 
SparkContext 

In [1]:
# from pyspark import SparkContext
# sc = SparkContext(master = 'local[2]')
# sc

Inspect SparkContext 

In [2]:

# # Print SparkContext information
# print("SparkContext version: ", sc.version)
# print("Python version: ", sc.pythonVer)
# print("Master URL: ", sc.master)
# print("Path where Spark is installed: ", sc.sparkHome)
# print("Spark User: ", sc.sparkUser())
# print("Application name: ", sc.appName)
# print("Application ID: ", sc.applicationId)
# print("Default level of parallelism: ", sc.defaultParallelism)
# print("Default minimum number of partitions for RDDs: ", sc.defaultMinPartitions)

Configuration 

In [3]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("My app").set("spark.executor.memory", "1g")
sc = SparkContext(conf = conf)

Using the Shell 

In [4]:
# $ ./bin/spark-shell --master local[2]
# $ ./bin/pyspark --master local[s] --py-files code.py

In [5]:
# !spark-shell --master local[2]
# !pyspark --master local[s] --py-files code.py

In [6]:
rdd = sc.parallelize([('a',7),('a',2),('b',2)])
rdd2 = sc.parallelize([('a',2),('d',1),('b',1)])
rdd3 = sc.parallelize(range(100))
rdd4 = sc.parallelize([("a",["x","y","z"]), ("b", ["p","r"])])

Basic Information 

In [7]:
# List the number of partitions
print(rdd.getNumPartitions())

# Count RDD instances
print(rdd.count())

# Count RDD instances by key
print(rdd.countByKey()) # Returns a defaultdict(<type 'int'>,{'a':2,'b':1})

# Count RDD instances by value
print(rdd.countByValue()) # Returns a defaultdict(<type 'int'>,{('b',2):1,('a',2):1,('a',7):1})

# Return (key,value) pairs as a dictionary
print(rdd.collectAsMap()) # Returns {'a': 2, 'b': 2}

# Sum of RDD elements
print(rdd3.sum()) # Returns 4950

# Check whether RDD is empty
print(sc.parallelize([]).isEmpty()) # Returns True

1
3
defaultdict(<class 'int'>, {'a': 2, 'b': 1})
defaultdict(<class 'int'>, {('a', 7): 1, ('a', 2): 1, ('b', 2): 1})
{'a': 2, 'b': 2}
4950
True


In [None]:
# # Create an RDD from a text file
# file_path = "/my/directory/file.txt" # Replace "•" with an actual file name
# text_rdd = sc.textFile(file_path)

# # Create an RDD that contains the contents of multiple text files
# directory_path = "/my/directory"
# text_rdd2 = sc.wholeTextFiles(directory_path)

Summary 

In [9]:

# Compute statistics
max_val = rdd3.max()
min_val = rdd3.min()
mean_val = rdd3.mean()
stdev_val = rdd3.stdev()
variance_val = rdd3.variance()
histogram_val = rdd3.histogram(3)
stats_val = rdd3.stats()

# Print results
print("Maximum value: {}".format(max_val))
print("Minimum value: {}".format(min_val))
print("Mean value: {}".format(mean_val))
print("Standard deviation: {}".format(stdev_val))
print("Variance: {}".format(variance_val))
print("Histogram: {}".format(histogram_val))
print("Summary statistics: {}".format(stats_val))


Maximum value: 99
Minimum value: 0
Mean value: 49.5
Standard deviation: 28.86607004772212
Variance: 833.25
Histogram: ([0, 33, 66, 99], [33, 33, 34])
Summary statistics: (count: 100, mean: 49.5, stdev: 28.86607004772212, max: 99.0, min: 0.0)


Applying Functions 

In [16]:
#Apply a function to each RFD element>>> 
print(rdd.map(lambda x: x+(x[1],x[0])).collect() )#[('a' ,7,7, 'a'),('a' ,2,2, 'a'), ('b' ,2,2, 'b')]
#Apply a function to each RDD element and flatten the result>>> 
rdd5 = rdd.flatMap(lambda x: x+(x[1],x[0]))
print(rdd5.collect()) #['a',7 , 7 ,  'a' , 'a' , 2,  2,  'a', 'b', 2 , 2, 'b']
#Apply a flatMap function to each (key,value) pair of rdd4 without changing the keys>>> 
print(rdd4.flatMapValues(lambda x: x).collect()) #[('a', 'x'), ('a', 'y'), ('a', 'z'),('b', 'p'),('b', 'r')]

[('a', 7, 7, 'a'), ('a', 2, 2, 'a'), ('b', 2, 2, 'b')]
['a', 7, 7, 'a', 'a', 2, 2, 'a', 'b', 2, 2, 'b']
[('a', 'x'), ('a', 'y'), ('a', 'z'), ('b', 'p'), ('b', 'r')]


Selecting Data
- Getting

In [18]:

# Collect all RDD elements and return them as a list
print(rdd.collect())

# Take the first 2 elements of the RDD
print(rdd.take(2))

# Take the first element of the RDD
print(rdd.first())

# Take the top 2 elements of the RDD based on their values
print(rdd.top(2))

[('a', 7), ('a', 2), ('b', 2)]
[('a', 7), ('a', 2)]
('a', 7)
[('b', 2), ('a', 7)]


In [19]:
rdd3.sample(False, 0.15, 81).collect() #Return sampled subset of rdd3     [3,4,27,31,40,41,42,43,60,76,79,80,86,97]

[3, 4, 27, 28, 35, 41, 43, 49, 53, 58, 85, 93]

In [20]:
print(rdd.filter(lambda x: "a" in x).collect()) #Filter the RDD[('a',7),('a',2)]>>> 
print(rdd5.distinct().collect()) #Return distinct RDD values['a' ,2, 'b',7]>>> 
print(rdd.keys().collect()) #Return (key,value) RDD's keys['a',  'a',  'b']

[('a', 7), ('a', 2)]
['a', 7, 2, 'b']
['a', 'a', 'b']


In [29]:

def g(x): print(x)

rdd.foreach(g) # Apply a function to all RDD elements

In [28]:
rdd.foreach(lambda x: g(x)) # apply g(x) using a lambda function

Reshaping Data 

In [33]:
# Reducing
print(rdd.reduceByKey(lambda x, y: x + y).collect()) # Merge the rdd values for each key [('a', 9), ('b', 2)]
print(rdd.reduce(lambda a, b: a+ b)) #Merge the rdd values('a', 7, 'a' , 2 , 'b' , 2)
print(rdd.values().sum())  # Sum all the rdd values: 11

[('a', 9), ('b', 2)]
('a', 7, 'a', 2, 'b', 2)
11


In [34]:
# Grouping
# Group RDD values by even and odd numbers, and return values as a list for each group
print(rdd3.groupBy(lambda x: x % 2).mapValues(list).collect())

# Group RDD values by key and return values as a list for each key
rdd.groupByKey().mapValues(list).collect()

[(0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]), (1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])]


[('a', [7, 2]), ('b', [2])]

In [36]:
# Aggregating
def add(x, y):
    return x + y
# Define seqOp and combOp functions for aggregation
seqOp = lambda x, y: (x[0] + y, x[1] + 1)
combOp = lambda x, y: (x[0] + y[0], x[1] + y[1])

# Aggregate RDD elements of each partition and then combine the results
rdd3.aggregate((0, 0), seqOp, combOp)

# Aggregate values of each RDD key
result = rdd.aggregateByKey((0, 0), seqOp, combOp).collect()
print(result)

# Aggregate the elements of each partition, and then combine the results
rdd3.fold(0, add)

# Merge the values for each key
result = rdd.foldByKey(0, add).collect()
print(result)

# Create tuples of RDD elements by applying a function
result = rdd3.keyBy(lambda x: x + x).collect()
print(result)

[('a', (9, 2)), ('b', (2, 1))]
[('a', 9), ('b', 2)]
[(0, 0), (2, 1), (4, 2), (6, 3), (8, 4), (10, 5), (12, 6), (14, 7), (16, 8), (18, 9), (20, 10), (22, 11), (24, 12), (26, 13), (28, 14), (30, 15), (32, 16), (34, 17), (36, 18), (38, 19), (40, 20), (42, 21), (44, 22), (46, 23), (48, 24), (50, 25), (52, 26), (54, 27), (56, 28), (58, 29), (60, 30), (62, 31), (64, 32), (66, 33), (68, 34), (70, 35), (72, 36), (74, 37), (76, 38), (78, 39), (80, 40), (82, 41), (84, 42), (86, 43), (88, 44), (90, 45), (92, 46), (94, 47), (96, 48), (98, 49), (100, 50), (102, 51), (104, 52), (106, 53), (108, 54), (110, 55), (112, 56), (114, 57), (116, 58), (118, 59), (120, 60), (122, 61), (124, 62), (126, 63), (128, 64), (130, 65), (132, 66), (134, 67), (136, 68), (138, 69), (140, 70), (142, 71), (144, 72), (146, 73), (148, 74), (150, 75), (152, 76), (154, 77), (156, 78), (158, 79), (160, 80), (162, 81), (164, 82), (166, 83), (168, 84), (170, 85), (172, 86), (174, 87), (176, 88), (178, 89), (180, 90), (182, 91), 

Mathematical Operations 

In [37]:
# Return each rdd value not contained in rdd2
print(rdd.subtract(rdd2).collect())  # [('b' ,2), ('a' ,7)]

# Return each (key,value) pair of rdd2 with no matching key in rdd
print(rdd2.subtractByKey(rdd).collect())  # [('d', 1)]

# Return the Cartesian product of rdd and rdd2
rdd.cartesian(rdd2).collect()

[('a', 7), ('b', 2)]
[('d', 1)]


[(('a', 7), ('a', 2)),
 (('a', 7), ('d', 1)),
 (('a', 7), ('b', 1)),
 (('a', 2), ('a', 2)),
 (('a', 2), ('d', 1)),
 (('a', 2), ('b', 1)),
 (('b', 2), ('a', 2)),
 (('b', 2), ('d', 1)),
 (('b', 2), ('b', 1))]

In [38]:
#Sort 
# RDD by value ascendingly
print(rdd2.sortBy(lambda x: x[1]).collect())

#Sort (key, value) RDD by key ascendingly 
rdd2.sortByKey().collect()

[('d', 1), ('b', 1), ('a', 2)]


[('a', 2), ('b', 1), ('d', 1)]

In [39]:
# repartition
rdd = rdd.repartition(4) # Creating new RDD with 4 partitions
rdd = rdd.coalesce(1) # Decreasing the number of partitions to 1

In [40]:
# Save RDD as a text file
rdd.saveAsTextFile("tmp_rdd.txt")

# Save RDD as a Hadoop file
output_format = 'org.apache.hadoop.mapred.TextOutputFormat'
# file_path = 'hdfs://namenodehost/parent/child'
file_path = './tmp_child'
rdd.saveAsHadoopFile(file_path, output_format)

In [None]:
# Stopping SparkContext 

sc.stop()


In [None]:
# Execution 

$ ./bin/spark-submit examples/src/main/python/pi.py