In [7]:
''' Pipelining
    
    Spark will try to combine as many narrow transformations as possible into a single stage and apply them to each partition
    On a wide transformation Spark will create a shuffle and a new stage
    On a shuffle the data is written to shuffle files (UnsafeRows, i.e. the Tungsten binary format) on disk and sent to other executors over the network for the next stage
    
    
    Avoid shuffle operations whenever possible
    Repartition data properly
    Filter data as early as possible
'''

' Pipelining\n    \n    Spark will try to compine as mutch posible narrow transfomrations in a single stage and apply them on a partition\n    On a wide transformation spark will create a shuffle and a new stage\n    On a shuffle the data are written on Shuffle files (Unsafe rows or Tungsten Binary Format) on disk and send to other executors over the network for the next stage\n'

In [1]:
from pyspark.sql import SparkSession

# Configure the session builder step by step: application name and a local
# master first, then the core-related settings.
session_builder = SparkSession.builder.appName("Reading from sockets").master("local[*]")
session_builder = session_builder.config("spark.cores.max", 16)
session_builder = session_builder.config("spark.executor.cores", 4)

# Obtain the session from the configured builder.
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders its summary.
spark

In [2]:
# Switch off adaptive query execution, its partition coalescing, and automatic
# broadcast joins (threshold -1) before running the aggregations below.
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(conf_key, conf_value)

In [3]:
# Explicit DDL schema for the employee CSV, passed to the reader below.
schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

# Location of the employee records file.
emp_csv_path = "/home/jovyan/data/employee_records.csv"

# Read the CSV with the header option enabled and the schema declared above.
emp = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load(emp_csv_path)
)

In [4]:
'''
    Spark will create a job with two stages.
    The first stage has 16 tasks, because this machine has 16 cores, and reads the data.
    Each of those tasks then writes 10 records to its shuffle files (1 row for each department in each partition).
    The second stage has 200 tasks because of the default spark.sql.shuffle.partitions, but only 10 of them have data to process.
'''

from pyspark.sql.functions import avg

# Average salary per department; the groupBy is the wide transformation that
# introduces the shuffle described above.
emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))

# Write to the "noop" format so the job executes end to end without a real sink.
emp_avg.write.format("noop").mode("overwrite").save()

In [5]:
from pyspark.sql.functions import spark_partition_id

# Tag every row with the id of the partition holding it, then count rows per id.
rows_per_partition = (
    emp
    .withColumn("part", spark_partition_id())
    .groupBy("part")
    .count()
)
rows_per_partition.show()

+----+-----+
|part|count|
+----+-----+
|  12|65152|
|   1|65190|
|  13|65160|
|   6|65176|
|   3|65211|
|   5|65238|
|  15|22131|
|   9|65172|
|   4|65162|
|   8|65212|
|   7|65217|
|  10|65206|
|  11|65197|
|  14|65150|
|   2|65195|
|   0|65231|
+----+-----+



In [5]:
'''
    Setting spark.sql.shuffle.partitions to a much lower value creates far fewer
    tasks in the shuffle stage, so they are better utilized and the job runs faster.
'''
spark.conf.set("spark.sql.shuffle.partitions", 16)

# Same per-department salary average as before, now with 16 shuffle partitions.
emp_avg = (
    emp
    .groupBy("department_id")
    .agg(avg("salary").alias("avg_sal"))
)

# Trigger the job through the "noop" format in overwrite mode.
noop_writer = emp_avg.write.format("noop").mode("overwrite")
noop_writer.save()