In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Reading from sockets")
    .master("local[*]")
    .config("spark.cores.max", 4)
    .config("spark.executor.cores",2)
    .getOrCreate()
)

spark

In [2]:
spark.conf.set("spark.sql.adaptive.enabled",False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled",False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)

In [3]:
emp_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"
emp = spark.read.format("csv").schema(emp_schema).option("header", True).load("/home/jovyan/data/employee_records.csv")


dep_schema = "department_id int, department_name string, description string, coty string, state string, country string"
dep = spark.read.format("csv").schema(dep_schema).option("header", True).load("/home/jovyan/data/department.csv")

In [4]:
df_joined = emp.join(dep, on=emp.department_id == dep.department_id, how="left_outer")

df_joined.write.format("noop").mode("overwrite").save()

df_joined.explain()

== Physical Plan ==
*(4) SortMergeJoin [department_id#7], [department_id#16], LeftOuter
:- *(1) Sort [department_id#7 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(department_id#7, 200), ENSURE_REQUIREMENTS, [id=#70]
:     +- FileScan csv [first_name#0,last_name#1,job_title#2,dob#3,email#4,phone#5,salary#6,department_id#7] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/data/employee_records.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
+- *(3) Sort [department_id#16 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(department_id#16, 200), ENSURE_REQUIREMENTS, [id=#82]
      +- *(2) Filter isnotnull(department_id#16)
         +- FileScan csv [department_id#16,department_name#17,description#18,coty#19,state#20,country#21] Batched: false, DataFilters: [isnotnull(department_id#16)], Format: CSV, Locatio

In [5]:
''' 
    When we see  Spill (Memory) Spill (Disk) it means that some of the tasks process much data than other,and the data are not fit to memory because we have skew
    Spill (Disk) is bad because the spark need to serialize them to put it to disk and den deserialize them again to process them
'''
from pyspark.sql.functions import spark_partition_id, count

df_joined.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").count().show()

+------------+------+
|partition_id| count|
+------------+------+
|         103|100417|
|         122| 99780|
|          43| 99451|
|         107| 99805|
|          49| 99706|
|          51|100248|
|         102|100214|
|          66|100210|
|         174|100155|
|          89|100014|
+------------+------+



In [17]:
'''
    Salting: add salt (random number on joining column to split fair the rowes between tasks)
             after the join we remove the salt
             
             The big table will salted
             The small table will cross joined with salt range df
'''
import random
from pyspark.sql.functions import udf, concat, lit


@udf
def add_salt():
    return random.randint(0,16)
    
    
spark.conf.set("spark.shuffle.partitions", 16)

salt_df = spark.range(16)
salt_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
+---+



In [22]:
salted_employ = emp.withColumn("salted_department", concat("department_id", lit("_"), add_salt()))
salted_employ.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-----------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|salted_department|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-----------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|             8_10|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|             7_11|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|             10_2|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|              1_6|
|  Michelle|   Elliott|    

In [23]:
salted_dep = dep.join(salt_df, how='cross').withColumn("salted_department", concat("department_id", lit("_"), "id"))
salted_dep.show()

+-------------+--------------------+--------------------+--------------------+-----+-------------------+---+-----------------+
|department_id|     department_name|         description|                coty|state|            country| id|salted_department|
+-------------+--------------------+--------------------+--------------------+-----+-------------------+---+-----------------+
|            1|         Bryan-James|Optimized disinte...|        Melissaburgh|   FM|Trinidad and Tobago|  0|              1_0|
|            2|Smith, Craig and ...|Digitized empower...|          Morrisside|   DE|          Sri Lanka|  0|              2_0|
|            3|Pittman, Hess and...|Multi-channeled c...|         North David|   SC|       Turkmenistan|  0|              3_0|
|            4|Smith, Snyder and...|Reactive neutral ...|       Lake Jennifer|   TX|         Madagascar|  0|              4_0|
|            5|          Hardin Inc|Re-contextualized...|           Hayestown|   WA|               Fiji|  0|   

In [25]:
salted_join = salted_employ.join(salted_dep, on=salted_employ.salted_department == salted_dep.salted_department, how="left_outer")

In [26]:
salted_join.withColumn("partition_id", spark_partition_id()).groupBy("partition_id").count().show()

+------------+-----+
|partition_id|count|
+------------+-----+
|          31|11965|
|         137|11861|
|         101|11920|
|         126|11799|
|          81|11770|
|         183| 5860|
|          76| 5926|
|          26| 5896|
|          27|11760|
|         192| 5733|
|          91| 5788|
|         122|11974|
|          93|23462|
|          47| 5910|
|         152| 5807|
|         185| 5771|
|         146| 5961|
|          52| 5770|
|         182| 5901|
|         168| 5837|
+------------+-----+
only showing top 20 rows



In [27]:
salted_join.write.format("noop").mode("overwrite").save()