In [None]:
'''
    When joining table A and table B,
    Spark reads the data of both tables in partitions,
    and each executor holds rows for different keys.
    
    To join the tables, Spark shuffles the data so that rows with the same key end up on the same executor.
    Then it can join the data locally.
'''

'''
Join strategies:
    Shuffle Hash:
                    1. Shuffle the data
                    2. Hash table on the smaller table
                    3. Match the hashed rows on big table
                    4. Join
                    
                    Used when one of the two tables is small enough to fit in memory
                    Does not need the data sorted
                    
    Sort merge:
                    1. Shuffle
                    2. Sort both tables
                    3. Merge join
                    
                    Used when both tables are big
                    
    Broadcast join:
                    
                    1. Broadcast smaller table
                    2. Make hash join
                    
                    Used when one of the tables is small
                    Default broadcast threshold (spark.sql.autoBroadcastJoinThreshold) is 10MB; a broadcast table can be at most 8GB
                    
'''

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

# Build (or reuse) the SparkSession used by every cell below.
# The app name is the label shown in the Spark UI, so it describes this notebook's topic.
# The master URL can be overridden through the SPARK_MASTER_URL environment variable;
# when it is not set, the original hard-coded standalone master is used.
# NOTE(review): the fallback host looks like a Docker container id, which changes when the
# container is recreated — confirm it still points at a running master.
spark = (
    SparkSession
    .builder
    .appName("Join strategies and bucketing")
    .master(os.environ.get("SPARK_MASTER_URL", "spark://4d64623e658a:7077"))
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.IllegalArgumentException: requirement failed: Can only call getServletHandlers on a running MetricsSystem
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.metrics.MetricsSystem.getServletHandlers(MetricsSystem.scala:89)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:650)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


In [2]:
# Switch off adaptive query execution, partition coalescing and automatic broadcast joins
# (threshold -1) so that the join strategy seen in the plans below is not changed at runtime.
for conf_key, conf_value in (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(conf_key, conf_value)

In [3]:
# Column names and types for the employee CSV, declared up front so Spark does not infer them.
schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

# Employee records; the first line of the file is treated as a header.
emp = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("/home/jovyan/data/employee_records.csv")
)

In [4]:
# Department lookup table.
# The schema is declared explicitly instead of using inferSchema: it matches the schema that
# inference produced (see the printSchema output), avoids an extra pass over the file just to
# guess types, and keeps this cell consistent with how the employee data is loaded.
dep_schema = "department_id int, department_name string, description string, city string, state string, country string"

dep = (
    spark.read
    .format("csv")
    .schema(dep_schema)
    .option("header", True)
    .load("/home/jovyan/data/department.csv")
)

In [8]:
# Print the column names and types of the department DataFrame.
dep.printSchema()

root
 |-- department_id: integer (nullable = true)
 |-- department_name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)



In [5]:
# Left outer join: every employee row is kept, with department columns attached when
# the department_id matches.
emp_dep_condition = emp.department_id == dep.department_id
df_joined = emp.join(dep, on=emp_dep_condition, how="left_outer")

In [6]:
# Trigger the join: the "noop" sink runs the whole plan without writing any output,
# which makes it convenient for inspecting the job in the Spark UI.
df_joined.write.format("noop").mode("overwrite").save()

In [12]:
# Print the physical plan of the join.
# With automatic broadcast disabled (autoBroadcastJoinThreshold was set to -1 above),
# Spark chooses SortMergeJoin: both sides are shuffled (Exchange) and sorted before merging.
df_joined.explain()

== Physical Plan ==
*(4) SortMergeJoin [department_id#36], [department_id#62], LeftOuter
:- *(1) Sort [department_id#36 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(department_id#36, 200), ENSURE_REQUIREMENTS, [id=#113]
:     +- FileScan csv [first_name#29,last_name#30,job_title#31,dob#32,email#33,phone#34,salary#35,department_id#36] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/data/employee_records.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
+- *(3) Sort [department_id#62 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(department_id#62, 200), ENSURE_REQUIREMENTS, [id=#125]
      +- *(2) Filter isnotnull(department_id#62)
         +- FileScan csv [department_id#62,department_name#63,description#64,city#65,state#66,country#67] Batched: false, DataFilters: [isnotnull(department_id#62)], Format:

In [8]:
'''
    Force a broadcast join using the broadcast() hint
    
'''

# `broadcast` is imported in the notebook's import cell at the top.
# Wrapping the small department table in broadcast() hints Spark to ship it to every
# executor, so the large employee side does not have to be shuffled.
df_joined_broadcast = emp.join(
    broadcast(dep),
    on=emp.department_id == dep.department_id,
    how="left_outer",
)

# Trigger the job; the "noop" sink runs the plan without writing any output.
df_joined_broadcast.write.format("noop").mode("overwrite").save()

In [9]:
# Print the physical plan; it shows a BroadcastHashJoin with the department side
# (BuildRight) broadcast instead of shuffled.
df_joined_broadcast.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [department_id#7], [department_id#33], LeftOuter, BuildRight, false
:- FileScan csv [first_name#0,last_name#1,job_title#2,dob#3,email#4,phone#5,salary#6,department_id#7] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/data/employee_records.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [id=#133]
   +- *(1) Filter isnotnull(department_id#33)
      +- FileScan csv [department_id#33,department_name#34,description#35,city#36,state#37,country#38] Batched: false, DataFilters: [isnotnull(department_id#33)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/jovyan/data/department.csv], PartitionFilters: [], PushedFilters: [IsNotNull(department_id)], ReadSchema: struct<department_

In [10]:
# Optimizing a join between two big tables.
# Broadcasting is not an option here: trying to broadcast either table fails
# because the table is too large.

# Column names and types for the cities CSV.
city_schema = "city_id string, city string, state string, state_abv string, country string"

# Cities data; the first line of the file is treated as a header.
city = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(city_schema)
    .load("/home/jovyan/data/cities.csv")
)

In [11]:
# Column names and types for the sales CSV.
sales_schema = "transacted_at string, trx_id string, retailer_id string, description string, amount double, city_id string"

# Sales transactions; the first line of the file is treated as a header.
sales = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(sales_schema)
    .load("/home/jovyan/data/new_sales.csv")
)

In [12]:
# Baseline join of the two big tables on city_id (left outer keeps every sale).
sales_city_condition = sales.city_id == city.city_id
df_sales_joined = sales.join(city, on=sales_city_condition, how="left_outer")

# Trigger the job through the "noop" sink so the plan runs without writing output.
(
    df_sales_joined.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [16]:
''' Bucketing
    
    Create buckets and assign each row to a bucket by hashing the key column (murmur hash)
    Rows in the same bucket will then be read by the same executor
    The number of buckets should be the same for both datasets
    Bucketing works only with tables (saveAsTable), not with plain file writes
    
    bucketBy(4,"city_id"): number of buckets, key column
    
    We will have up to number_of_partitions * number_of_buckets files in that table
    
    The idea of bucketing is to write the data as table that is bucketed and on reading we will avoid shuffle because the same bucket will go to the same executor
'''

# Persist the sales data as a bucketed table: 4 buckets keyed on city_id.
(
    sales.write
    .format("csv")
    .mode("overwrite")
    .bucketBy(4, "city_id")
    .option("header", True)
    .option("path", "/home/jovyan/data/sales_bucket.csv")
    .saveAsTable("sales_bucket")
)

# Persist the cities data with the same bucket count and the same bucket column.
(
    city.write
    .format("csv")
    .mode("overwrite")
    .bucketBy(4, "city_id")
    .option("header", True)
    .option("path", "/home/jovyan/data/city_bucket.csv")
    .saveAsTable("city_bucket")
)

In [19]:
# List the tables in the `default` database to confirm both bucketed tables were registered.
spark.sql("show tables in default;").show()

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|  default| city_bucket|      false|
|  default|sales_bucket|      false|
+---------+------------+-----------+



In [21]:
'''
    Notes about bucketing:
        If joining column is not the bucket column, same bucket count  ==> shuffle both tables
        If joining column is the bucket column, one table bucketed     ==> shuffle the non-bucketed table
        If joining column is the bucket column, different bucket count ==> shuffle the side with fewer buckets
        If joining column is the bucket column, same bucket count      ==> no shuffle (fast join)

'''


# Load both bucketed tables back through the catalog.
sales_table = spark.read.table("sales_bucket")
cities_table = spark.read.table("city_bucket")

# Join on city_id, which is the bucket column of both tables.
bucket_join_condition = sales_table.city_id == cities_table.city_id
df_joined_bucket = sales_table.join(
    cities_table,
    on=bucket_join_condition,
    how="left_outer",
)

# Trigger the job through the "noop" sink so the plan runs without writing output.
(
    df_joined_bucket.write
    .format("noop")
    .mode("overwrite")
    .save()
)