In [17]:
// In python use: from pyspark.sql.functions import broadcast, split, lit
import org.apache.spark.sql.functions.{broadcast, split, lit}

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

val matches = spark.read.option("header", "true")
                                    .option("inferSchema", "true")
                                    .csv("/home/iceberg/data/matches.csv")

val matchDetailsBucketed = spark.read.option("header", "true")
                                        .option("inferSchema", "true")
                                        .csv("/home/iceberg/data/match_details.csv")

import spark.implicits._
val matchesBucketed = matches.where($"completion_date" === "2016-01-01")
                                        

import org.apache.spark.sql.functions.{broadcast, split, lit}
matches: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]
import spark.implicits._
matchesBucketed: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [match_id: string, mapid: string ... 8 more fields]


In [2]:
spark.sql(s"drop table if exists bootcamp.matches_bucketed")
val bucketedDDL = f"""
    create table if not exists bootcamp.matches_bucketed (
        match_id string,
        is_team_game boolean,
        playlist_id string,
        completion_date timestamp
    )
    using iceberg
    partitioned by (completion_date, bucket(16, match_id))
"""
spark.sql(bucketedDDL)

bucketedDDL: String =
"
    create table if not exists bootcamp.matches_bucketed (
        match_id string,
        is_team_game boolean,
        playlist_id string,
        completion_date timestamp
    )
    using iceberg
    partitioned by (completion_date, bucket(16, match_id))
"
res0: org.apache.spark.sql.DataFrame = []


In [22]:
matchesBucketed.select(
    $"match_id",
    $"is_team_game",
    $"playlist_id",
    $"completion_date"
)
.write.mode("overwrite")
.partitionBy("completion_date")
.bucketBy(16, "match_id")
.saveAsTable("bootcamp.matches_bucketed")

In [10]:
spark.sql("drop table if exists bootcamp.match_details_bucketed")
val bucketedDetailsDDL = f"""
    create table if not exists bootcamp.match_details_bucketed (
        match_id string,
        player_gamertag string,
        player_total_kills integer,
        player_total_deaths integer
    )
    using iceberg
    partitioned by (bucket(16, match_id))
"""
spark.sql(bucketedDetailsDDL)

bucketedDetailsDDL: String =
"
    create table if not exists bootcamp.match_details_bucketed (
        match_id string,
        player_gamertag string,
        player_total_kills integer,
        player_total_deaths integer
    )
    using iceberg
    partitioned by (bucket(16, match_id))
"
res7: org.apache.spark.sql.DataFrame = []


In [11]:
matchDetailsBucketed.select(
    $"match_id",
    $"player_gamertag",
    $"player_total_kills",
    $"player_total_deaths"
)
.write.mode("overwrite")
.bucketBy(16, "match_id")
.saveAsTable("bootcamp.match_details_bucketed")

In [24]:
matchesBucketed.createOrReplaceTempView("matches")
matchDetailsBucketed.createOrReplaceTempView("match_details")

In [23]:
spark.sql("""
    select *
    from bootcamp.match_details_bucketed mdb
    join bootcamp.matches_bucketed mb
    on mdb.match_id = mb.match_id
    and mb.completion_date = date('2016-01-01')
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#978], [match_id#982], Inner
   :- Sort [match_id#978 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#978, 200), ENSURE_REQUIREMENTS, [plan_id=395]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#978, player_gamertag#979, player_total_kills#980, player_total_deaths#981] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#982 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#982, 200), ENSURE_REQUIREMENTS, [plan_id=396]
         +- BatchScan demo.bootcamp.matches_bucketed[match_id#982, is_team_game#983, playlist_id#984, completion_date#985] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []




In [25]:
spark.sql("""
    select *
    from match_details md
    join matches m
    on md.match_id = m.match_id
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#583], [match_id#546], Inner
   :- Sort [match_id#583 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#583, 200), ENSURE_REQUIREMENTS, [plan_id=428]
   :     +- Filter isnotnull(match_id#583)
   :        +- FileScan csv [match_id#583,player_gamertag#584,previous_spartan_rank#585,spartan_rank#586,previous_total_xp#587,total_xp#588,previous_csr_tier#589,previous_csr_designation#590,previous_csr#591,previous_csr_percent_to_next_tier#592,previous_csr_rank#593,current_csr_tier#594,current_csr_designation#595,current_csr#596,current_csr_percent_to_next_tier#597,current_csr_rank#598,player_rank_on_team#599,player_finished#600,player_average_life#601,player_total_kills#602,player_total_headshots#603,player_total_weapon_damage#604,player_total_shots_landed#605,player_total_melee_kills#606,... 12 more fields] Batched: false, DataFilters: [isnotnull(match_id#583)], Format: CSV, Location: InMemo

In [16]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1000000000000")

val broadcastFromThreshold = matchesBucketed.as("m")
    .join(matchDetailsBucketed.as("md"), 
        $"m.match_id" === $"md.match_id")
    .select($"m.completion_date", $"md.player_gamertag", $"md.player_total_kills")
    .take(5)

broadcastFromThreshold: Array[org.apache.spark.sql.Row] = Array([2016-01-01 00:00:00.0,EcZachly,7], [2016-01-01 00:00:00.0,Hernan Crespo,11], [2016-01-01 00:00:00.0,xHBKxTheTruthx,6], [2016-01-01 00:00:00.0,PRGUY18,1], [2016-01-01 00:00:00.0,Ash All Mighty,3])


In [18]:
val explicitBroadcast = broadcast(matchesBucketed).as("m")
    .join(matchDetailsBucketed.as("md"),
        $"m.match_id" === $"md.match_id")
    .select($"md.*", split($"completion_date", " ").getItem(0).as("ds"))

explicitBroadcast: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 35 more fields]


In [19]:
val bucketedValues = matchDetailsBucketed.as("md")
    .join(matchesBucketed.as("m"),
        $"md.match_id" === $"m.match_id").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#583], [match_id#546], Inner
   :- Sort [match_id#583 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#583, 200), ENSURE_REQUIREMENTS, [plan_id=338]
   :     +- Filter isnotnull(match_id#583)
   :        +- FileScan csv [match_id#583,player_gamertag#584,previous_spartan_rank#585,spartan_rank#586,previous_total_xp#587,total_xp#588,previous_csr_tier#589,previous_csr_designation#590,previous_csr#591,previous_csr_percent_to_next_tier#592,previous_csr_rank#593,current_csr_tier#594,current_csr_designation#595,current_csr#596,current_csr_percent_to_next_tier#597,current_csr_rank#598,player_rank_on_team#599,player_finished#600,player_average_life#601,player_total_kills#602,player_total_headshots#603,player_total_weapon_damage#604,player_total_shots_landed#605,player_total_melee_kills#606,... 12 more fields] Batched: false, DataFilters: [isnotnull(match_id#583)], Format: CSV, Location: InMemo

bucketedValues: Unit = ()
