In [2]:
import pyspark
from pyspark.sql import SparkSession

sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [3]:
users = spark.read.format('csv').option('header', 'true').load('users.csv')

In [4]:
orders = spark.read.format('csv').option('header', 'true').load('orders.csv')

In [6]:
joinedDF = orders.join(users, users.uid == orders.user_id)
joinedDF.show()

+--------+---+----------+-------+---+---------+----------+
|order_id|qty|product_id|user_id|uid|user_name|user_state|
+--------+---+----------+-------+---+---------+----------+
|       0| 44|         4|     62| 62|  user_62|        MI|
|       1| 61|         0|      5|  5|   user_5|        NY|
|       2| 40|         4|     13| 13|  user_13|        CO|
|       3| 98|         2|     31| 31|  user_31|        CA|
|       4| 98|         3|      9|  9|   user_9|        MI|
|       5| 91|         4|     28| 28|  user_28|        CA|
|       6| 18|         4|     28| 28|  user_28|        CA|
|       7| 69|         3|     69| 69|  user_69|        WA|
|       8| 66|         4|     78| 78|  user_78|        CO|
|       9| 57|         1|      7|  7|   user_7|        WA|
|      10| 79|         4|     32| 32|  user_32|        CO|
|      11| 46|         2|     22| 22|  user_22|        AZ|
|      12| 38|         2|     38| 38|  user_38|        NY|
|      13| 94|         1|     83| 83|  user_83|        N

In [7]:
joinedDF.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [user_id#41], [uid#16], Inner, BuildRight
:- *(2) Project [order_id#38, qty#39, product_id#40, user_id#41]
:  +- *(2) Filter isnotnull(user_id#41)
:     +- FileScan csv [order_id#38,qty#39,product_id#40,user_id#41] Batched: false, DataFilters: [isnotnull(user_id#41)], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/06_joins/orders.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<order_id:string,qty:string,product_id:string,user_id:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true])), [id=#103]
   +- *(1) Project [uid#16, user_name#17, user_state#18]
      +- *(1) Filter isnotnull(uid#16)
         +- FileScan csv [uid#16,user_name#17,user_state#18] Batched: false, DataFilters: [isnotnull(uid#16)], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/06_joins/users.csv], PartitionFilter

In [10]:
total_users = 100000

states = ['WA', 'CA', 'NY', 'MI', 'AZ', 'CO']
products = ['ITEM1', 'ITEM2', 'ITEM3', 'ITEM4', 'ITEM5']

import random
users = sc.parallelize(range(total_users)) \
    .map(lambda id: (id, 'user_' + str(id), states[random.randrange(len(states))])) \
    .toDF(['uid', 'user_name', 'user_state'])

users.write.format('csv').mode('overwrite').option('header', 'true').save('user_big')

In [11]:
joinedDF = orders.join(users, users.uid == orders.user_id)
joinedDF.show()

+--------+---+----------+-------+---+---------+----------+
|order_id|qty|product_id|user_id|uid|user_name|user_state|
+--------+---+----------+-------+---+---------+----------+
|      90| 55|         1|     26| 26|  user_26|        CA|
|     107| 65|         2|     26| 26|  user_26|        CA|
|     498| 74|         0|     26| 26|  user_26|        CA|
|     536| 38|         4|     26| 26|  user_26|        CA|
|    1015| 88|         4|     26| 26|  user_26|        CA|
|    1060| 61|         2|     26| 26|  user_26|        CA|
|    1300| 82|         1|     26| 26|  user_26|        CA|
|    1395| 92|         2|     26| 26|  user_26|        CA|
|    1789| 54|         2|     26| 26|  user_26|        CA|
|    1919| 67|         3|     26| 26|  user_26|        CA|
|    2127| 41|         2|     26| 26|  user_26|        CA|
|    2251| 21|         4|     26| 26|  user_26|        CA|
|    2476| 74|         1|     26| 26|  user_26|        CA|
|    2491|  3|         3|     26| 26|  user_26|        C

In [14]:
joinedDF.explain()

== Physical Plan ==
*(5) SortMergeJoin [cast(user_id#41 as bigint)], [uid#190L], Inner
:- *(2) Sort [cast(user_id#41 as bigint) ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(cast(user_id#41 as bigint), 200), true, [id=#300]
:     +- *(1) Project [order_id#38, qty#39, product_id#40, user_id#41]
:        +- *(1) Filter isnotnull(user_id#41)
:           +- FileScan csv [order_id#38,qty#39,product_id#40,user_id#41] Batched: false, DataFilters: [isnotnull(user_id#41)], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/06_joins/orders.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<order_id:string,qty:string,product_id:string,user_id:string>
+- *(4) Sort [uid#190L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(uid#190L, 200), true, [id=#306]
      +- *(3) Filter isnotnull(uid#190L)
         +- *(3) Scan ExistingRDD[uid#190L,user_name#191,user_state#192]




In [25]:
# How to speed up?
from pyspark.sql.functions import asc

users.orderBy(asc('uid')) \
  .write.format('csv') \
  .bucketBy(6, 'uid') \
  .mode('overwrite') \
  .saveAsTable('user_sorted')



In [26]:
orders.orderBy(asc('user_id')) \
  .write.format('csv') \
  .bucketBy(6, 'user_id') \
  .mode('overwrite') \
  .saveAsTable('orders_sorted')

In [28]:
spark.sql('cache table user_sorted')
spark.sql('cache table orders_sorted')
users_sorted = spark.table('user_sorted')
orders_sorted = spark.table('orders_sorted')

In [29]:
joinedDF = orders_sorted.join(users_sorted, users_sorted.uid == orders_sorted.user_id)
joinedDF.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [cast(user_id#930 as bigint)], [uid#788L], Inner, BuildRight
:- *(2) Filter isnotnull(user_id#930)
:  +- Scan In-memory table `default`.`orders_sorted` [order_id#927, qty#928, product_id#929, user_id#930], [isnotnull(user_id#930)]
:        +- InMemoryRelation [order_id#927, qty#928, product_id#929, user_id#930], StorageLevel(disk, memory, deserialized, 1 replicas)
:              +- FileScan csv default.orders_sorted[order_id#927,qty#928,product_id#929,user_id#930] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/06_joins/spark-warehous..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<order_id:string,qty:string,product_id:string,user_id:string>, SelectedBucketsCount: 6 out of 6
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#690]
   +- *(1) Filter isnotnull(uid#788L)
      +- Scan In-memory table `default`.`user_