In [1]:
# findspark.init() runs before pyspark is imported; presumably it locates the
# local Spark installation and makes pyspark importable — confirm against the
# environment this notebook is run in.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse the already-active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [85]:
from pyspark.sql.functions import spark_partition_id, explode

## Tip #1

In [56]:
# Left side of the join: 2,000,000 rows whose key, (3*id) % 5, is computed
# modulo 5 and therefore repeats heavily.
df1 = (
    spark
    .range(2_000_000)
    .selectExpr('(3*id) % 5 as column_1')
)

In [57]:
# Right side of the join: 2,000,000 rows, the range id exposed as column_2.
df2 = (
    spark
    .range(2_000_000)
    .selectExpr('id as column_2')
)

In [58]:
# Equi-join of the two frames on column_1 == column_2 (default join type).
df = df1.join(
    df2,
    on=(df1["column_1"] == df2["column_2"]),
)

In [59]:
# Peek at the first three joined rows.
df.show(n=3)

+--------+--------+
|column_1|column_2|
+--------+--------+
|       0|       0|
|       0|       0|
|       0|       0|
+--------+--------+
only showing top 3 rows



In [60]:
# Number of rows produced by the join; the value is displayed as the cell output.
df.count()

2000000

In [61]:
# Print only the physical plan of the join (non-extended output).
df.explain(extended=False)

== Physical Plan ==
*(5) SortMergeJoin [column_1#480L], [column_2#484L], Inner
:- *(2) Sort [column_1#480L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(column_1#480L, 200), true, [id=#1820]
:     +- *(1) Project [((3 * id#478L) % 5) AS column_1#480L]
:        +- *(1) Filter isnotnull(((3 * id#478L) % 5))
:           +- *(1) Range (0, 2000000, step=1, splits=4)
+- *(4) Sort [column_2#484L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(column_2#484L, 200), true, [id=#1826]
      +- *(3) Project [id#482L AS column_2#484L]
         +- *(3) Range (0, 2000000, step=1, splits=4)




## Tip #4

In [123]:
# Override the number of shuffle partitions for this session
# (the earlier join plan shows hashpartitioning(..., 200)).
spark.conf.set(
    'spark.sql.shuffle.partitions',
    123,
)

In [124]:
# Read the setting back to confirm it; the value comes back as a string.
spark.conf.get('spark.sql.shuffle.partitions')

'123'

In [126]:
# Rebuild the same join so that its plan is produced under the current
# spark.sql.shuffle.partitions setting.
df = df1.join(
    df2,
    on=(df1["column_1"] == df2["column_2"]),
)

**The number of shuffle partitions is now set to 123!**

In [127]:
# Print only the physical plan of the rebuilt join (non-extended output).
df.explain(extended=False)

== Physical Plan ==
*(5) SortMergeJoin [column_1#480L], [column_2#484L], Inner
:- *(2) Sort [column_1#480L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(column_1#480L, 123), true, [id=#2448]
:     +- *(1) Project [((3 * id#478L) % 5) AS column_1#480L]
:        +- *(1) Filter isnotnull(((3 * id#478L) % 5))
:           +- *(1) Range (0, 2000000, step=1, splits=4)
+- *(4) Sort [column_2#484L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(column_2#484L, 123), true, [id=#2454]
      +- *(3) Project [id#482L AS column_2#484L]
         +- *(3) Range (0, 2000000, step=1, splits=4)




## Tip #5

In [89]:
# Number of distinct salt values (0 .. bins-1) used by the salting cells below.
bins = 100

In [98]:
# Build the SQL literal "array(0, 1, ..., bins-1)" and explode it, so that every
# df1 row is repeated once per salt value; then expose the result to Spark SQL.
array = f"array({', '.join(map(str, range(bins)))})"
df11 = df1.selectExpr(
    '*',
    f'explode({array}) as salt',
)
df11.createOrReplaceTempView('df11')

In [95]:
# Display the first 20 rows of the exploded frame (20 is show()'s default).
df11.show(n=20)

+--------+----+
|column_1|salt|
+--------+----+
|       0|   0|
|       0|   1|
|       0|   2|
|       0|   3|
|       0|   4|
|       0|   5|
|       0|   6|
|       0|   7|
|       0|   8|
|       0|   9|
|       0|  10|
|       0|  11|
|       0|  12|
|       0|  13|
|       0|  14|
|       0|  15|
|       0|  16|
|       0|  17|
|       0|  18|
|       0|  19|
+--------+----+
only showing top 20 rows



In [100]:
# Append a random salt in [0, bins) to every df2 key, producing strings of the
# form "<column_2>_<salt>", and expose the result to Spark SQL as `df22`.
#
# The random generator is given an explicit seed so that re-running the notebook
# yields the same salts (for an unchanged partitioning of df2); without a seed
# Spark picks a new one on every run.
salt_seed = 42
salt = f"cast(random({salt_seed})*{bins} as int)"
df22 = df2.selectExpr(f"concat_ws('_', column_2, {salt}) as salted_column_2")
df22.createOrReplaceTempView('df22')

In [101]:
# Salted join: match df22's "<column_2>_<salt>" key against the same string
# rebuilt from df11's column_1 and its exploded salt value.
#
# NOTE(review): df11 is the exploded copy of df1, whose key is (3*id) % 5 and
# therefore heavily repeated, while df22 carries the random salt on df2's
# unique ids. Salting is usually applied the other way round (random salt on
# the skewed side, explode on the other side) — confirm this is intended.
df0 = spark.sql("""
    select *
    from df22 a 
    join df11 b
    on a.salted_column_2 = concat_ws('_', b.column_1, b.salt)
""")

In [102]:
# Print only the physical plan of the salted join (non-extended output).
df0.explain(extended=False)

== Physical Plan ==
*(5) SortMergeJoin [salted_column_2#643], [concat_ws(_, cast(column_1#480L as string), cast(salt#636 as string))], Inner
:- *(2) Sort [salted_column_2#643 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(salted_column_2#643, 200), true, [id=#2166]
:     +- *(1) Project [concat_ws(_, cast(id#482L as string), cast(cast((random(2636194755330797495) * 100.0) as int) as string)) AS salted_column_2#643]
:        +- *(1) Range (0, 2000000, step=1, splits=4)
+- *(4) Sort [concat_ws(_, cast(column_1#480L as string), cast(salt#636 as string)) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(concat_ws(_, cast(column_1#480L as string), cast(salt#636 as string)), 200), true, [id=#2173]
      +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,