## Salting

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
# Inspect the session's current shuffle-partition setting before changing it.
spark.conf.get(key='spark.sql.shuffle.partitions')

Out[111]: '4'

In [0]:
# Use a small, fixed number of shuffle partitions so per-partition row counts
# are easy to read in the outputs below.
spark.conf.set(key='spark.sql.shuffle.partitions', value=4)
# Turn adaptive query execution off.
# NOTE(review): presumably so AQE's runtime re-optimization (e.g. skew-join
# handling / partition coalescing) does not mask the skew being demonstrated.
spark.conf.set(key='spark.sql.adaptive.enabled', value=False)
# Echo the setting back to confirm it took effect.
spark.conf.get(key='spark.sql.shuffle.partitions')

Out[112]: '4'


## Skewed join

In [0]:
# Uniform side of the join: one row per integer 0..999999 in a single
# integer column (named `value` by Spark for an atomic schema).
df_uniform = spark.createDataFrame(
    list(range(1_000_000)),
    schema=IntegerType(),
)
df_uniform.show(n=8)

+-----+
|value|
+-----+
|    0|
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
+-----+
only showing top 8 rows



In [0]:
# Row count per physical partition of the uniform DataFrame:
# tag each row with the id of the partition it lives in, then count per id.
(df_uniform
 .select('*', F.spark_partition_id().alias('part_id'))
 .groupBy('part_id')
 .agg(F.count('*').alias('count'))
 .show())

+-------+------+
|part_id| count|
+-------+------+
|      6|124928|
|      2|124928|
|      4|124928|
|      5|124928|
|      0|124928|
|      1|124928|
|      3|124928|
|      7|125504|
+-------+------+



In [0]:
# Skewed side of the join: almost every row carries the key 0, with only a
# handful of rows for keys 1, 2 and 3. Each per-key DataFrame is squeezed into
# a single partition before being combined.
df0 = spark.createDataFrame([0] * 999_990, schema=IntegerType()).repartition(numPartitions=1)
df1 = spark.createDataFrame([1] * 15, schema=IntegerType()).repartition(numPartitions=1)
df2 = spark.createDataFrame([2] * 10, schema=IntegerType()).repartition(numPartitions=1)
df3 = spark.createDataFrame([3] * 5, schema=IntegerType()).repartition(numPartitions=1)

# Stack the four pieces into one DataFrame.
df_skew = (
    df0
    .union(df1)
    .union(df2)
    .union(df3)
)

In [0]:
# Peek at the first rows of the skewed DataFrame.
df_skew.show(n=5)

+-----+
|value|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [0]:
# Row count per physical partition of the skewed DataFrame:
# tag each row with the id of the partition it lives in, then count per id.
(df_skew
 .select('*', F.spark_partition_id().alias('part_id'))
 .groupBy('part_id')
 .agg(F.count('*').alias('count'))
 .show())

+-------+------+
|part_id| count|
+-------+------+
|      2|    10|
|      0|999990|
|      1|    15|
|      3|     5|
+-------+------+



In [0]:
# Plain (unsalted) inner join of the uniform and skewed DataFrames on `value`.
df_joined1 = df_uniform.join(df_skew, on="value", how='inner')

In [0]:
# Peek at the first rows of the unsalted join result.
df_joined1.show(n=5)

+-----+
|value|
+-----+
|    2|
|    2|
|    2|
|    2|
|    2|
+-----+
only showing top 5 rows



In [0]:
# Row count per partition of the unsalted join output, to show how the join
# result is distributed across partitions.
(df_joined1
 .select('*', F.spark_partition_id().alias('part_id'))
 .groupBy('part_id')
 .agg(F.count('*').alias('count'))
 .show())

+-------+-------+
|part_id|  count|
+-------+-------+
|      2|     10|
|      3|1000010|
+-------+-------+



In [0]:
# Number of salt buckets, taken from the configured shuffle-partition count.
# Rows of the skewed dataset are later assigned a random integer salt in
# the half-open range [0, salt_num).
salt_num = int(spark.conf.get(key='spark.sql.shuffle.partitions'))
salt_num

Out[121]: 4

In [0]:
# Tag every row of the skewed side with a random integer salt in [0, salt_num):
# rand() is uniform on [0, 1), so scaling by salt_num and truncating to int
# yields one of 0 .. salt_num - 1.
# A fixed seed is passed so that, for the same data and partitioning, the salt
# assignment (and every partition count derived from it) is repeatable across
# notebook runs instead of changing on each execution.
df_skew = df_skew.withColumn(
    'salt',
    (F.rand(seed=42) * salt_num).cast('int'),
)

In [0]:
# Peek at the skewed DataFrame with its new `salt` column.
df_skew.show(n=8)

+-----+----+
|value|salt|
+-----+----+
|    0|   1|
|    0|   0|
|    0|   0|
|    0|   3|
|    0|   0|
|    0|   2|
|    0|   1|
|    0|   0|
+-----+----+
only showing top 8 rows



In [0]:
# Replicate every row of the uniform dataset once per salt value: attach the
# array [0, 1, ..., salt_num - 1] to each row, then explode it so each original
# row becomes salt_num rows, one per possible salt.
#
# The guard makes this cell safe to re-run: without it, executing the cell a
# second time would explode the already-exploded rows again (salt_num times more
# rows on every run) and the salted join would emit duplicate matches.
# If salt_num changes, re-create df_uniform before re-running this cell.
if 'salt' not in df_uniform.columns:
    df_uniform = (
        df_uniform
        .withColumn('saltarray', F.array([F.lit(i) for i in range(salt_num)]))
        .withColumn('salt', F.explode(F.col('saltarray')))
    )


In [0]:
df_uniform.show(5)

+-----+------------+----+
|value|   saltarray|salt|
+-----+------------+----+
|    0|[0, 1, 2, 3]|   0|
|    0|[0, 1, 2, 3]|   1|
|    0|[0, 1, 2, 3]|   2|
|    0|[0, 1, 2, 3]|   3|
|    1|[0, 1, 2, 3]|   0|
+-----+------------+----+
only showing top 5 rows



In [0]:
# Salted inner join: match on the original key and the salt, so rows sharing
# one `value` are split across several (value, salt) join keys.
df_joined2 = df_uniform.join(df_skew, on=["value", "salt"], how='inner')

In [0]:
# Row count per partition of the salted join output, for comparison with the
# unsalted join.
# NOTE(review): the recorded output still shows nearly all rows in two
# partitions. Presumably the few distinct (value, salt) keys for the hot value
# hash-collide into the same shuffle partitions; a salt count larger than the
# shuffle-partition count may spread them better -- confirm by experiment.
(df_joined2
 .select('*', F.spark_partition_id().alias('part_id'))
 .groupBy('part_id')
 .agg(F.count('*').alias('count'))
 .show())

+-------+------+
|part_id| count|
+-------+------+
|      2|500391|
|      0|     3|
|      1|499620|
|      3|     6|
+-------+------+

