## [Storage Partitioned Join, a.k.a Shuffle-less join in Apache Spark](https://www.guptaakashdeep.com/storage-partition-join-in-apache-spark-why-how-and-where/)

In [None]:
import findspark
# Initialise findspark before any pyspark import in the cells below.
findspark.init()
# Last expression of the cell: displays the value returned by findspark.find().
findspark.find()

In [None]:
from pyspark.sql import SparkSession, Row

# Versions and catalog name -- adjust these to match your environment.
SPARK_VERSION = "3.5"
ICEBERG_VERSION = "1.5.0"
CATALOG_NAME = "local"
# Local directory under which the Iceberg tables will be created.
DW_PATH = "/path/to/local/warehouse"

# Build (or reuse) a 4-core local session with an Iceberg Hadoop catalog named
# CATALOG_NAME whose warehouse lives at DW_PATH.
# NOTE(review): spark-avro is pinned to 3.5.0 and the Scala suffix to 2.12
# independently of SPARK_VERSION -- keep them in sync when changing versions.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("spj-iceberg")
    .config("spark.sql.adaptive.enabled", "true")
    .config(
        'spark.jars.packages',
        f'org.apache.iceberg:iceberg-spark-runtime-{SPARK_VERSION}_2.12:{ICEBERG_VERSION},org.apache.spark:spark-avro_2.12:3.5.0',
    )
    .config(
        'spark.sql.extensions',
        'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    )
    .config(f'spark.sql.catalog.{CATALOG_NAME}', 'org.apache.iceberg.spark.SparkCatalog')
    .config(f'spark.sql.catalog.{CATALOG_NAME}.type', 'hadoop')
    .config(f'spark.sql.catalog.{CATALOG_NAME}.warehouse', DW_PATH)
    # Broadcast joins are switched off (-1) for the join demos below.
    .config('spark.sql.autoBroadcastJoinThreshold', '-1')
    .enableHiveSupport()
    .getOrCreate()
)

## Preparing Data
- `Customers` table: 
    - partitioned on:  `region`.
    - primary key: `customer_id`.
- `Orders` table:
    - partitioned on `region`.
    - primary key: `order_id`
    - foreign key for Customers: `customer_id`

And some other random details.

In [None]:
# Faker is imported by the data-generation cell below.
%pip install faker

In [None]:
from faker import Faker
import random


# Initialize Faker
fake = Faker()
# Seed BOTH random sources so the generated data is reproducible:
# Faker.seed() only seeds Faker's own generator, while the region, amount and
# customer picks in the functions below use the stdlib `random` module.
Faker.seed(42)
random.seed(42)

# Generate customer data
def generate_customer_data(num_customers=1000):
    """Return a Spark DataFrame holding `num_customers` synthetic customers.

    Columns: `customer_id` (Faker-unique number of up to 6 digits),
    `customer_name`, `region` (one of North/South/East/West), `signup_date`
    (a datetime within the last three years) and `signup_year`.
    """
    region_choices = ['North', 'South', 'East', 'West']

    def _make_customer():
        # Draw order is date -> id -> name -> region, one set of draws per row.
        joined_at = fake.date_time_between(start_date='-3y', end_date='now')
        return Row(
            customer_id=fake.unique.random_number(digits=6),
            customer_name=fake.name(),
            region=random.choice(region_choices),
            signup_date=joined_at,
            # Additional column for partition evolution
            signup_year=joined_at.year,
        )

    return spark.createDataFrame([_make_customer() for _ in range(num_customers)])

# Generate order data
def generate_order_data(customer_df, num_orders=5000):
    """Return a Spark DataFrame holding `num_orders` synthetic orders.

    Every order references a `customer_id` collected from `customer_df`.
    Columns: `order_id` (Faker-unique number of up to 8 digits), `customer_id`,
    `order_date` (a datetime within the last three years), `amount` (10 to 1000,
    rounded to 2 decimals), `region` and `order_year`.

    Note: `region` is drawn independently of the chosen customer, so an
    order's region can differ from its customer's region.
    """
    region_choices = ['North', 'South', 'East', 'West']
    # Bring the ids to the driver once; each order samples from this list.
    known_ids = [rec.customer_id for rec in customer_df.select('customer_id').collect()]

    def _make_order():
        # Draw order is date -> id -> customer -> amount -> region, one set per row.
        placed_at = fake.date_time_between(start_date='-3y', end_date='now')
        return Row(
            order_id=fake.unique.random_number(digits=8),
            customer_id=random.choice(known_ids),
            order_date=placed_at,
            amount=round(random.uniform(10, 1000), 2),
            region=random.choice(region_choices),
            # Additional column for partition evolution
            order_year=placed_at.year,
        )

    return spark.createDataFrame([_make_order() for _ in range(num_orders)])

# Generate the data: 1,000 customers and 50,000 orders that reference them.
print("Generating sample data...")
customer_df = generate_customer_data(1000)
order_df = generate_order_data(customer_df, 50000)

# Preview the first five rows of each DataFrame without truncating values.
customer_df.show(5, truncate=False)
order_df.show(5, truncate=False)



In [None]:
# Drop the tables if they already exist so the notebook can be re-run from the top.
# The names are built from CATALOG_NAME so this cell keeps working when the
# catalog is renamed in the session-setup cell.
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.customers")
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.orders")

### Writing data to create `Customers` and `Orders` Iceberg Table

In [None]:
# Initially create Iceberg tables with region partitioning only.
# Table names are derived from CATALOG_NAME rather than a hard-coded "local".
print("\nCreating initial Iceberg tables with region partitioning...")
(
    customer_df.writeTo(f"{CATALOG_NAME}.db.customers")
    .tableProperty("format-version", "2")
    .partitionedBy("region")
    .create()
)

(
    order_df.writeTo(f"{CATALOG_NAME}.db.orders")
    .tableProperty("format-version", "2")
    .partitionedBy("region")
    .create()
)

In [None]:
# Show the CREATE TABLE statement of both tables to confirm the partition spec.
# Table names are derived from CATALOG_NAME rather than a hard-coded "local".
print("\nCustomer Table Structure:")
spark.sql(f"show create TABLE {CATALOG_NAME}.db.customers").show(truncate=False)

print("\nOrder Table Structure:")
spark.sql(f"show create TABLE {CATALOG_NAME}.db.orders").show(truncate=False)

# Storage Partitioned Join
- Avoids any shuffle during Joins based on Storage layout and partition information shared by DS v2 sources like Iceberg Tables.

### All SPJ configurations to handle all the cases:
- The version that introduced each setting is noted in the comment at the end of its line.

```python
spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true') # Spark 3.3
spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true') # Spark 3.3
spark.conf.set('spark.sql.sources.v2.bucketing.pushPartValues.enabled','true') # Spark 3.4
spark.conf.set('spark.sql.requireAllClusterKeysForCoPartition','false') # Spark 3.4
spark.conf.set('spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled','true') # Spark 3.4
```

In [None]:
from pyspark.sql.functions import col

# Fully qualified names of the two region-partitioned tables.
CUSTOMERS_TABLE = f'{CATALOG_NAME}.db.customers'
ORDERS_TABLE = f'{CATALOG_NAME}.db.orders'

# Reading from table
# Note: `order_df` is rebound here from the generated DataFrame to the Iceberg table.
cust_df = spark.table(CUSTOMERS_TABLE)
order_df = spark.table(ORDERS_TABLE)

In [None]:
# Joining the DataFrames only on the partition column -- SPJ not enabled yet
joined_df = cust_df.join(order_df, on="region", how="left")
joined_df.explain("FORMATTED")

# An Exchange node is visible in the plan.

## Scenario 1: Join Keys same as Partition Keys

In [None]:
# Enable the minimal configuration for SPJ, then execute the same join again.
spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true')
spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true')

joined_df = cust_df.join(order_df, on="region", how="left")
joined_df.explain("FORMATTED")

# No Exchange node in the plan

## Scenario 2: Partitions on the two sides don't match

In [None]:
# Delete all 'West' orders so that the Orders table is missing a partition
# value that the Customers table still has.
spark.sql(f"DELETE FROM {ORDERS_TABLE} where region='West'").show()

In [None]:
# Re-read the Orders table and count rows per region to confirm that the
# 'West' partition data is gone.
order_df = spark.table(ORDERS_TABLE)
order_df.groupBy("region").count().show()

In [None]:
# Execute the join now that the two sides have a different set of partition values.
joined_df = cust_df.join(order_df, on=['region'], how='left')
joined_df.explain("FORMATTED")

# The Exchange node is back!

In [None]:
# Execute the join with pushPartValues.enabled on top of the minimal configuration.
# The two minimal settings were already set in Scenario 1; shown here for reference:
# spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true')
# spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true')

spark.conf.set('spark.sql.sources.v2.bucketing.pushPartValues.enabled','true')
joined_df = cust_df.join(order_df, on='region', how='left')
joined_df.explain("FORMATTED")

# SPJ works after enabling pushPartValues -- no Exchange node

## Scenario 3: Join Keys do not match the Partition Keys
- 3.1 : Join Keys are superset of Partition Keys
- 3.2 : Join Keys are subset of Partition Keys

### Scenario 3.1: Join Keys are superset of Partition Keys
- Joining on `region`, which is the partition key, and `customer_id`, which is not a partition key.

In [None]:
# Add one more join column (customer_id) besides the partition column (region).
joined_df = cust_df.join(order_df, on=['region','customer_id'], how='left')
joined_df.explain("FORMATTED")

# The Exchange node is back again!

In [None]:
# Set one more config so SPJ also applies when the join keys are a superset
# of the partition keys, then repeat the join.
spark.conf.set('spark.sql.requireAllClusterKeysForCoPartition','false')
joined_df = cust_df.join(order_df, on=['region','customer_id'], how='left')
joined_df.explain("FORMATTED")

# SPJ works once requireAllClusterKeysForCoPartition is set to false.

### Scenario 3.2: Join Keys are subset of Partition Keys -- **SPJ DOESN'T work in `Spark < 4.0`**
In Spark < 4.0, SPJ only works if all the partition keys are part of the join condition.

- Different bucketing
    - `Customers` partitioned by (region, bucket(2, customer_id))
    - `Orders` partitioned by (region, bucket(4, customer_id))

- Same bucketing
    - `Customers` partitioned by (region, bucket(4, customer_id))
    - `Orders` partitioned by (region, bucket(4, customer_id))

- Hidden Partitions with same bucketing
    - `Customers` partitioned by (region, year(signup_date), bucket(4, customer_id))
    - `Orders` partitioned by (region, year(order_date), bucket(4, customer_id))

In [66]:
# Drop the bucketed tables if they exist so this section can be re-run.
# Names are derived from CATALOG_NAME rather than a hard-coded "local".
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.customers_buck")
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.orders_buck")

DataFrame[]

#### Hidden Partitioning with same bucketing

In [None]:
# Customers table: partitioned by region, year(signup_date) and a 4-way bucket
# on customer_id. The catalog comes from CATALOG_NAME (the f-string previously
# had no placeholder and hard-coded "local").
spark.sql(f"""CREATE TABLE {CATALOG_NAME}.db.customers_buck (
          customer_id BIGINT,
          customer_name STRING,
          region STRING,
          signup_date TIMESTAMP
          )
          USING iceberg
          PARTITIONED BY (region, year(signup_date), bucket(4, customer_id))
          TBLPROPERTIES (
            'format' = 'iceberg/parquet',
            'format-version' = '2',
            'write.parquet.compression-codec' = 'zstd'
          )
""")

# Orders table: partitioned by region, year(order_date) and the same 4-way
# bucket on customer_id.
spark.sql(f"""CREATE TABLE {CATALOG_NAME}.db.orders_buck (
            order_id BIGINT,
            customer_id BIGINT,
            order_date TIMESTAMP,
            amount DOUBLE,
            region STRING
          )
          USING iceberg
          PARTITIONED BY (region, year(order_date), bucket(4, customer_id))
          TBLPROPERTIES (
            'format' = 'iceberg/parquet',
            'format-version' = '2',
            'write.parquet.compression-codec' = 'zstd'
          )
""")

DataFrame[]

In [68]:
# Fully qualified names of the bucketed tables, derived from CATALOG_NAME.
CUSTOMERS_BUCK_TABLE = f'{CATALOG_NAME}.db.customers_buck'
ORDERS_BUCK_TABLE = f'{CATALOG_NAME}.db.orders_buck'
# Sanity check: the freshly created customers table exists and is still empty.
spark.table(CUSTOMERS_BUCK_TABLE).show(truncate=False)

+-----------+-------------+------+-----------+
|customer_id|customer_name|region|signup_date|
+-----------+-------------+------+-----------+
+-----------+-------------+------+-----------+



In [69]:
# Generate a larger data set: 10,000 customers and 500,000 orders.
customer_df = generate_customer_data(10000)
order_df = generate_order_data(customer_df, 500000)
# Select exactly the table's columns, in the table's order, before appending;
# generated columns that the target table does not declare are left out.
cust_col_order = spark.table(CUSTOMERS_BUCK_TABLE).columns
customer_df.select(*cust_col_order).writeTo(CUSTOMERS_BUCK_TABLE).append()
orders_col_order = spark.table(ORDERS_BUCK_TABLE).columns
order_df.select(*orders_col_order).writeTo(ORDERS_BUCK_TABLE).append()

24/11/27 18:23:12 WARN TaskSetManager: Stage 32 contains a task of very large size (4390 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [74]:
# Set all SPJ configs available in Spark 3.4.0
# (the same five settings listed in the "All SPJ configurations" section).
spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true')
spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true')
spark.conf.set('spark.sql.sources.v2.bucketing.pushPartValues.enabled','true')
spark.conf.set('spark.sql.requireAllClusterKeysForCoPartition','false')
spark.conf.set('spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled', 'true')

In [None]:
# Read the bucketed tables back.
cust_buck_df = spark.table(CUSTOMERS_BUCK_TABLE)
order_buck_df = spark.table(ORDERS_BUCK_TABLE)

# Join on the source column of every partition transform: the date columns
# (compared by exact timestamp equality), customer_id and region.
# The frames are aliased because customer_id and region exist on both sides.
cust_buck_df.alias("cust").join(order_buck_df.alias("order"), 
                                on=[col('signup_date') == col('order_date'),
                                    col("cust.customer_id") == col("order.customer_id"),
                                    col("cust.region") == col("order.region")
                                    ], how='left').explain("FORMATTED")

# SPJ works when all the partition columns are present in the join condition.

In [None]:
# Set all SPJ configs available in Spark 3.4.0 (repeated so that this cell
# does not depend on the earlier configuration cell having run).
spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true')
spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true')
spark.conf.set('spark.sql.sources.v2.bucketing.pushPartValues.enabled','true')
spark.conf.set('spark.sql.requireAllClusterKeysForCoPartition','false')
spark.conf.set('spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled', 'true')

# Join on region and customer_id only -- the date columns are left out.
cust_buck_df.join(order_buck_df, on=['region', 'customer_id'], how='left').explain("FORMATTED")

# SPJ DOESN'T work for Spark < 4.0 when not all partition columns are in the join condition.
# Exchange nodes are present in the plan.

#### Different bucketing
- Creating 2 tables:
    - `Customers` partitioned by (region, bucket(2, customer_id))
    - `Orders` partitioned by (region, bucket(4, customer_id))

In [None]:
# Drop the bucketed tables so they can be recreated with different bucket counts.
# Names are derived from CATALOG_NAME rather than a hard-coded "local".
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.customers_buck")
spark.sql(f"DROP TABLE IF EXISTS {CATALOG_NAME}.db.orders_buck")

In [None]:
# Customers table: partitioned by region and a 2-way bucket on customer_id.
# The catalog comes from CATALOG_NAME (the f-string previously had no
# placeholder and hard-coded "local").
spark.sql(f"""CREATE TABLE {CATALOG_NAME}.db.customers_buck (
          customer_id BIGINT,
          customer_name STRING,
          region STRING,
          signup_date TIMESTAMP,
          signup_year BIGINT
          )
          USING iceberg
          PARTITIONED BY (region, bucket(2, customer_id))
          TBLPROPERTIES (
            'format' = 'iceberg/parquet',
            'format-version' = '2',
            'write.parquet.compression-codec' = 'zstd'
          )
""")

# Orders table: partitioned by region and a 4-way bucket on customer_id,
# i.e. a different bucket count than the customers table.
spark.sql(f"""CREATE TABLE {CATALOG_NAME}.db.orders_buck (
            order_id BIGINT,
            customer_id BIGINT,
            order_date TIMESTAMP,
            amount DOUBLE,
            region STRING,
            order_year BIGINT)
          USING iceberg
          PARTITIONED BY (region, bucket(4, customer_id))
          TBLPROPERTIES (
            'format' = 'iceberg/parquet',
            'format-version' = '2',
            'write.parquet.compression-codec' = 'zstd'
          )
""")

DataFrame[]

In [None]:
# Generate 10,000 customers and 500,000 orders for the re-created tables.
customer_df = generate_customer_data(10000)
order_df = generate_order_data(customer_df, 500000)

# Fully qualified table names, derived from CATALOG_NAME rather than a
# hard-coded "local".
CUSTOMERS_BUCK_TABLE = f'{CATALOG_NAME}.db.customers_buck'
ORDERS_BUCK_TABLE = f'{CATALOG_NAME}.db.orders_buck'
# Select the table's columns in the table's order before appending.
cust_col_order = spark.table(CUSTOMERS_BUCK_TABLE).columns
customer_df.select(*cust_col_order).writeTo(CUSTOMERS_BUCK_TABLE).append()
orders_col_order = spark.table(ORDERS_BUCK_TABLE).columns
order_df.select(*orders_col_order).writeTo(ORDERS_BUCK_TABLE).append()

In [None]:
# Read the re-created bucketed tables back.
cust_buck_df = spark.table(CUSTOMERS_BUCK_TABLE)
order_buck_df = spark.table(ORDERS_BUCK_TABLE)

In [None]:
# Set all SPJ configs available in Spark 3.4.0
spark.conf.set('spark.sql.sources.v2.bucketing.enabled','true')
spark.conf.set('spark.sql.iceberg.planning.preserve-data-grouping','true')
spark.conf.set('spark.sql.sources.v2.bucketing.pushPartValues.enabled','true')
spark.conf.set('spark.sql.requireAllClusterKeysForCoPartition','false')
spark.conf.set('spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled', 'true')

# Join on region and customer_id: customers use bucket(2, customer_id),
# orders use bucket(4, customer_id).
joined_buck_df = cust_buck_df.join(order_buck_df, on=['region', 'customer_id'], how='left')
joined_buck_df.explain("FORMATTED")

# Doesn't work -- Exchange nodes are still present. Needs to be retested with Spark 4.0.
# Spark 4.0 provides an additional config for joins on a subset of the partition keys:
# spark.conf.set('spark.sql.sources.v2.bucketing.allowJoinKeysSubsetOfPartitionKeys.enabled', 'true')
# NOTE(review): this join differs only in bucket count (2 vs 4), so Spark 4.0's
# 'spark.sql.sources.v2.bucketing.allowCompatibleTransforms.enabled' may be the
# relevant setting here -- confirm against the Spark 4.0 configuration docs.