# WAP Implementation with Iceberg version > 1.2

- Iceberg versions after 1.2 support `branching` and `tagging` of snapshots, which makes it easier to implement the WAP (Write-Audit-Publish) framework.

In [None]:
!pip install pyspark==3.5
!pip install findspark

In [1]:
import findspark
# Initialise findspark with no explicit Spark home, so the location is auto-detected
# (presumably this makes the installed pyspark importable — confirm against the findspark docs).
findspark.init()
# Last expression of the cell: displays the Spark home path that findspark resolved.
findspark.find()

'/Users/akashdeepgupta/Documents/project-repos/pyspark-playground/venv/lib/python3.8/site-packages/pyspark'

Loading Iceberg jars 
- .config('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2')

Loading Iceberg Extensions to call stored procedures
- .config('spark.sql.extensions','org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')

Configuring Spark's built-in catalog (`spark_catalog`) as an Iceberg catalog of type `hive`, which loads tables from the `Hive Metastore`. This adds Iceberg support to the Spark built-in catalog.
- .config('spark.sql.catalog.spark_catalog','org.apache.iceberg.spark.SparkSessionCatalog')
- .config('spark.sql.catalog.spark_catalog.type','hive')

Creating Iceberg catalog named `local` of type `hadoop`. This supports directory based catalog in HDFS
- .config('spark.sql.catalog.local','org.apache.iceberg.spark.SparkCatalog')
- .config('spark.sql.catalog.local.type','hadoop')
- .config('spark.sql.catalog.local.warehouse','<path_to_warehouse>')

If `type` is `null`, `spark.sql.catalog.<catalog-name>.catalog-impl` **shouldn't** be `null`
    

In [None]:
from pyspark.sql import SparkSession

# Base directory under which the `local` catalog keeps its warehouse folder.
warehouse_directory = "local_path"

# Build (or reuse) a local 4-core session wired up for Iceberg:
#   - Iceberg runtime jar and SQL extensions (needed for branch DDL / stored procedures)
#   - `spark_catalog`: Iceberg session catalog of type hive
#   - `local`: Iceberg hadoop catalog rooted at <warehouse_directory>/warehouse
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("wap-iceberg-branching")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", f"{warehouse_directory}/warehouse")
    .getOrCreate()
)

### Reading and creating an Iceberg table

In [3]:
# Load the NYC Yellow Taxi trip records for September 2023.
sep_2023_path = "../nyc-taxi-trips/yellow/sep-2023/"
yellow_df = spark.read.parquet(sep_2023_path)
# The row count is the cell's output.
yellow_df.count()

2846722

In [10]:
from pyspark.sql.functions import lit

# Tag every row with constant `month` and `year` values; these columns are
# used as the partition columns when the table is created.
yellow_df = (
    yellow_df
    .withColumn("month", lit(9))
    .withColumn("year", lit(2023))
)

In [None]:
# Create the Iceberg table `yellow_taxi_trips` in the `nyc_tlc` database of the
# `local` catalog: partitioned by (year, month), Iceberg format-version 2.
(
    yellow_df.writeTo("local.nyc_tlc.yellow_taxi_trips")
    .partitionedBy("year", "month")
    .using("iceberg")
    .tableProperty("format-version", "2")
    .create()
)

In [25]:
# Fetch the CREATE TABLE statement as a plain string. `SHOW CREATE TABLE` yields a single
# row, so read its `createtab_stmt` column directly from the Row instead of converting the
# result to pandas (pandas is not installed by this notebook's install cell).
table_stmt = spark.sql("show create table local.nyc_tlc.yellow_taxi_trips").first()["createtab_stmt"]

```sql
CREATE TABLE local.nyc_tlc.yellow_taxi_trips (
      VendorID INT,
      tpep_pickup_datetime TIMESTAMP_NTZ,
      tpep_dropoff_datetime TIMESTAMP_NTZ,
      passenger_count BIGINT,
      trip_distance DOUBLE,
      RatecodeID BIGINT,
      store_and_fwd_flag STRING,
      PULocationID INT,
      DOLocationID INT,
      payment_type BIGINT,
      fare_amount DOUBLE,
      extra DOUBLE,
      mta_tax DOUBLE,
      tip_amount DOUBLE,
      tolls_amount DOUBLE,
      improvement_surcharge DOUBLE,
      total_amount DOUBLE,
      congestion_surcharge DOUBLE,
      Airport_fee DOUBLE,
      month INT,
      year INT)
      USING iceberg
      PARTITIONED BY (year, month)
      LOCATION '<warehouse_path>/warehouse/nyc_tlc/yellow_taxi_trips'
      TBLPROPERTIES (
        'current-snapshot-id' = '6410054250659262609',
        'format' = 'iceberg/parquet',
        'format-version' = '2',
        'write.parquet.compression-codec' = 'zstd')
```


In [35]:
# Iceberg metadata tables: `refs`, `history`, `manifests`, `partitions`, `snapshots`, `files`.
# Here we list the table's refs (branches and tags) without truncating columns.
refs_df = spark.sql("select * from local.nyc_tlc.yellow_taxi_trips.refs")
refs_df.show(truncate=False)

+----+------+-------------------+-----------------------+---------------------+----------------------+
|name|type  |snapshot_id        |max_reference_age_in_ms|min_snapshots_to_keep|max_snapshot_age_in_ms|
+----+------+-------------------+-----------------------+---------------------+----------------------+
|main|BRANCH|6410054250659262609|NULL                   |NULL                 |NULL                  |
+----+------+-------------------+-----------------------+---------------------+----------------------+



## WAP implementation

In [None]:
# Fully qualified name of the production table and the name of the staging (audit) branch.
prod_table = "local.nyc_tlc.yellow_taxi_trips"
audit_branch = "audit_102023"  # plain string: there is nothing to interpolate

# Create an audit branch for staging the new data before it reaches the prod table's main branch
spark.sql(f"ALTER TABLE {prod_table} CREATE BRANCH {audit_branch}")

In [37]:
# Check that the new audit branch was created from the same snapshot as the main branch.
# Uses `prod_table` rather than repeating the table name, like the later refs queries.
spark.sql(f"select * from {prod_table}.refs").show(truncate=False)

+------------+------+-------------------+-----------------------+---------------------+----------------------+
|name        |type  |snapshot_id        |max_reference_age_in_ms|min_snapshots_to_keep|max_snapshot_age_in_ms|
+------------+------+-------------------+-----------------------+---------------------+----------------------+
|main        |BRANCH|6410054250659262609|NULL                   |NULL                 |NULL                  |
|audit_102023|BRANCH|6410054250659262609|NULL                   |NULL                 |NULL                  |
+------------+------+-------------------+-----------------------+---------------------+----------------------+



In [41]:
# Enable WAP on the table by setting the `write.wap.enabled` table property.
# The property is set unconditionally; no prior check of its value is performed.
enable_wap_stmt = f"ALTER TABLE {prod_table} SET TBLPROPERTIES ('write.wap.enabled'='true')"
spark.sql(enable_wap_stmt)

23/12/25 18:37:33 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


DataFrame[]

In [42]:
# Confirm that `write.wap.enabled` now appears in the table properties.
# `SHOW CREATE TABLE` yields a single row, so read `createtab_stmt` straight from the Row
# (no pandas round-trip) and use `prod_table` instead of repeating the table name.
spark.sql(f"show create table {prod_table}").first()["createtab_stmt"]

"CREATE TABLE local.nyc_tlc.yellow_taxi_trips (\n  VendorID INT,\n  tpep_pickup_datetime TIMESTAMP_NTZ,\n  tpep_dropoff_datetime TIMESTAMP_NTZ,\n  passenger_count BIGINT,\n  trip_distance DOUBLE,\n  RatecodeID BIGINT,\n  store_and_fwd_flag STRING,\n  PULocationID INT,\n  DOLocationID INT,\n  payment_type BIGINT,\n  fare_amount DOUBLE,\n  extra DOUBLE,\n  mta_tax DOUBLE,\n  tip_amount DOUBLE,\n  tolls_amount DOUBLE,\n  improvement_surcharge DOUBLE,\n  total_amount DOUBLE,\n  congestion_surcharge DOUBLE,\n  Airport_fee DOUBLE,\n  month INT,\n  year INT)\nUSING iceberg\nPARTITIONED BY (year, month)\nLOCATION '/Users/akashdeepgupta/Documents/project-repos/pyspark-playground/warehouse/nyc_tlc/yellow_taxi_trips'\nTBLPROPERTIES (\n  'current-snapshot-id' = '6410054250659262609',\n  'format' = 'iceberg/parquet',\n  'format-version' = '2',\n  'write.parquet.compression-codec' = 'zstd',\n  'write.wap.enabled' = 'true')\n"

In [43]:
# Set the session-level WAP branch to the audit branch so that new data written to the
# table in this session is staged in the audit branch instead of `main`.
# The setting stays active for the session until it is explicitly unset.
spark.conf.set("spark.wap.branch", audit_branch)

In [44]:
# Load the Oct 2023 Yellow Taxi trips that will be staged in the audit branch,
# tagging every row with its partition values (year first, then month).
new_data = (
    spark.read.parquet("../nyc-taxi-trips/yellow/oct-2023/")
    .withColumn("year", lit(2023))
    .withColumn("month", lit(10))
)
# The row count is the cell's output.
new_data.count()

3522285

In [45]:
## Write (WAP step 1): append the new data to the table.
## The write names only the table; it is staged in the audit branch because
## `spark.wap.branch` was set on this session in an earlier cell.
new_data.writeTo(prod_table).append()

                                                                                

In [48]:
## Check whether the production table's data was affected by the write.
## This read does not pin a branch, so it is resolved by the session settings
## (note that `spark.wap.branch` is still set at this point).
main_df = spark.table(prod_table)  # same table as before, referenced via `prod_table`
main_df.groupBy("year","month").count().show()

+----+-----+-------+
|year|month|  count|
+----+-----+-------+
|2023|   10|3522285|
|2023|    9|2846722|
+----+-----+-------+



Wait — was our main table data impacted by the write?

Not really. Because we set `spark.wap.branch` to `audit_branch`, any read of the table in this session that does not name a branch also reads from that branch.

It's equivalent to reading from `audit_branch` i.e. `spark.read.option("BRANCH", "audit_102023")` or,

in sql terms `SELECT * FROM local.nyc_tlc.yellow_taxi_trips VERSION AS OF 'audit_102023';`

In [51]:
## Pin the read to the production `main` branch and count rows per partition
main_branch_df = spark.read.option("BRANCH","main").table(prod_table)
main_branch_df.groupBy("year","month").count().show()

+----+-----+-------+
|year|month|  count|
+----+-----+-------+
|2023|    9|2846722|
+----+-----+-------+



Data in our prod table `main` branch is intact.

## Audit

## Auditing data present in Audit Branch.

Let's assume as per our application data quality standards:
- It shouldn't have any data with `total_amount` as negative.
- The completeness of VendorID should be 1.0 i.e. there shouldn't be any `null` in `VendorID` field.
- `payment_type` should contain only discrete numeric values from 1 to 6

These are simply data quality (DQ) rules that you run against the `audit branch`; based on the results you decide what to do with the data, e.g.:

- Discard the entire snapshot as the DQ doesn't meet the expectation.
- DELETE the rows not meeting DQ standards and preserve such records somewhere else.
- Fix the data, e.g. by populating `null` and missing values according to some rule.

In [52]:
from pyspark.sql.functions import col

# The DQ checks are run with plain Spark here; any other tool could take this place,
# e.g. PyDeequ verifications or Glue Data Quality job runs.

# Read the staged data, explicitly pinned to the audit branch.
audit_data = (
    spark.read
    .option("BRANCH", audit_branch)
    .table(prod_table)
)

# DQ rule: no row may have a negative total_amount. Collect the offending rows.
is_negative_amount = col("total_amount") < 0
neg_amt_df = audit_data.where(is_negative_amount)

In [57]:
# Remove the rows that violate the negative-total_amount rule, if any were found.
# The DELETE names only the table; the branch counts shown further down indicate it is
# applied to the audit branch (the session's WAP branch) while `main` stays untouched.
has_negative_amounts = not neg_amt_df.isEmpty()
if has_negative_amounts:
    # Persist the rejected records in some table or location before deleting them, e.g.:
    # neg_amt_df.write.partitionBy("year","month").parquet("bad-data-location")
    spark.sql(f"DELETE FROM {prod_table} where total_amount < 0")

                                                                                

In [60]:
# Show the table history after the audit.
# Uses `prod_table` rather than repeating the table name, like the later history queries.
spark.sql(f"select * from {prod_table}.history").show(truncate=False)

+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2023-12-25 17:37:57.422|6410054250659262609|NULL     |true               |
+-----------------------+-------------------+---------+-------------------+



In [61]:
# Row counts per partition on the `main` branch (expected to be unchanged by the audit)
main_reader = spark.read.option("BRANCH","main")
main_reader.table(prod_table).groupBy("year","month").count().show()

+----+-----+-------+
|year|month|  count|
+----+-----+-------+
|2023|    9|2846722|
+----+-----+-------+



In [64]:
# Row counts per partition on the audit branch; these differ from the earlier counts
# because the audit step deleted the records that failed the DQ rule.
audit_reader = spark.read.option("BRANCH", audit_branch)
audit_reader.table(prod_table).groupBy("year","month").count().show()

+----+-----+-------+
|year|month|  count|
+----+-----+-------+
|2023|    9|2817469|
|2023|   10|3485320|
+----+-----+-------+



## Publish

Once the auditing is done and the data quality is as expected (or has been fixed), we can publish the final records to the production `main` branch.

### Before Fast forwarding

In [62]:
# Table history before fast-forwarding
history_query = f"select * from {prod_table}.history"
spark.sql(history_query).show(truncate=False)

+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2023-12-25 17:37:57.422|6410054250659262609|NULL     |true               |
+-----------------------+-------------------+---------+-------------------+



In [63]:
# Branch refs before fast-forwarding: shows which snapshot each branch points at
refs_query = f"select * from {prod_table}.refs"
spark.sql(refs_query).show(truncate=False)

+------------+------+-------------------+-----------------------+---------------------+----------------------+
|name        |type  |snapshot_id        |max_reference_age_in_ms|min_snapshots_to_keep|max_snapshot_age_in_ms|
+------------+------+-------------------+-----------------------+---------------------+----------------------+
|main        |BRANCH|6410054250659262609|NULL                   |NULL                 |NULL                  |
|audit_102023|BRANCH|3963454346673756003|NULL                   |NULL                 |NULL                  |
+------------+------+-------------------+-----------------------+---------------------+----------------------+



### Fast-forwarding the main branch to the audit branch

In [68]:
# Publish: call the catalog's `fast_forward` procedure with the table, the branch to
# update (`main`) and the branch to move it to (the audit branch), then show the result.
fast_forward_stmt = f"CALL local.system.fast_forward('{prod_table}', 'main', '{audit_branch}')"
spark.sql(fast_forward_stmt).show()

+--------------+-------------------+-------------------+
|branch_updated|       previous_ref|        updated_ref|
+--------------+-------------------+-------------------+
|          main|6410054250659262609|3963454346673756003|
+--------------+-------------------+-------------------+



23/12/25 20:23:11 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


In [69]:
# Table history after fast-forwarding
history_df = spark.sql(f"select * from {prod_table}.history")
history_df.show(truncate=False)

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+-----------------------+-------------------+-------------------+-------------------+
|2023-12-25 17:37:57.422|6410054250659262609|NULL               |true               |
|2023-12-25 20:23:10.974|3963454346673756003|9062254653834310700|true               |
+-----------------------+-------------------+-------------------+-------------------+



In [75]:
# Branch refs after fast-forwarding: compare the snapshot ids of `main` and the audit branch
refs_df = spark.sql(f"select * from {prod_table}.refs")
refs_df.show(truncate=False)

+------------+------+-------------------+-----------------------+---------------------+----------------------+
|name        |type  |snapshot_id        |max_reference_age_in_ms|min_snapshots_to_keep|max_snapshot_age_in_ms|
+------------+------+-------------------+-----------------------+---------------------+----------------------+
|main        |BRANCH|3963454346673756003|NULL                   |NULL                 |NULL                  |
|audit_102023|BRANCH|3963454346673756003|NULL                   |NULL                 |NULL                  |
+------------+------+-------------------+-----------------------+---------------------+----------------------+



In [72]:
# Clean-up: remove the `spark.wap.branch` setting from the Spark session so that it no
# longer redirects this session's table operations to the audit branch.
spark.conf.unset('spark.wap.branch')