# Set up local SparkSession

In [1]:
from datetime import date

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Settings for the local session, applied in the order listed.
spark_settings = {
    # Iceberg runtime jar (Spark 3.2 / Scala 2.12 build) and its SQL extensions.
    "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.14.0",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    # Built-in `spark_catalog` is wrapped by Iceberg's session catalog (type "hive").
    "spark.sql.catalog.spark_catalog": "org.apache.iceberg.spark.SparkSessionCatalog",
    "spark.sql.catalog.spark_catalog.type": "hive",
    # Catalog named `local`: type "hadoop", warehouse at the relative path
    # "spark-warehouse". Every `local.default.flights` reference below uses it.
    "spark.sql.catalog.local": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.local.type": "hadoop",
    "spark.sql.catalog.local.warehouse": "spark-warehouse",
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
    # Eager evaluation: a DataFrame left as the last expression of a cell is
    # rendered as a table (at most 10 rows, cell text truncated at 500 chars).
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.repl.eagerEval.maxNumRows": 10,
    "spark.sql.repl.eagerEval.truncate": 500,
}

session_builder = SparkSession.builder.master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

22/08/12 16:56:31 WARN Utils: Your hostname, HF-LPT-1115A.local resolves to a loopback address: 127.0.0.1; using 192.168.178.21 instead (on interface en0)
22/08/12 16:56:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/guido.kosloffgancedo/Git/pp/jupyter-demo/.venv/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/guido.kosloffgancedo/.ivy2/cache
The jars for the packages stored in: /Users/guido.kosloffgancedo/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.2_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-531e0b36-6c8b-4db4-8170-44eba398aedc;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.14.0 in central
:: resolution report :: resolve 110ms :: artifacts dl 2ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.14.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	--------------------------------------------------------------------

# Read flights data from CSV

In [3]:
# Load flights.csv: the first line holds the column names and the column
# types are inferred from the data.
df = spark.read.csv("flights.csv", header=True, inferSchema=True)

# Leave the DataFrame as the last expression so the notebook renders it.
df

year,month,day,departure_time,arrival_time,carrier,flight_number,aircraft_registration,flight_time,airport_origin,airport_destination,distance,is_cancelled,is_diverted
2008,1,3,1343,1451,WN,588,N240WN,68,HOU,LIT,393,0,0
2008,1,3,1125,1247,WN,1343,N523SW,82,HOU,MAF,441,0,0
2008,1,3,2009,2136,WN,3841,N280WN,87,HOU,MAF,441,0,0
2008,1,3,903,1203,WN,3,N308SA,120,HOU,MCO,848,0,0
2008,1,3,1423,1726,WN,25,N462WN,123,HOU,MCO,848,0,0
2008,1,3,2024,2325,WN,51,N483WN,121,HOU,MCO,848,0,0
2008,1,3,1753,2053,WN,940,N493WN,120,HOU,MCO,848,0,0
2008,1,3,622,935,WN,2621,N266WN,133,HOU,MCO,848,0,0
2008,1,3,1944,2210,WN,389,N266WN,146,HOU,MDW,937,0,0
2008,1,3,1453,1716,WN,519,N514SW,143,HOU,MDW,937,0,0


# Perform some simple aggregations and transformations

In [4]:
# Aggregate the raw flights per day and route (origin -> destination):
#   1. build a `date` column from the separate year / month / day columns,
#   2. keep only flights that were neither cancelled nor diverted,
#   3. count the flights and average the flight time per group.
#
# The aggregates are aliased directly instead of renaming Spark's generated
# column names ("count(1)", "avg(flight_time)") afterwards:
# `withColumnRenamed` silently does nothing when the generated name differs,
# which would leave the table with unexpected column names.
#
# NOTE: this cell rebinds `df` from the raw CSV frame to the aggregated frame,
# so re-run the CSV read cell before re-running this one.
df = (
    df
    .withColumn("date", F.to_date(F.concat("year", F.lit("-"), "month", F.lit("-"), "day")))
    .where("is_cancelled = 0 and is_diverted = 0")
    .groupBy("date", "airport_origin", "airport_destination")
    .agg(
        F.count("*").alias("count_flights"),
        # The average is truncated to an integer, as before.
        F.avg("flight_time").cast("int").alias("avg_flight_time"),
    )
)

# Show the busiest routes first.
df.orderBy("count_flights", ascending=False)

date,airport_origin,airport_destination,count_flights,avg_flight_time
2008-01-03,LAX,OAK,20,80
2008-01-03,LAS,PHX,18,64
2008-01-03,LAX,LAS,14,63
2008-01-03,LAS,LAX,14,67
2008-01-03,LAS,SAN,14,63
2008-01-03,LAS,BUR,13,62
2008-01-03,LAX,PHX,13,66
2008-01-03,LAX,SJC,13,64
2008-01-03,LAS,RNO,13,74
2008-01-03,LAS,OAK,13,95


# Write to partitioned Iceberg table

In [6]:
# Create (or replace, if it already exists) the table `local.default.flights`
# from the aggregated DataFrame, partitioned by the `date` column.
df.writeTo("local.default.flights").partitionedBy(F.col("date")).createOrReplace()

# Perform updates, deletes and merges on the table

### Update and delete

In [7]:
# Row-level UPDATE: blank out the average flight time wherever it equals 80.
spark.sql("""
    UPDATE local.default.flights
    SET avg_flight_time = NULL
    WHERE avg_flight_time = 80
""")

# Row-level DELETE: drop every route that starts or ends at LAS.
spark.sql("""
    DELETE FROM local.default.flights
    WHERE airport_origin = 'LAS' or airport_destination = 'LAS'
""")

# Display the table after both changes, busiest routes first
# (last expression of the cell, so the notebook renders it).
spark.sql("""
    SELECT *
    FROM local.default.flights
    ORDER BY count_flights DESC
""")

date,airport_origin,airport_destination,count_flights,avg_flight_time
2008-01-03,LAX,OAK,20,
2008-01-03,LAX,PHX,13,66.0
2008-01-03,LAX,SJC,13,64.0
2008-01-03,MCI,MDW,12,81.0
2008-01-03,MDW,MCI,12,79.0
2008-01-03,HOU,MSY,11,58.0
2008-01-03,MHT,BWI,11,79.0
2008-01-03,LAX,SMF,10,71.0
2008-01-03,MDW,STL,10,59.0
2008-01-03,HRL,HOU,9,54.0


### Merge

In [8]:
# Incoming rows to merge: one matches an existing route (LAX -> OAK on
# 2008-01-03) and one is a new route (JFK -> EZE on 2008-01-04).
#
# The schema is given explicitly so the column order and types match the
# target table (`count_flights` comes from a count, `avg_flight_time` was cast
# to int). Without it the schema is inferred from the dicts and plain Python
# ints become bigint, which leaves the MERGE relying on an implicit cast.
incoming_schema = (
    "date DATE, airport_origin STRING, airport_destination STRING, "
    "count_flights BIGINT, avg_flight_time INT"
)
incoming_df = spark.createDataFrame(
    [
        {"date": date(2008, 1, 3), "airport_origin": "LAX", "airport_destination": "OAK", "count_flights": 21, "avg_flight_time": 83},
        {"date": date(2008, 1, 4), "airport_origin": "JFK", "airport_destination": "EZE", "count_flights": 22, "avg_flight_time": 660},
    ],
    schema=incoming_schema,
)
# Expose the DataFrame to SQL under the name used in the MERGE statement.
incoming_df.createOrReplaceTempView("flights_incoming_data")

# Upsert keyed on (date, origin, destination): matching rows get their
# aggregates overwritten, all other incoming rows are inserted.
spark.sql("""
    MERGE INTO local.default.flights t
    USING flights_incoming_data s
    ON t.date = s.date
        AND t.airport_origin = s.airport_origin
        AND t.airport_destination = s.airport_destination
    WHEN MATCHED THEN UPDATE SET t.count_flights = s.count_flights, t.avg_flight_time = s.avg_flight_time
    WHEN NOT MATCHED THEN INSERT *
""")

# Display the merged table, busiest routes first.
spark.sql("""
    SELECT *
    FROM local.default.flights
    ORDER BY count_flights DESC
""")

                                                                                

date,airport_origin,airport_destination,count_flights,avg_flight_time
2008-01-04,JFK,EZE,22,660
2008-01-03,LAX,OAK,21,83
2008-01-03,LAX,SJC,13,64
2008-01-03,LAX,PHX,13,66
2008-01-03,MCI,MDW,12,81
2008-01-03,MDW,MCI,12,79
2008-01-03,HOU,MSY,11,58
2008-01-03,MHT,BWI,11,79
2008-01-03,LAX,SMF,10,71
2008-01-03,MDW,STL,10,59


# Time travel

### Query table history and older snapshots

In [9]:
# Query the table's `history` metadata table: one row per snapshot that was
# made current, with its timestamp, parent snapshot and ancestor flag.
spark.sql("""
    SELECT *
    FROM local.default.flights.history
""")

made_current_at,snapshot_id,parent_id,is_current_ancestor
2022-08-12 16:56:54.701,118092335985285660,,True
2022-08-12 16:57:00.533,8731886491728813616,1.1809233598528566e+17,True
2022-08-12 16:57:01.214,1177357709104922928,8.731886491728813e+18,True
2022-08-12 16:57:05.131,3866957348097064347,1.177357709104923e+18,True


In [10]:
# Query the table's `snapshots` metadata table: one row per snapshot, with
# the operation that produced it, its manifest list and a summary map.
spark.sql("""
    SELECT *
    FROM local.default.flights.snapshots
""")

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
2022-08-12 16:56:53.212,9081973367794722448,,append,spark-warehouse/default/flights/metadata/snap-9081973367794722448-1-d65fa079-2bfe-42af-bc7c-53f552a57102.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 1, added-records -> 243, added-files-size -> 3043, changed-partition-count -> 1, total-records -> 243, total-files-size -> 3043, total-data-files -> 1, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"
2022-08-12 16:56:54.701,118092335985285660,,append,spark-warehouse/default/flights/metadata/snap-118092335985285660-1-9fd1e238-15f4-40d2-81c1-127d5111412d.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 1, added-records -> 243, added-files-size -> 3043, changed-partition-count -> 1, total-records -> 243, total-files-size -> 3043, total-data-files -> 1, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"
2022-08-12 16:57:00.533,8731886491728813616,1.1809233598528566e+17,overwrite,spark-warehouse/default/flights/metadata/snap-8731886491728813616-1-fef3964a-68d7-4a12-ac84-cdaf0e1ecd6f.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 1, deleted-data-files -> 1, added-records -> 243, deleted-records -> 243, added-files-size -> 3054, removed-files-size -> 3043, changed-partition-count -> 1, total-records -> 243, total-files-size -> 3054, total-data-files -> 1, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"
2022-08-12 16:57:01.214,1177357709104922928,8.731886491728813e+18,overwrite,spark-warehouse/default/flights/metadata/snap-1177357709104922928-1-c2872be2-5038-4b1c-9df6-f1fa328ef52a.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 1, deleted-data-files -> 1, added-records -> 178, deleted-records -> 243, added-files-size -> 2751, removed-files-size -> 3054, changed-partition-count -> 1, total-records -> 178, total-files-size -> 2751, total-data-files -> 1, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"
2022-08-12 16:57:05.131,3866957348097064347,1.177357709104923e+18,overwrite,spark-warehouse/default/flights/metadata/snap-3866957348097064347-1-db8815a0-882c-48e3-987c-67e3d8068dc5.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 2, deleted-data-files -> 1, added-records -> 179, deleted-records -> 178, added-files-size -> 4280, removed-files-size -> 2751, changed-partition-count -> 2, total-records -> 179, total-files-size -> 4280, total-data-files -> 2, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"


In [12]:
# Snapshot ids are generated at commit time and differ on every run, so a
# hard-coded id only works in the session that produced it. Look up the
# oldest entry of the table history (the initial write) instead.
first_snapshot_id = spark.sql("""
    SELECT snapshot_id
    FROM local.default.flights.history
    ORDER BY made_current_at
    LIMIT 1
""").first()["snapshot_id"]

# Time travel: read the table as of that snapshot, busiest routes first.
(
    spark
    .read
    .option("snapshot-id", str(first_snapshot_id))
    .table("local.default.flights")
    .orderBy("count_flights", ascending=False)
)

date,airport_origin,airport_destination,count_flights,avg_flight_time
2008-01-03,LAX,OAK,20,80
2008-01-03,LAS,PHX,18,64
2008-01-03,LAX,LAS,14,63
2008-01-03,LAS,LAX,14,67
2008-01-03,LAS,SAN,14,63
2008-01-03,LAS,BUR,13,62
2008-01-03,LAX,PHX,13,66
2008-01-03,LAX,SJC,13,64
2008-01-03,LAS,RNO,13,74
2008-01-03,LAS,OAK,13,95


### Rollback to older snapshot

In [13]:
# Snapshot ids are generated at commit time and differ on every run, so the
# rollback target is resolved here rather than hard-coded: the oldest entry
# of the table history, i.e. the state right after the initial write.
rollback_snapshot_id = spark.sql("""
    SELECT snapshot_id
    FROM local.default.flights.history
    ORDER BY made_current_at
    LIMIT 1
""").first()["snapshot_id"]

# Make that snapshot the current table state; the procedure returns the
# previous and the new current snapshot ids.
spark.sql(f"""
    CALL local.system.rollback_to_snapshot(table => 'local.default.flights', snapshot_id => {rollback_snapshot_id})
""")

previous_snapshot_id,current_snapshot_id
3866957348097064347,118092335985285660


# Remove old snapshots

In [14]:
# Use "now" as the cutoff so every snapshot committed so far is eligible.
# A fixed date such as 2023-01-01 only works while the wall clock is before
# it; on any later run the snapshots are newer than the cutoff and nothing
# is expired.
expire_before = spark.sql(
    "SELECT CAST(current_timestamp() AS STRING) AS ts"
).first()["ts"]

# Expire snapshots older than the cutoff while retaining the last one; the
# procedure returns the number of data, delete and manifest files removed.
spark.sql(f"""
    CALL local.system.expire_snapshots(table => 'local.default.flights', older_than => TIMESTAMP '{expire_before}', retain_last => 1)
""")

deleted_data_files_count,deleted_position_delete_files_count,deleted_equality_delete_files_count,deleted_manifest_files_count,deleted_manifest_lists_count
5,0,0,7,4


In [15]:
# List the snapshots again to confirm which ones remain after
# expire_snapshots ran.
spark.sql("""
    SELECT *
    FROM local.default.flights.snapshots
""")

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
2022-08-12 16:56:54.701,118092335985285660,,append,spark-warehouse/default/flights/metadata/snap-118092335985285660-1-9fd1e238-15f4-40d2-81c1-127d5111412d.avro,"{spark.app.id -> local-1660316193504, added-data-files -> 1, added-records -> 243, added-files-size -> 3043, changed-partition-count -> 1, total-records -> 243, total-files-size -> 3043, total-data-files -> 1, total-delete-files -> 0, total-position-deletes -> 0, total-equality-deletes -> 0}"


# Compact small files

In [16]:
# Run the rewrite_data_files procedure on the table with its default options;
# the result reports how many data files were rewritten and how many were added.
spark.sql("""
    CALL local.system.rewrite_data_files(table => 'local.default.flights')
""")

rewritten_data_files_count,added_data_files_count
0,0
