# Libraries


In [2]:
from pathlib import Path

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession, window

# Spark Session


In [3]:
# Settings that enable Delta Lake: the SQL extension plus the Delta-aware
# implementation of the default session catalog.
delta_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder
for conf_key, conf_value in delta_conf.items():
    builder = builder.config(conf_key, conf_value)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (or an already running session is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Small two-column sample frame (col1, col2) built from literal SELECTs
# combined with UNION.
sample_query = """
select 5 as col1, 6 as col2 union select 7 as col1, 8 as col2;
"""
mydf = spark.sql(sample_query)

In [5]:
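# Writes `mydf` to the Delta table, replacing its contents. Left commented out,
# presumably so that re-running the notebook does not commit another table
# version; uncomment to (re)create the data.
# mydf.write.format('delta').mode('overwrite').save("./output/testdeltatable")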

In [6]:
# Path-based Delta identifier (delta.`<absolute path>`) used by the SQL cells.
# It is resolved from the same relative location the DataFrame reader cells use
# ("./output/testdeltatable") instead of a machine-specific drive path, so the
# notebook works from any checkout as long as it runs from the project root.
dtname = f"delta.`{Path('./output/testdeltatable').resolve()}`"

# List the table's history (version, timestamp, operation, ...) without
# truncating the wide columns.
history_df = spark.sql(f"describe history {dtname};")
history_df.show(truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                           |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+---------------------------------------------------------------------------------------------------

# Fetch a specific version of the Delta table


In [7]:
# Time travel through the DataFrame reader: the "versionAsOf" option selects
# which table version is loaded (here version 0).
version0_df = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("./output/testdeltatable/")
)
version0_df.show()

+----+----+
|col1|col2|
+----+----+
|   1|   2|
|   3|   4|
+----+----+



In [8]:
# Time travel in SQL: VERSION AS OF selects the table version to query.
version1_df = spark.sql(f"select * from {dtname} VERSION AS OF 1;")
version1_df.show(truncate=False)

+----+----+
|col1|col2|
+----+----+
|5   |6   |
|7   |8   |
+----+----+



# Roll back the Delta table


In [9]:
# Roll the table back to its version-0 contents. The returned single-row frame
# holds the restore metrics (table size, files removed/restored).
restore_metrics = spark.sql(f"RESTORE TABLE {dtname} VERSION AS OF 0;")
restore_metrics.show(truncate=False)


# Python API alternative, kept for reference (placeholder path and version):
# from delta.tables import DeltaTable
# deltaTable = DeltaTable.forPath(spark, "/path/to/delta/table/")

# deltaTable.restoreToVersion(5)

+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|table_size_after_restore|num_of_files_after_restore|num_removed_files|num_restored_files|removed_files_size|restored_files_size|
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|692                     |1                         |0                |0                 |0                 |0                  |
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+



In [10]:
# Show the history again, now that the RESTORE above has run.
history_after_restore_df = spark.sql(f"describe history {dtname};")
history_after_restore_df.show(truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                           |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+---------------------------------------------------------------------------------------------------

In [11]:
# Query version 0 through SQL to inspect the rows the table was restored to.
version0_sql_df = spark.sql(f"select * from {dtname} VERSION AS OF 0;")
version0_sql_df.show(truncate=False)

+----+----+
|col1|col2|
+----+----+
|1   |2   |
|3   |4   |
+----+----+



# Merge into a Delta table


In [16]:
# Load version 1 of the users table and of the updated-users table.
# Each load gets its own reader so no reader state is shared between them.
users_reader = spark.read.format("delta").option("versionAsOf", 1)
users_delta = users_reader.load("./output/users_df")

users_updated_reader = spark.read.format("delta").option("versionAsOf", 1)
users_updated_delta = users_updated_reader.load("./output/users_updated_df")

In [17]:
# Print both snapshots: the users frame first, then the updated-users frame.
for snapshot_df in (users_delta, users_updated_delta):
    snapshot_df.show()

+---+--------+--------------+-----+
| id|username|         email|phone|
+---+--------+--------------+-----+
|  1|    john|john@gmail.com|12345|
|  2|    jane|jane@gmail.com|45645|
|  3|    rick|rick@gmail.com|78678|
+---+--------+--------------+-----+

+---+--------+----------------+-----+
| id|username|           email|phone|
+---+--------+----------------+-----+
|  1|    john|  john@gmail.com|    0|
|  2|    jane|jane02@gmail.com|12345|
|  3|    rick|  rick@gmail.com|78678|
|  4|    mike|  mike@gmail.com| 9787|
+---+--------+----------------+-----+



In [None]:
# Path-based Delta identifiers (delta.`<absolute path>`) for the SQL cells.
# They are resolved from the same relative locations the DataFrame reader cells
# use ("./output/users_df", "./output/users_updated_df") instead of a
# machine-specific drive path, so the notebook works from any checkout as long
# as it runs from the project root.
users_table = f"delta.`{Path('./output/users_df').resolve()}`"
users_updated_table = f"delta.`{Path('./output/users_updated_df').resolve()}`"

In [24]:
# Append one extra row (id 5) to the updated-users table so that the merge has
# a source row without a match in the users table.
# NOTE: a plain INSERT is not idempotent; every execution of this cell appends
# another id=5 row to the table.
insert_stmt = f"""

insert into {users_updated_table} (id, username, email, phone)
    VALUES (
        5,
        'some new name',
        'somenewemail@fasdas.com',
        55667
    );

"""
spark.sql(insert_stmt)

DataFrame[]

In [25]:
# Upsert the updated-users table into the users table, matching rows on id:
#   * matched ids   -> all four columns are overwritten with the source values
#   * unmatched ids -> the source row is inserted
# The returned frame reports the affected/updated/deleted/inserted row counts.
# NOTE(review): this relies on id being unique in the source table; per the
# Delta merge documentation, a merge can fail when several source rows match
# the same target row. Confirm the insert cell has not been run more than once.
merge_stmt = f"""

  MERGE INTO {users_table} as users
  USING {users_updated_table} as usersupd
  ON users.id = usersupd.id
  WHEN MATCHED THEN
    UPDATE SET
      id = usersupd.id,
      username = usersupd.username,
      email = usersupd.email,
      phone = usersupd.phone
  WHEN NOT MATCHED
    THEN INSERT (
      id,
      username,
      email,
      phone
    )
    VALUES (
        usersupd.id,
        usersupd.username,
        usersupd.email,
        usersupd.phone
    );

    """
merge_metrics = spark.sql(merge_stmt)
merge_metrics.show(truncate=False)

+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|5                |4               |0               |1                |
+-----------------+----------------+----------------+-----------------+



In [26]:
# Show the current contents of the users table after the merge.
merged_users_df = spark.sql(f"select * from {users_table};")
merged_users_df.show(truncate=False)

+---+-------------+-----------------------+-----+
|id |username     |email                  |phone|
+---+-------------+-----------------------+-----+
|1  |john         |john@gmail.com         |0    |
|2  |jane         |jane02@gmail.com       |12345|
|3  |rick         |rick@gmail.com         |78678|
|4  |mike         |mike@gmail.com         |9787 |
|5  |some new name|somenewemail@fasdas.com|55667|
+---+-------------+-----------------------+-----+



In [27]:
# List the history of the users table without truncating the wide columns.
users_history_df = spark.sql(f"describe history {users_table};")
users_history_df.show(truncate=False)

+-------+-----------------------+------+--------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                                                   