In [1]:
from datetime import date

from config import (
    MINIO_ACCESS_KEY,
    MINIO_SECRET_KEY,
    MINIO_SERVER_HOST,
)
from data_config import TABLE_MAPPINGS
from delta.tables import *
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import avg, col, conv, expr, from_json, hex, rank, substring
from pyspark.sql.functions import sum as _sum
from pyspark.sql.functions import when
from pyspark.sql.streaming import DataStreamReader, StreamingQuery
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.utils import AnalysisException
from util.convert_timestamp import convert_timestamp
from util.logger import logger

In [2]:
spark = (
    SparkSession.builder.config(
        "spark.jars.packages",
        "io.delta:delta-core_2.12:2.0.0,"
        "org.apache.spark:spark-avro_2.12:3.2.0,"
        "org.apache.hadoop:hadoop-aws:3.2.3,"
        "com.amazonaws:aws-java-sdk:1.11.375,"
        "org.apache.spark:spark-tags_2.12:3.2.0,"
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.sql.codegen.wholeStage", "false")
    .config(
        "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
    )
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_SERVER_HOST)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.databricks.delta.optimize.repartition.enabled", "true")
    .getOrCreate()
)

23/05/14 17:19:26 WARN Utils: Your hostname, ducdn-G3-3579 resolves to a loopback address: 127.0.1.1; using 192.168.2.108 instead (on interface wlo1)
23/05/14 17:19:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ducdn/Documents/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ducdn/.ivy2/cache
The jars for the packages stored in: /home/ducdn/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk added as a dependency
org.apache.spark#spark-tags_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-43932c39-d5aa-42ee-b11d-87bcf271ec7d;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.0.0 in central
	found io.delta#delta-storage;2.0.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.spark#spark-avro_2.12;3.2.0 in central
	found org.tukaani#xz;1.8 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-aws;3.2.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found com.amazonaws#aws-java-sdk;1.11.

In [3]:
df = spark.range(0, 5)
df.show(5)

                                                                                

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [4]:
df.write.format("delta").save("s3a://test/table")

23/05/02 14:45:11 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [9]:
deltaTable = DeltaTable.forPath(spark, "s3a://test/table")

fullHistoryDF = deltaTable.history()

fullHistoryDF.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      1|2023-05-02 14:48:33|  null|    null| OPTIMIZE|{predicate -> [],...|null|    null|     null|          0|SnapshotIsolation|        false|{numRemovedFiles ...|        null|Apache-Spark/3.2....|
|      0|2023-05-02 14:45:13|  null|    null|    WRITE|{mode -> ErrorIfE...|null|    null|     null|       null|     Serializable|         true|{numFiles -> 6, n...|        null|Apache-Spark/3.2....|


In [6]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3a://test/table")

delta_table.optimize().executeCompaction()

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint>]

In [8]:
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
delta_table.vacuum(0)



Deleted 6 files and directories in a total of 1 directories.


                                                                                

DataFrame[]

In [10]:
df = spark.read.format("delta").load("s3a://test/table")
df.show(truncate=False)

+---+
|id |
+---+
|2  |
|0  |
|4  |
|3  |
|1  |
+---+



In [12]:
df = spark.range(5, 10)
df.write.format("delta").mode("append").save("s3a://test/table")

                                                                                

In [13]:
df = spark.range(10, 15)
df.write.format("delta").mode("append").save("s3a://test/table")

                                                                                

In [30]:
deltaTable = DeltaTable.forPath(spark, "s3a://test/table")

fullHistoryDF = deltaTable.history()

fullHistoryDF.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      6|2023-05-02 15:30:48|  null|    null|    WRITE|{mode -> Append, ...|null|    null|     null|          5|     Serializable|         true|{numFiles -> 8, n...|        null|Apache-Spark/3.2....|
|      5|2023-05-02 15:30:05|  null|    null|    WRITE|{mode -> Append, ...|null|    null|     null|          4|     Serializable|         true|{numFiles -> 6, n...|        null|Apache-Spark/3.2....|


In [15]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3a://test/table")

delta_table.optimize().executeCompaction()

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint>]

In [16]:
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
delta_table.vacuum(0)

                                                                                

Deleted 13 files and directories in a total of 1 directories.


DataFrame[]

In [18]:
df = spark.read.format("delta").load("s3a://test/table")
df.show(truncate=False)

+---+
|id |
+---+
|2  |
|0  |
|4  |
|3  |
|1  |
|5  |
|7  |
|13 |
|9  |
|8  |
|14 |
|11 |
|6  |
|12 |
|10 |
+---+



In [5]:

df_res = (
    spark.read
    .format("delta")
    .option("versionAsOf", 5)
    .load("s3a://test/table")
)
df_res.show(truncate=False)

+---+-------------+----+-------+-----------+---------+----------+---------+----+-----+---+
|op |ts_ms        |id  |user_id|payment    |status_id|created_at|before_id|year|month|day|
+---+-------------+----+-------+-----------+---------+----------+---------+----+-----+---+
|r  |1683042218095|134 |3407   |instalment |3        |1681541690|134      |2023|4    |15 |
|r  |1683042218104|581 |7688   |cash       |1        |1681549670|581      |2023|4    |15 |
|r  |1683042218105|611 |9557   |credit_card|4        |1681549670|611      |2023|4    |15 |
|r  |1683042218105|672 |3749   |credit_card|3        |1681549670|672      |2023|4    |15 |
|r  |1683042218113|1145|3495   |instalment |3        |1681549670|1145     |2023|4    |15 |
|r  |1683042218115|1226|3738   |credit_card|1        |1681549670|1226     |2023|4    |15 |
|r  |1683042218116|1319|2782   |cash       |3        |1681549670|1319     |2023|4    |15 |
|r  |1683042218119|1522|7449   |instalment |1        |1681549670|1522     |2023|4    |15 |

In [29]:
df = spark.range(10, 20)
df.write.format("delta").mode("append").save("s3a://test/table")

                                                                                

In [36]:
data = [
    ("1", "partition 1"),
    ("2", "partition 2"),
    ("3", "partition 3")
]

schema = StructType([
    StructField("value", StringType(), True),
    StructField("partition", StringType(), True),
])

df = spark.createDataFrame(data, schema)
df.show()
df.write.mode("append").format("delta").partitionBy("partition").save("s3a://test/table1")

+-----+-----------+
|value|  partition|
+-----+-----------+
|    1|partition 1|
|    2|partition 2|
|    3|partition 3|
+-----+-----------+



                                                                                

In [39]:
data = [
    ("10", "partition 1"),
    ("11", "partition 2"),
    ("12", "partition 3")
]

schema = StructType([
    StructField("value", StringType(), True),
    StructField("partition", StringType(), True),
])

df = spark.createDataFrame(data, schema)
df.show()
df.write.mode("append").format("delta").partitionBy("partition").save("s3a://test/table1")

+-----+-----------+
|value|  partition|
+-----+-----------+
|   10|partition 1|
|   11|partition 2|
|   12|partition 3|
+-----+-----------+



                                                                                

In [3]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3a://datalake/sliver/cdc.myshop.orders")

delta_table.optimize().executeCompaction()

23/05/02 23:43:11 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
23/05/02 23:43:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
23/05/02 23:43:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
23/05/02 23:43:25 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint>]

In [4]:
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
delta_table.vacuum(0)

                                                                                

Deleted 543 files and directories in a total of 324 directories.


DataFrame[]

In [44]:
delta_table

<delta.tables.DeltaTable at 0x7ff9889f5e70>

In [47]:
df.show()

+-----+-----------+
|value|  partition|
+-----+-----------+
|   10|partition 1|
|   11|partition 2|
|   12|partition 3|
+-----+-----------+



In [49]:

df_res = (
    spark.read
    .format("delta")
    .load("s3a://test/table1")
)
df_res.show(truncate=False)

+-----+-----------+
|value|partition  |
+-----+-----------+
|12   |partition 3|
|9    |partition 3|
|6    |partition 3|
|3    |partition 3|
|11   |partition 2|
|8    |partition 2|
|2    |partition 2|
|5    |partition 2|
|10   |partition 1|
|4    |partition 1|
|7    |partition 1|
|1    |partition 1|
+-----+-----------+



In [50]:
df.printSchema()

root
 |-- value: string (nullable = true)
 |-- partition: string (nullable = true)



In [51]:
deltaTable.optimize().executeZOrderBy("value")

IllegalArgumentException: Z-Ordering column value does not exist in data schema.