In [41]:
import pyspark
from delta import *

In [43]:
# Session builder named "MyApp3" with the Delta Lake SQL extension and the
# Delta catalog registered as the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp3")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# configure_spark_with_delta_pip is presumably supplied by the `delta`
# star-import at the top of the notebook; it wraps the builder before the
# session is created (or an existing one is reused).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

ConnectionRefusedError: [Errno 111] Connection refused

In [19]:
# Location of the Delta table used in the "Create Table" section.
data_path = (
    "/home/donghee/work/spark-3.5.0-bin-hadoop3"
    "/data/delta-table-create"
)

# Create Table

In [24]:
# DDL for a Delta table partitioned by `date`, with symlink-manifest
# generation enabled. Adjacent string literals are concatenated by Python,
# so the statement is one line of SQL text with no embedded newlines.
create_sql = (
    "CREATE TABLE IF NOT EXISTS delta.`/home/donghee/work/spark-3.5.0-bin-hadoop3/data/delta-table-create` ( "
    "\t`key` STRING, "
    "\t`value` STRING, "
    "\t`topic` STRING, "
    "\t`timestamp` TIMESTAMP, "
    "    `date` STRING "
    ") "
    "USING DELTA "
    "PARTITIONED BY (date) "
    "LOCATION '/home/donghee/work/spark-3.5.0-bin-hadoop3/data/delta-table-create' "
    "TBLPROPERTIES ( "
    "    'delta.compatibility.symlinkFormatManifest.enabled'='true' "
    ")"
)

In [25]:
# Run the CREATE TABLE statement held in create_sql and keep whatever
# spark.sql returns in spark_created_sql.
spark_created_sql = spark.sql(create_sql)

                                                                                

In [26]:
# Load the Delta table stored at data_path and print its rows.
delta_reader = spark.read.format("delta")
df = delta_reader.load(data_path)
df.show()

+---+-----+-----+---------+----+
|key|value|topic|timestamp|date|
+---+-----+-----+---------+----+
+---+-----+-----+---------+----+



# Basic

In [27]:
# Location of the Delta table used throughout the "Basic" section.
data_path = (
    "/home/donghee/work/spark-3.5.0-bin-hadoop3"
    "/data/delta-table-quickstart"
)

# Single-column (`id`) DataFrame covering the range [0, 5).
data = spark.range(0, 5)

#### Write Data

In [31]:
# Build the five-row `id` DataFrame (0 through 4); the trailing expression
# returns its rows so the cell displays them.
data = spark.range(start=0, end=5)
data.collect()

[Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]

In [30]:
# Persist `data` as a Delta table at data_path.
# No save mode was set before, so the writer used its default, which raises
# once the target path already exists and makes the cell fail on any re-run
# (e.g. Restart & Run All). mode("overwrite") replaces the existing contents,
# so the cell can be executed repeatedly.
data.write.format("delta").mode("overwrite").save(data_path)

                                                                                

#### Read Data

In [32]:
# Read the table written at data_path back in and print its rows.
quickstart_reader = spark.read.format("delta")
df = quickstart_reader.load(data_path)
df.show()

+---+
| id|
+---+
|  3|
|  0|
|  4|
|  1|
|  2|
+---+



#### Update Data

In [34]:
# Replace the table contents with ids 30 through 39.
data = spark.range(30, 40)
overwrite_writer = data.write.format("delta").mode("overwrite")
overwrite_writer.save(data_path)

# Read the table back to confirm that the overwrite took effect.
read_df = spark.read.format("delta").load(data_path)
read_df.show()

                                                                                

+---+
| id|
+---+
| 39|
| 31|
| 36|
| 35|
| 38|
| 33|
| 32|
| 34|
| 37|
| 30|
+---+



#### Time Travel
- 데이터를 변경하면 자동으로 version이 생성
- 데이터를 예전 버전으로 되돌리기 가능
- 데이터의 보존 기간은 직접 수동으로 설정 가능
- vacuum 명령어를 사용하여 명시적으로 오래된 버전의 데이터 영구 삭제 가능

In [35]:
# Location of the Delta table used in the "Time Travel" section.
data_path = (
    "/home/donghee/work/spark-3.5.0-bin-hadoop3"
    "/data/delta-table-timetravel"
)

# Initial write of ids 0 through 4.
data = spark.range(0, 5)
initial_writer = data.write.format("delta")
initial_writer.save(data_path)

                                                                                

In [38]:
# History select
# Query the commit history of the Delta table at data_path. The path is taken
# from data_path instead of a second hard-coded copy, so this cell always
# inspects the same table that the surrounding cells read and write.
history = spark.sql("DESCRIBE HISTORY delta.`{}`".format(data_path))

# collect() gives back a one-row list of Row objects holding max(version);
# latest_version keeps that list.
latest_version = history.selectExpr("max(version)").collect()

# Print the version number itself (first column of the first row) rather
# than the raw `[Row(max(version)=...)]` representation.
print("latest_version >>>>>>>>>>>>>>>: {}".format(latest_version[0][0]))

latest_version >>>>>>>>>>>>>>>: [Row(max(version)=0)]


###### Current Data

In [39]:
# Latest snapshot of the table at data_path.
print("##### current #####")
current_reader = spark.read.format("delta")
df = current_reader.load(data_path)
df.show()

# Same table pinned to version 0 through the versionAsOf read option.
# A separate reader is built so the option applies only to this read.
print("##### version 0 #####")
version0_reader = spark.read.format("delta").option("versionAsOf", 0)
df0 = version0_reader.load(data_path)
df0.show()

##### current #####
+---+
| id|
+---+
|  2|
|  0|
|  4|
|  1|
|  3|
+---+

##### version 0 #####
+---+
| id|
+---+
|  2|
|  0|
|  4|
|  1|
|  3|
+---+



###### Update Data

In [40]:
# Overwrite the time-travel table with ids 30 through 39.
data = spark.range(30, 40)
overwrite_writer = data.write.format("delta").mode("overwrite")
overwrite_writer.save(data_path)

# Read the table pinned to version 0.
print("########### version 0 ###########")
version0_reader = spark.read.format("delta").option("versionAsOf", 0)
updatedf = version0_reader.load(data_path)
updatedf.show()

# Read the table pinned to version 1, using a separate reader so each read
# carries only its own versionAsOf option.
print("########### current  version 1 ###########")
version1_reader = spark.read.format("delta").option("versionAsOf", 1)
updatedf1 = version1_reader.load(data_path)
updatedf1.show()

ConnectionRefusedError: [Errno 111] Connection refused