In [1]:
import pyspark
from delta import *
from pyspark.sql.types import StructType

In [2]:
# Describe the Spark session without starting it yet: set the app name, enable
# Delta's SQL session extension, and register DeltaCatalog as `spark_catalog`.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through delta's pip helper, then create (or reuse) the session.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

22/06/03 14:07:43 WARN Utils: Your hostname, Matthews-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.2 instead (on interface en0)
22/06/03 14:07:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/powers/.sdkman/candidates/spark/3.2.0/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/powers/.ivy2/cache
The jars for the packages stored in: /Users/powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6fdd8347-a3b1-4cef-983d-1e94026cdebd;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 317ms :: artifacts dl 23ms
	:: modules in use:
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|

In [4]:
# Explicit schema for the student CSV files; every column is read as a string.
schema = StructType()
schema = schema.add("student_name", "string")
schema = schema.add("graduation_year", "string")
schema = schema.add("major", "string")

In [5]:
def with_normalized_names(df):
    """Replace ``student_name`` with separate ``first_name`` / ``last_name`` columns.

    ``student_name`` is split on the delimiter ``"XX"``; the first piece becomes
    ``first_name`` and the second piece becomes ``last_name``. The original
    ``student_name`` column is dropped from the result.

    Args:
        df: DataFrame that has a ``student_name`` column.

    Returns:
        A new DataFrame with the two name columns added and ``student_name`` removed.
    """
    name_parts = pyspark.sql.functions.split(df["student_name"], "XX")
    with_first = df.withColumn("first_name", name_parts.getItem(0))
    with_both = with_first.withColumn("last_name", name_parts.getItem(1))
    return with_both.drop("student_name")

## Trigger Once Incremental Updates

In [134]:
! mkdir data/tmp_students_incremental

In [135]:
# Seed the streaming input directory with the first CSV file.
! cp data/students/students1.csv data/tmp_students_incremental

In [136]:
# Streaming reader over the input directory: explicit schema, first line is a header.
df = spark.readStream.csv(
    "data/tmp_students_incremental",
    schema=schema,
    header=True,
)

In [137]:
# Locations used by the trigger-once demo. They are defined at module level
# (not inside the function) because the verification cells that follow read the
# table through `deltaPath`.
checkpointPath = "data/tmp_students_checkpoint/"
deltaPath = "data/tmp_students_delta"


def perform_trigger_once_update():
    """Run one trigger-once pass of the student stream into the Delta table.

    Applies ``with_normalized_names`` to the module-level streaming ``df`` and
    writes the result in Delta format to ``deltaPath``, using ``checkpointPath``
    as the checkpoint location. The call blocks until the run has terminated,
    so a batch read issued right afterwards does not race the write.

    Returns:
        The ``StreamingQuery`` for the run (already terminated on return).

    Raises:
        Whatever ``StreamingQuery.awaitTermination`` raises if the query fails.
    """
    query = (
        df.transform(with_normalized_names)
        .writeStream.trigger(once=True)
        .format("delta")
        .option("checkpointLocation", checkpointPath)
        .start(deltaPath)
    )
    # start() returns immediately; wait for the single run to finish.
    query.awaitTermination()
    return query

In [138]:
# First run: only students1.csv is in the input directory at this point.
perform_trigger_once_update()

22/06/03 12:30:45 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x167a6e1c0>

                                                                                

In [139]:
# Batch-read the Delta table and print its current contents.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
+---------------+-------+----------+---------+



In [140]:
# Add a second CSV file to the streaming input directory.
! cp data/students/students2.csv data/tmp_students_incremental

In [141]:
# Second run, after students2.csv was copied in.
perform_trigger_once_update()

22/06/03 12:31:02 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x167a68ac0>

                                                                                

In [142]:
# Print the Delta table again to see the effect of the second run.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2022|    bio|    sophia|     raul|
|           2025|physics|      fred|       li|
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
+---------------+-------+----------+---------+



In [143]:
# Add a third CSV file to the streaming input directory.
! cp data/students/students3.csv data/tmp_students_incremental

In [144]:
# Third run, after students3.csv was copied in.
perform_trigger_once_update()

22/06/03 12:31:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x167ab6eb0>

                                                                                

In [145]:
# Print the Delta table after the third run.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2025|    bio|     chris|     borg|
|           2026|physics|     david|    cross|
|           2022|    bio|    sophia|     raul|
|           2025|physics|      fred|       li|
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
+---------------+-------+----------+---------+



## Clean up directories

In [146]:
! rm -rf data/tmp_students_checkpoint
! rm -rf data/tmp_students_delta
! rm -rf data/tmp_students_incremental

22/06/03 13:48:20 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 13:48:22 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 13:48:24 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 13:48:26 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 13:48:28 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?


## ProcessingTime trigger with a two-second micro-batch interval

In [6]:
! mkdir data/tmp_students_incremental

In [7]:
# Seed the streaming input directory with the first CSV file.
! cp data/students/students1.csv data/tmp_students_incremental

In [8]:
# Streaming reader over the input directory: explicit schema, first line is a header.
df = spark.readStream.csv(
    "data/tmp_students_incremental",
    schema=schema,
    header=True,
)

In [9]:
# Checkpoint location for the streaming query, and the path of the Delta table it writes.
checkpointPath, deltaPath = (
    "data/tmp_students_checkpoint/",
    "data/tmp_students_delta",
)

In [10]:
# Start a continuously running query that writes normalized rows to the Delta
# table, with a micro-batch triggered every two seconds. The parenthesised
# expression is left as the cell's last value so the StreamingQuery is displayed.
(
    df.transform(with_normalized_names)
    .writeStream.trigger(processingTime="2 seconds")
    .format("delta")
    .option("checkpointLocation", checkpointPath)
    .start(deltaPath)
)

22/06/03 13:53:07 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x163995a60>

[Stage 2:>                                                         (0 + 8) / 50]

In [11]:
# Batch-read the Delta table and print what the running stream has written so far.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

22/06/03 13:53:25 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 16670 milliseconds
                                                                                

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
+---------------+-------+----------+---------+



In [124]:
# Add a second CSV file while the streaming query is running.
! cp data/students/students2.csv data/tmp_students_incremental

In [125]:
# Print the Delta table again; no explicit re-trigger was issued in between.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
|           2022|    bio|    sophia|     raul|
|           2025|physics|      fred|       li|
+---------------+-------+----------+---------+



In [126]:
# Add a third CSV file while the streaming query is running.
! cp data/students/students3.csv data/tmp_students_incremental

In [128]:
# Print the Delta table once more after the third file was added.
(
    spark.read.format("delta")
    .load(deltaPath)
    .show()
)

+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2025|    bio|     chris|     borg|
|           2026|physics|     david|    cross|
|           2023|   math|      some|   person|
|           2025|physics|        li|      yao|
|           2022|    bio|    sophia|     raul|
|           2025|physics|      fred|       li|
+---------------+-------+----------+---------+



In [12]:
! rm -rf data/tmp_students_checkpoint
! rm -rf data/tmp_students_delta
! rm -rf data/tmp_students_incremental

22/06/03 14:03:28 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 14:03:30 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 14:03:32 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 14:03:34 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 14:03:36 WARN HadoopFSUtils: The directory file:/Users/powers/Documents/code/my_apps/delta-examples/notebooks/data/tmp_students_incremental was not found. Was it deleted very recently?
22/06/03 14:03:38 WARN HadoopF

## Read streaming CSV data directly

In [6]:
! mkdir data/tmp_students_incremental

In [7]:
# Seed the streaming input directory with the first CSV file.
! cp data/students/students1.csv data/tmp_students_incremental

In [8]:
# Streaming reader over the input directory: explicit schema, first line is a header.
df = spark.readStream.csv(
    "data/tmp_students_incremental",
    schema=schema,
    header=True,
)

In [11]:
# Checkpoint location passed to the console-sink streaming query started below.
checkpointPath = "data/tmp_students_checkpoint/"

In [15]:
# Start a query that prints each micro-batch of normalized rows to the console,
# triggered once per second. Left as the cell's last expression so the
# StreamingQuery handle is displayed.
(
    df.transform(with_normalized_names)
    .writeStream.format("console")
    .trigger(processingTime="1 seconds")
    .option("checkpointLocation", checkpointPath)
    .start()
)

22/06/03 14:15:28 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
22/06/03 14:15:28 WARN StreamingQueryManager: Stopping existing streaming query [id=3cb90a75-80d0-4d39-ad88-ca1e6bd7c525, runId=7fd74226-4536-4b34-a44d-a3a7c1d70326], as a new run is being started.


<pyspark.sql.streaming.StreamingQuery at 0x164e621c0>

In [16]:
# Add a second CSV file; the running console query prints its rows as a new batch.
! cp data/students/students2.csv data/tmp_students_incremental

-------------------------------------------
Batch: 1
-------------------------------------------
+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2022|    bio|    sophia|     raul|
|           2025|physics|      fred|       li|
+---------------+-------+----------+---------+



In [17]:
# Add a third CSV file; the running console query prints its rows as a new batch.
! cp data/students/students3.csv data/tmp_students_incremental

-------------------------------------------
Batch: 2
-------------------------------------------
+---------------+-------+----------+---------+
|graduation_year|  major|first_name|last_name|
+---------------+-------+----------+---------+
|           2025|    bio|     chris|     borg|
|           2026|physics|     david|    cross|
+---------------+-------+----------+---------+

