In [0]:
%run ./GenerateStreamingData-Autoloader

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Preparing environment

In [0]:
%sql
-- Select the demo catalog, make sure the bronze schema exists, and make it the default.
USE CATALOG learn_adb_fikrat;
CREATE SCHEMA IF NOT EXISTS bronze;
USE bronze;

In [0]:
# Landing area read by Auto Loader; sensor files live in its `sensor` subfolder.
root_data_folder = "/Volumes/learn_adb_fikrat/bronze/landing/autoloader"
sensor_data_folder = f"{root_data_folder}/sensor"

# Every checkpoint used in this notebook hangs off this root.
checkpoint_root_path = "/Volumes/learn_adb_fikrat/bronze/ext_landing_volume/streaming-checkpoints/autoloader"

# Checkpoint of the basic read/write stream, plus the schema location that all
# Auto Loader readers in this notebook share (nested inside that checkpoint).
checkpoint_path_sensor = f"{checkpoint_root_path}/iot_measurements"
schema_path_sensor = f"{checkpoint_path_sensor}/schema"

# One checkpoint per schema-evolution demo stream (iot_measurements1..4).
(
    checkpoint_path_sensor1,
    checkpoint_path_sensor2,
    checkpoint_path_sensor3,
    checkpoint_path_sensor4,
) = (f"{checkpoint_root_path}/iot_measurements{index}" for index in range(1, 5))



In [0]:
def drop_all_tables():
    """Drop every demo target table in the ``bronze`` schema if it exists."""
    table_prefix = "bronze.iot_measurements_autoloader"
    # "" is the basic read/write table; "1".."4" are the schema-evolution tables.
    for suffix in ("", "1", "2", "3", "4"):
        spark.sql(f"DROP TABLE IF EXISTS {table_prefix}{suffix}")

def reset_checkpoints():
    """Recursively remove all checkpoint folders and the shared schema folder."""
    targets = (
        checkpoint_path_sensor,
        schema_path_sensor,
        checkpoint_path_sensor1,
        checkpoint_path_sensor2,
        checkpoint_path_sensor3,
        checkpoint_path_sensor4,
    )
    for target in targets:
        # Second argument True = recursive delete.
        dbutils.fs.rm(target, True)

In [0]:
def reset_environment(root_data_folder):
    """Return the demo to a clean state before a new section starts.

    Stops every streaming query still active on this Spark session, then drops
    the demo tables, removes the checkpoint/schema folders and calls
    ``prepare_data`` for ``root_data_folder``.

    Args:
        root_data_folder: Folder passed through unchanged to ``prepare_data``.
    """
    # Earlier cells start streams via ``toTable``/``display`` and never stop
    # them. Deleting their checkpoints and target tables while they are still
    # running pulls state out from under live queries, so stop them first.
    for query in spark.streams.active:
        query.stop()

    drop_all_tables()
    reset_checkpoints()
    prepare_data(root_data_folder)

# Basic streaming operations: read and write

### Basic read/write

In [0]:
# Reset the demo state, then generate the initial sensor data.
# NOTE(review): the generator comes from the %run notebook (not visible here);
# the numeric argument is presumably a batch/day count — confirm.
reset_environment(root_data_folder)
gernerate_persist_streaming_data(
    datetime(2025, 1, 3),
    2,
    root_data_folder,
)

In [0]:
# Define an Auto Loader ("cloudFiles") streaming read over the JSON sensor
# files, using schema_path_sensor as the schema location.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .load(sensor_data_folder)
)
# display(dfSensor)

In [0]:
# Basic ingestion: Auto Loader source written straight into a bronze table,
# checkpointed under checkpoint_path_sensor. `dfs` holds what toTable() returns.
dfs = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .load(sensor_data_folder)
    .writeStream
    .option("checkpointLocation", checkpoint_path_sensor)
    .toTable("bronze.iot_measurements_autoloader")
)

In [0]:
%sql
-- Browse the rows written by the basic read/write stream.
-- Variant (rescued rows only): add  WHERE `_rescued_data` IS NOT NULL
-- Variant (summary): SELECT min(EventTime), max(EventTime), count(*) FROM bronze.iot_measurements_autoloader
SELECT *
FROM bronze.iot_measurements_autoloader

In [0]:
# Generate further data with the "enhanced" generator from the %run notebook.
# NOTE(review): presumably these files carry additional columns — confirm.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    3,
    root_data_folder,
)

## Schema evolution

Options for `cloudFiles.schemaEvolutionMode`:
- addNewColumns (default)
- rescue
- failOnNewColumns
- none

### addNewColumns option

In [0]:
# Fresh state for the addNewColumns demo, followed by the initial data set.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
reset_environment(root_data_folder)
gernerate_persist_streaming_data(
    datetime(2025, 1, 3),
    2,
    root_data_folder,
)

In [0]:
# Schema evolution mode "addNewColumns" with column type inference enabled;
# the writer sets mergeSchema and targets table ...autoloader1.
# Note: despite its name, dfSensor holds the return value of toTable() here.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.inferColumnTypes", True)
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    .load(sensor_data_folder)
    .writeStream
    .option("checkpointLocation", checkpoint_path_sensor1)
    .option("mergeSchema", True)
    .toTable("bronze.iot_measurements_autoloader1")
)

In [0]:
%sql
-- Browse the table fed by the addNewColumns stream.
-- Variant (rescued rows only): add  WHERE `_rescued_data` IS NOT NULL
SELECT *
FROM bronze.iot_measurements_autoloader1

In [0]:
# Generate "enhanced" data while the addNewColumns stream is set up.
# NOTE(review): presumably introduces new columns to trigger evolution — confirm.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

### failOnNewColumns option

In [0]:
# Fresh state for the failOnNewColumns demo, followed by the initial data set.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
reset_environment(root_data_folder)
gernerate_persist_streaming_data(
    datetime(2025, 1, 3),
    1,
    root_data_folder,
)

In [0]:
# Schema evolution mode "failOnNewColumns"; writes to table ...autoloader2.
# Note: despite its name, dfSensor holds the return value of toTable() here.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.schemaEvolutionMode", "failOnNewColumns")
    .load(sensor_data_folder)
    .writeStream
    .option("checkpointLocation", checkpoint_path_sensor2)
    .option("mergeSchema", True)
    .toTable("bronze.iot_measurements_autoloader2")
)

In [0]:
%sql
-- Browse the table fed by the failOnNewColumns stream.
SELECT *
FROM bronze.iot_measurements_autoloader2

In [0]:
# Generate "enhanced" data for the failOnNewColumns demo.
# NOTE(review): presumably introduces new columns — confirm in the %run notebook.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

### Rescue option

In [0]:
# Fresh state for the rescue demo, followed by the initial data set.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
reset_environment(root_data_folder)
gernerate_persist_streaming_data(
    datetime(2025, 1, 3),
    1,
    root_data_folder,
)

In [0]:
# Schema evolution mode "rescue"; writes to table ...autoloader3.
# Left as a bare expression so the cell shows what toTable() returns.
(
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .load(sensor_data_folder)
    .writeStream
    .option("checkpointLocation", checkpoint_path_sensor3)
    .option("mergeSchema", True)
    .toTable("bronze.iot_measurements_autoloader3")
)

In [0]:
%sql
-- Browse the table fed by the rescue-mode stream.
-- Variant (rescued rows only): add  WHERE `_rescued_data` IS NOT NULL
SELECT *
FROM bronze.iot_measurements_autoloader3

In [0]:
# Generate "enhanced" data for the rescue demo.
# NOTE(review): presumably introduces new columns — confirm in the %run notebook.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

### None option

In [0]:
# Fresh state for the "none" demo, followed by the initial data set.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
reset_environment(root_data_folder)
gernerate_persist_streaming_data(
    datetime(2025, 1, 3),
    1,
    root_data_folder,
)

In [0]:
# Schema evolution mode "none"; writes to table ...autoloader4.
# Left as a bare expression so the cell shows what toTable() returns.
(
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.schemaEvolutionMode", "none")
    .load(sensor_data_folder)
    .writeStream
    .option("checkpointLocation", checkpoint_path_sensor4)
    .option("mergeSchema", True)
    .toTable("bronze.iot_measurements_autoloader4")
)

In [0]:
%sql
-- Browse the table fed by the stream that uses schema evolution mode "none".
SELECT *
FROM bronze.iot_measurements_autoloader4

In [0]:
# Generate "enhanced" data for the "none" demo.
# NOTE(review): presumably introduces new columns — confirm in the %run notebook.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

## Inferring column data types

In [0]:
# Generate "enhanced" data to feed the type-inference examples that follow.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

In [0]:
# Rescue mode with column type inference explicitly switched off.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .option("cloudFiles.inferColumnTypes", False)
    .load(sensor_data_folder)
)
display(dfSensor)

In [0]:
# Rescue mode with column type inference switched on.
#
# Auto Loader reuses a schema that is already persisted at the configured
# schema location instead of inferring it again. The shared schema_path_sensor
# was populated by earlier streams that ran without type inference, so pointing
# this reader at it would hide the effect of inferColumnTypes. A dedicated
# location forces a fresh inference. It sits under checkpoint_path_sensor, so
# reset_checkpoints() removes it with the rest of the demo state.
schema_path_sensor_typed = f"{schema_path_sensor}_inferred_types"

dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor_typed)
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .option("cloudFiles.inferColumnTypes", True)
    .load(sensor_data_folder)
)
display(dfSensor)

In [0]:
# Rescue mode with schema hints: EventTime as timestamp, Pressure as double.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .option("cloudFiles.schemaHints", "EventTime timestamp,Pressure double")
    .load(sensor_data_folder)
)
display(dfSensor)

## Common configuration options

- cloudFiles.includeExistingFiles
- modifiedAfter, modifiedBefore

In [0]:
# Rescue mode with cloudFiles.includeExistingFiles turned off.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("cloudFiles.includeExistingFiles", False)
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .load(sensor_data_folder)
)
display(dfSensor)

In [0]:
# Generate more "enhanced" data after the includeExistingFiles=False reader is defined.
# NOTE(review): generator semantics live in the %run notebook — confirm arguments.
gernerate_persist_streaming_data_enhanced(
    datetime(2025, 1, 8),
    1,
    root_data_folder,
)

In [0]:
# Rescue mode restricted by the modifiedBefore option, then filtered to events
# whose EventTime (cast to timestamp) is earlier than 2025-01-08.
dfSensor = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_path_sensor)
    .option("modifiedBefore", "2025-01-23T09:00:00.000 UTC-5:00")
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .load(sensor_data_folder)
    # This notebook imports pyspark.sql.functions only as `F`; use F.col so the
    # cell does not depend on a bare `col` leaking in from the %run notebook.
    .filter(F.col("EventTime").cast("timestamp") < datetime(2025, 1, 8))
)
display(dfSensor)