## ![Delta Lake Tiny Logo](https://pages.databricks.com/rs/094-YMS-629/images/delta-lake-tiny-logo.png) Delta Lake 
Incrementally processing new data files as they land on a cloud blob store is a common flow in ETL workloads. 
  * Option#1 is to use <b>file-based</b> structured streaming, which identifies new files by repeatedly listing the cloud directory and tracking which files have already been seen. This approach becomes increasingly inefficient as the directory grows.
  * Option#2 is to use <b>file-notification-based </b> structured streaming (for S3 and Azure Blob Storage), which requires manual configuration. 
  * Option#3 is to use <b>Auto Data Loader</b>, which automatically and efficiently processes new data files as they land on the cloud blob store. All the user has to do is provide the source directory path and start a streaming query using the <em>Cloud File Source</em>.

In [2]:
%run "./Include"

### Data Ingestion

In [4]:
# Option 1: file-based streaming source. Reads space-separated CSV files
# from the mounted sensor directory using the predefined sensor schema.
df_direct_s3 = (
    spark.readStream
    .format("csv")
    .schema(sensor_schema)
    .option("sep", " ")
    .load("/mnt/delta-stream/sensor")
)

In [5]:
# Option 2: file-notification-based streaming source. New files are
# announced through the SQS queue at `sqsUrl`; the files themselves are
# space-separated CSV parsed with the predefined sensor schema.
df_s3_sqs = (
    spark.readStream
    .format("s3-sqs")
    .schema(sensor_schema)
    .option("queueUrl", sqsUrl)
    .option("fileFormat", "csv")
    .option("sep", " ")
    .load()
)

In [6]:
# One-time setup (left disabled): mount the bucket read by the stream below.
#dbutils.fs.mount("s3a://delta-autoloader","/mnt/delta-autoload")

# Option 3: Auto Loader ("cloudFiles") streaming source over the mounted
# sensor directory; files are space-separated CSV with the sensor schema.
df_autoloader = (
    spark.readStream
    .format("cloudFiles")
    .schema(sensor_schema)
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.region", "us-west-2")
    .option("sep", " ")
    .load("/mnt/delta-autoload/sensor")
)

In [7]:
# Pick the streaming source that matches how your environment is configured:
#   Option 1: df = df_direct_s3   -- read directly from S3
#   Option 2: df = df_s3_sqs      -- read through the s3-sqs connector
#   Option 3: df = df_autoloader  -- Auto Loader on the Databricks platform,
#                                    which automatically provisions SQS/SNS
df = df_s3_sqs

# Preview the selected stream in the notebook.
display(df)

unit_num,cycle_time,ops_1,ops_2,ops_3,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,s_10,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
5,154,-0.0019,-0.0001,100.0,518.67,642.24,1581.11,1394.88,14.62,21.61,554.47,2387.99,9072.68,1.3,47.44,521.91,2388.02,8152.63,8.4069,0.03,394.0,2388.0,100.0,39.13,23.4176
5,155,0.0005,-0.0002,100.0,518.67,642.32,1584.17,1399.99,14.62,21.61,554.56,2388.01,9079.84,1.3,47.32,521.83,2387.99,8155.6,8.4703,0.03,393.0,2388.0,100.0,38.76,23.3817
5,156,-0.0012,-0.0003,100.0,518.67,642.24,1590.06,1401.26,14.62,21.61,553.05,2388.05,9077.82,1.3,47.37,522.19,2388.03,8155.77,8.4005,0.03,393.0,2388.0,100.0,38.99,23.2654
5,157,0.0006,0.0001,100.0,518.67,642.13,1589.32,1404.11,14.62,21.61,554.02,2388.02,9076.39,1.3,47.44,521.76,2388.0,8159.66,8.412,0.03,393.0,2388.0,100.0,39.05,23.3554
5,158,-0.0001,-0.0005,100.0,518.67,641.88,1592.3,1400.2,14.62,21.61,553.82,2388.06,9078.0,1.3,47.23,521.6,2388.0,8162.37,8.4106,0.03,393.0,2388.0,100.0,38.96,23.2331
5,159,-0.0007,-0.0001,100.0,518.67,642.42,1585.25,1410.83,14.62,21.61,553.99,2387.99,9079.84,1.3,47.45,521.64,2388.04,8149.84,8.3972,0.03,392.0,2388.0,100.0,39.09,23.3098
5,160,0.0022,0.0004,100.0,518.67,642.28,1589.77,1407.02,14.62,21.61,553.84,2388.02,9082.33,1.3,47.11,521.65,2388.01,8156.5,8.4285,0.03,394.0,2388.0,100.0,38.7,23.3987
5,161,0.0019,-0.0005,100.0,518.67,642.04,1587.84,1408.28,14.62,21.61,553.85,2387.97,9076.65,1.3,47.47,522.19,2388.04,8154.94,8.4065,0.03,392.0,2388.0,100.0,38.94,23.3001
5,162,-0.0017,0.0004,100.0,518.67,642.07,1586.84,1401.45,14.62,21.61,554.61,2388.05,9078.05,1.3,47.29,522.25,2388.0,8160.48,8.4047,0.03,392.0,2388.0,100.0,38.9,23.3782
5,163,-0.001,0.0004,100.0,518.67,642.63,1594.42,1403.54,14.62,21.61,553.72,2388.03,9072.23,1.3,47.49,521.72,2388.0,8164.66,8.4168,0.03,392.0,2388.0,100.0,38.94,23.2216


In [8]:
(
    df.writeStream
    # Store the output in the delta format.
    .format("delta")
    # Append new records to the output path.
    .outputMode("append")
    # Name of the streaming query.
    .queryName("delta_bronze")
    # Where the stream logs its metadata.
    .option("checkpointLocation", bronzeCheckpointPath)
    # Where the bronze data is written.
    .option("path", bronzeOutPath)
    # Kick off the streaming write.
    .start()
)

In [9]:
# Register the bronze Delta output path as a table (no-op if it already
# exists) so it can be queried by name.
bronze_query = f'CREATE TABLE IF NOT EXISTS {databaseName}.bronze_stream USING delta LOCATION "{bronzeOutPath}"'
print(bronze_query)
sqlContext.sql(bronze_query)

<a href="$./1a-Read Bronze">Link to 1a-Read Bronze</a>

In [11]:
# Uncomment the next line to stop every active streaming query when you are done.
#[q.stop() for q in spark.streams.active]