# Stream Customers Data From Cloud Files to Delta Lake
1. Read files from cloud storage using the DataStreamReader API (`cloudFiles` format)
2. Transform the DataFrame to add the following columns
    - `file_path`: cloud path of the source file
    - `ingestion_timestamp`: current timestamp when the record is processed
3. Write the transformed data stream to a Delta Lake table

In [0]:
# Streaming reader over the customers landing directory using the
# "cloudFiles" source with JSON input. Column type inference is enabled,
# and three columns are pinned to explicit types through schema hints.
customers_df = spark.readStream.format("cloudFiles").options(
    **{
        "cloudFiles.format": "json",
        # NOTE(review): the schema location is the same directory that is
        # passed to load() below -- confirm this is intentional.
        "cloudFiles.schemaLocation": "/Volumes/gizmobox/landing/operational_data/customers_autoloader",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.schemaHints": "date_of_birth DATE, member_since DATE, created_timestamp TIMESTAMP",
    }
).load("/Volumes/gizmobox/landing/operational_data/customers_autoloader")

In [0]:
from pyspark.sql.functions import col, current_timestamp

# Add two audit columns in a single pass: the source file path taken from
# the `_metadata.file_path` field, and the current timestamp.
customers_transformed_df = customers_df.withColumns(
    {
        "file_path": col("_metadata.file_path"),
        "ingestion_timestamp": current_timestamp(),
    }
)

In [0]:
# Start writing the transformed stream to the bronze customers table.
# The sink format and the checkpoint directory are supplied through
# toTable's `format` argument and its keyword options.
streaming_query = customers_transformed_df.writeStream.toTable(
    "gizmobox.bronze.customers_autoloader",
    format="delta",
    checkpointLocation="/Volumes/gizmobox/landing/operational_data/customers_autoloader/_checkpoints",
)

In [0]:
# Stop the streaming query held in `streaming_query`.
streaming_query.stop()

In [0]:
%sql
-- Return every column of every row in the bronze customers table
-- so the ingested data can be inspected.
SELECT
  *
FROM
  gizmobox.bronze.customers_autoloader;