# Stream Customers Data From Cloud Files to Delta Lake
1. Read files from cloud storage using DataStreamReader API
3. Transform the DataFrame to add the following columns
    - file_path: cloud file path of the source file (taken from `_metadata.file_path`)
    - ingestion_timestamp: current timestamp at ingestion
3. Write the transformed data stream to Delta Lake Table

In [0]:
from pyspark.sql.types import *

# Explicit schema for the customer JSON records. Every field keeps the
# default nullability (nullable=True), exactly as a bare StructField would.
customers_schema = (
    StructType()
    .add("customer_id", StringType())
    .add("customer_name", StringType())
    .add("date_of_birth", DateType())
    .add("telephone", StringType())
    .add("email", StringType())
    .add("member_since", DateType())
    .add("created_timestamp", TimestampType())
)

# Streaming read of the JSON files in the landing volume, parsed with the
# explicit customers_schema rather than an inferred one.
customers_df = spark.readStream.schema(customers_schema).json(
    "/Volumes/gizmobox/landing/operational_data/customers_stream"
)

In [0]:
from pyspark.sql.functions import col, current_timestamp

# Add two audit columns to the streamed customer records.
customers_transformed_df = (
    customers_df
    # Path of the source file, read from the `_metadata` column.
    .withColumn("file_path", col("_metadata.file_path"))
    # Timestamp produced by current_timestamp() when the data is processed.
    .withColumn("ingestion_timestamp", current_timestamp())
)

In [0]:
# Start the streaming write into the bronze Delta table and keep the query
# handle so it can be stopped later.
# NOTE(review): the checkpoint directory sits inside the directory the stream
# reads from; presumably the leading underscore keeps the file source from
# treating it as input -- confirm before reusing this layout.
streaming_query = customers_transformed_df.writeStream.toTable(
    "gizmobox.bronze.customers_stream",
    format="delta",
    checkpointLocation="/Volumes/gizmobox/landing/operational_data/customers_stream/_checkpoint_stream",
)

In [0]:
# Stop the streaming query held in `streaming_query`.
streaming_query.stop()

In [0]:
%sql
-- Show every column and row of the bronze customers_stream table.
SELECT * FROM gizmobox.bronze.customers_stream;