In [79]:
from delta.tables import DeltaTable
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, StructField, DateType, TimestampType, StringType, LongType, IntegerType, FloatType
from pyspark.sql.functions import col, when, lit, from_json, count, sum, min, max, avg, desc

In [78]:
# example 11-5
# Open the bronze (raw) Delta table by its storage location.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# is injected by the kernel/environment; confirm before running on a fresh kernel.
bronze_table_path = '/opt/spark/work-dir/data/delta/bronze_raw/'
dt_bronze_source: DeltaTable = DeltaTable.forPath(
    sparkSession=spark,
    path=bronze_table_path,
)


In [44]:
# Expose the bronze Delta table as a DataFrame and preview a single row.
bronze_df: DataFrame = dt_bronze_source.toDF()
bronze_df.limit(1).show()

+----+--------------------+--------------------+--------------------+----------+
| key|               value|               topic|           timestamp|event_date|
+----+--------------------+--------------------+--------------------+----------+
|null|[7B 22 65 76 65 6...|ecomm.v1.clickstream|2023-10-08 22:09:...|2023-10-08|
+----+--------------------+--------------------+--------------------+----------+



In [45]:
# Inspect the bronze column names and types before parsing the payload.
bronze_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- event_date: date (nullable = true)



## Testing Transformations in Batch
> Tip: Before we dive into full end-to-end streaming, we can attack the problem in batch.

Once things are working as expected, we can consolidate the transformation steps and logic into a `transform` method with the signature `def transform_[some_qualifier](df: DataFrame) -> DataFrame:`. This lets us easily apply specific transformations in our pipelines that are a) testable, and b) easy to follow when looking directly at the pipeline.

~~~python
medallion_stream = (
    delta_source.readStream
    .options(**reader_options)
    .format("delta")
    .load()
    .transform(transform_from_json)
    .transform(transform_for_silver)
    .writeStream
    .format("delta")
    .options(**writer_options))

streaming_query = medallion_stream.toTable(f"{managed_table_name}")

spark.streams.awaitAnyTermination()
    
~~~

In [52]:
## Supply the ECommerce Schema for Permissive Passthrough
# from example 11-4

"""
public static final RowType ECOMMERCE_ROW_TYPE = new RowType(
          Arrays.asList(
                  new RowType.RowField("event_time", new VarCharType(VarCharType.MAX_LENGTH)),
                  new RowType.RowField("event_type", new VarCharType(VarCharType.MAX_LENGTH)),
                  new RowType.RowField("product_id", new IntType()),
                  new RowType.RowField("category_id", new BigIntType()),
                  new RowType.RowField("category_code", new VarCharType(VarCharType.MAX_LENGTH)),
                  new RowType.RowField("brand", new VarCharType(VarCharType.MAX_LENGTH)),
                  new RowType.RowField("price", new FloatType()),
                  new RowType.RowField("user_id", new IntType()),
                  new RowType.RowField("user_session", new VarCharType(VarCharType.MAX_LENGTH))
          ));
"""

# Spark counterpart of the row type quoted above: the same nine fields in the
# same order, plus a trailing nullable string column (`_corrupt`) that carries
# records the JSON parser could not handle.
known_schema: StructType = StructType([
    StructField("event_time", StringType()),
    StructField("event_type", StringType()),
    StructField("product_id", IntegerType()),
    StructField("category_id", LongType()),
    StructField("category_code", StringType()),
    StructField("brand", StringType()),
    StructField("price", FloatType()),
    StructField("user_id", IntegerType()),
    StructField("user_session", StringType()),
    StructField(
        "_corrupt",
        StringType(),
        nullable=True,
        metadata={
            'comment': 'invalid rows go into _corrupt rather than simply being dropped'
        },
    ),
])

def transform_from_json(input_df: DataFrame) -> DataFrame:
    """Parse the `value` column as JSON into a new struct column named `ecomm`.

    The `value` column is cast to a string and parsed against `known_schema`.
    The parser is asked for PERMISSIVE mode with `_corrupt` named as the
    corrupt-record column, matching the `_corrupt` field in `known_schema`.

    Args:
        input_df: DataFrame that has a `value` column holding the JSON payload.

    Returns:
        `input_df` with one additional column, `ecomm`.
    """
    parser_options = {
        'mode': 'PERMISSIVE',
        'columnNameOfCorruptRecord': '_corrupt',
    }
    payload_text = col("value").cast(StringType())
    parsed_payload = from_json(payload_text, known_schema, options=parser_options)
    return input_df.withColumn("ecomm", parsed_payload)

# Batch check: apply the JSON-parsing step to the bronze rows.
transformed_df: DataFrame = bronze_df.transform(transform_from_json)

In [60]:
# Preview the bronze rows together with the parsed `ecomm` struct column.
transformed_df.show()

+----+--------------------+--------------------+--------------------+----------+--------------------+
| key|               value|               topic|           timestamp|event_date|               ecomm|
+----+--------------------+--------------------+--------------------+----------+--------------------+
|null|[7B 22 65 76 65 6...|ecomm.v1.clickstream|2023-10-08 22:07:...|2023-10-08|{2023-08-30T00:00...|
|null|[7B 22 65 76 65 6...|ecomm.v1.clickstream|2023-10-08 22:09:...|2023-10-08|{2023-09-21T00:00...|
+----+--------------------+--------------------+--------------------+----------+--------------------+



In [73]:
def transform_for_silver(input_df: DataFrame) -> DataFrame:
    """Flatten the parsed `ecomm` struct into rows suitable for the silver table.

    Expects the `ecomm` struct column produced by `transform_from_json`, plus
    the bronze `event_date` and `timestamp` columns.

    Rows are removed when either:
      * `ecomm` itself is NULL. `from_json` can return a NULL struct (for
        example when `value` is NULL); such rows never populate `_corrupt`, so
        without this guard they would pass the corrupt-record filter and show
        up as events whose fields are all NULL.
      * `_corrupt` is populated, i.e. the parser flagged the record as invalid.

    Args:
        input_df: DataFrame containing `event_date`, `timestamp` and `ecomm`.

    Returns:
        DataFrame with `ingest_date`, `ingest_timestamp` and the fields of
        `ecomm` as top-level columns, without the `_corrupt` column.
    """
    return (
        input_df
        # Must run before the select: once `ecomm.*` is expanded, a NULL struct
        # is indistinguishable from a record whose fields are all NULL.
        .where(col("ecomm").isNotNull())
        .select(
            col("event_date").alias("ingest_date"),
            col("timestamp").alias("ingest_timestamp"),
            col("ecomm.*")
        )
        .where(col("_corrupt").isNull())
        .drop("_corrupt")
    )

# Bronze -> parsed JSON -> silver-ready rows, one `.transform` call per stage.
for_silver_df: DataFrame = (
    bronze_df
    .transform(transform_from_json)
    .transform(transform_for_silver))

In [75]:
# Preview the flattened rows destined for the silver table.
for_silver_df.show()

+-----------+--------------------+--------------------+----------+----------+-------------------+--------------------+-------+------+-------+--------------------+
|ingest_date|    ingest_timestamp|          event_time|event_type|product_id|        category_id|       category_code|  brand| price|user_id|        user_session|
+-----------+--------------------+--------------------+----------+----------+-------------------+--------------------+-------+------+-------+--------------------+
| 2023-10-08|2023-10-08 22:07:...|2023-08-30T00:00:00Z|      view|      4782|2053013552326770905|appliances.enviro...|heater3|2789.0|    195|19ae88e1-4a02-4b5...|
| 2023-10-08|2023-10-08 22:09:...|2023-09-21T00:00:00Z|      view|      4783|2051113552326770905|appliances.televi...|   sony|2789.0|    196|19ae88e1-4a02-4b5...|
+-----------+--------------------+--------------------+----------+----------+-------------------+--------------------+-------+------+-------+--------------------+



In [84]:
# example 11-7
topN = (
    for_silver_df
    .groupBy("ingest_date", "category_id")
    .agg(
        count(col("product_id")).alias("impressions"),
        min(col("price")).alias("min_price"),
        max(col("price")).alias("max_price"),
        avg(col("price")).alias("avg_price")
    )
    .orderBy(desc("impressions"))
    .limit(5)
)

In [83]:
# Display the aggregated top categories.
topN.show()

+-----------+-------------------+-----------+---------+---------+---------+
|ingest_date|        category_id|impressions|min_price|max_price|avg_price|
+-----------+-------------------+-----------+---------+---------+---------+
| 2023-10-08|2053013552326770905|          1|   2789.0|   2789.0|   2789.0|
| 2023-10-08|2051113552326770905|          1|   2789.0|   2789.0|   2789.0|
+-----------+-------------------+-----------+---------+---------+---------+

