In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import expr, col, struct, to_json, sum

In [2]:
# Build (or reuse) the Spark session for this demo.
# SparkSession lives in pyspark.sql and has to be imported explicitly so the
# notebook also runs on a plain Python kernel after Restart & Run All.
spark = (
    SparkSession.builder
    .appName("Multi Query Demo")
    .master("local[3]")  # local mode, 3 worker threads
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .getOrCreate()
)

In [3]:
# Streaming source: subscribe to the "invoice-items" Kafka topic on the local
# broker, reading from the earliest available offsets.
kafka_source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "invoice-items")
    .option("startingOffsets", "earliest")
    .load()
)

In [4]:
# Read the Avro schema text that is later handed to from_avro().
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open for the lifetime of the kernel.
with open('./schema/invoice-items', mode='r') as schema_file:
    avroSchema = schema_file.read()

In [5]:
# Decode the Kafka "value" column with the Avro schema and expose the
# resulting struct under the same column name, "value".
value_df = kafka_source_df.select(
    from_avro(col("value"), avroSchema).alias("value")
)

In [6]:
# Show the schema of the decoded stream (a single "value" struct column).
value_df.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- InvoiceNumber: string (nullable = true)
 |    |-- CreatedTime: long (nullable = true)
 |    |-- StoreID: string (nullable = true)
 |    |-- PosID: string (nullable = true)
 |    |-- CustomerType: string (nullable = true)
 |    |-- CustomerCardNo: string (nullable = true)
 |    |-- DeliveryType: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- ItemCode: string (nullable = true)
 |    |-- ItemDescription: string (nullable = true)
 |    |-- ItemPrice: double (nullable = true)
 |    |-- ItemQty: integer (nullable = true)
 |    |-- TotalValue: double (nullable = true)



In [7]:
# Per-card running totals for PRIME customers only:
#   TotalPurchase     - sum of TotalValue
#   AggregatedRewards - sum of (TotalValue * 0.2) cast to integer per row
# Note: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
rewards_df = (
    value_df
    .where("value.CustomerType == 'PRIME'")
    .groupBy("value.CustomerCardNo")
    .agg(
        sum("value.TotalValue").alias("TotalPurchase"),
        sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"),
    )
)

In [8]:
# Show the schema of the aggregated rewards stream.
rewards_df.printSchema()

root
 |-- CustomerCardNo: string (nullable = true)
 |-- TotalPurchase: double (nullable = true)
 |-- AggregatedRewards: long (nullable = true)



In [9]:
# Shape the aggregate into the key/value pair written to Kafka:
# key = card number, value = JSON of the two aggregate columns.
kafka_target_df = rewards_df.select(
    col("CustomerCardNo").alias("key"),
    to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"),
)

In [10]:
# Show the schema of the frame that will be written to Kafka (key, value).
kafka_target_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [11]:
# Action: everything defined in the previous cells is lazy; the stream only
# starts running here.
# A readStream frame differs from a batch load(): head() and show() cannot be
# used on it.
rewards_writer_query = (
    kafka_target_df.writeStream
    .queryName("Rewards Writer")
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "customer-rewards")
    .outputMode("update")
    .option("checkpointLocation", "./checkpoints/customer-rewards")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates.
rewards_writer_query.awaitTermination()