In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import expr, col, struct, to_json, sum

In [2]:
# Create (or reuse) the local Spark session that hosts the streaming query.
# NOTE(review): "spark.streaming.stopGracefullyOnShutdown" is documented for the
# legacy DStream API -- confirm it has any effect on Structured Streaming queries.
spark = (
    SparkSession.builder
    .appName("Multi Query Demo")
    .master("local[3]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .getOrCreate()
)

In [3]:
# Streaming source: subscribe to the "customer-data-out" topic on the local
# broker, starting from the earliest available offsets.
kafka_source_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "customer-data-out")
    .option("startingOffsets", "earliest")
    .load()
)

In [4]:
# Load the Avro schema definition (passed as a string to from_avro) from disk.
# A context manager guarantees the file handle is closed, instead of leaving
# it open until garbage collection as the bare open(...).read() did.
with open('./schema/customer-data', mode='r') as schema_file:
    avroSchema = schema_file.read()

In [5]:
# Decode the binary Kafka `value` column with the Avro schema and keep the
# decoded record as a single struct column, also named `value`.
value_df = kafka_source_df.select(
    from_avro(col("value"), avroSchema).alias("value")
)

In [6]:
# Inspect the decoded schema: the Avro fields should appear nested under `value`.
value_df.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: integer (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- CreatedTime: long (nullable = true)



In [9]:
# Count customers aged 34 or older, grouped by address.
# `age` is an integer in the decoded schema, so it is compared against a numeric
# literal rather than the string '34' (which only worked via implicit casting).
# The previous trailing .alias("CantidadMayorA34") was removed: DataFrame.alias
# names the DataFrame itself, not a column, so the aggregate column was (and
# still is) called `count` -- the name the later Kafka projection selects.
rewards_df = (
    value_df
    .filter("value.age >= 34")
    .groupBy("value.address")
    .count()
)

In [10]:
# Confirm the aggregate's columns: the grouping key `address` and the `count` column.
rewards_df.printSchema()

root
 |-- address: string (nullable = true)
 |-- count: long (nullable = false)



In [11]:
# Shape the aggregate into the key/value layout used by the Kafka sink:
# the address becomes the message key and the count is serialized as JSON.
kafka_target_df = rewards_df.select(
    expr("address as key"),
    to_json(struct("count")).alias("value"),
)

In [12]:
# Verify the frame exposes exactly the `key` and `value` string columns.
kafka_target_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [11]:
# Action: every transformation above is lazy; starting the query triggers execution.
# A readStream source differs from a batch load(): head() and show() cannot be
# used on a streaming DataFrame.
rewards_writer_query = (
    kafka_target_df.writeStream
    .queryName("Rewards Writer")
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "customer-data-rewards")
    .outputMode("update")
    .option("checkpointLocation", "./checkpoints/customer-data-rewards")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates (stopped or failed with an exception).
rewards_writer_query.awaitTermination()