In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.avro.functions import to_avro
from pyspark.sql.functions import from_json, col, expr, struct
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType, DoubleType, IntegerType

In [2]:
# Create (or reuse) the Spark session used by the rest of the notebook.
# master("local[3]") runs Spark locally with three worker threads.
spark = (
    SparkSession.builder
    .appName("Multi Query Demo")
    .master("local[3]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .getOrCreate()
)

In [3]:
# Schema applied to the JSON text carried in each Kafka record's value.
# Every field is nullable (the StructType.add default), matching StructField's default.
schema = (
    StructType()
    .add("id", StringType())
    .add("name", StringType())
    .add("age", IntegerType())
    .add("address", StringType())
    .add("email", StringType())
    .add("CreatedTime", LongType())
)

In [4]:
# Streaming source: subscribe to the "consumer-data" topic on the local broker,
# with startingOffsets set to "earliest".
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "consumer-data")
    .option("startingOffsets", "earliest")
    .load()
)

In [5]:
# Cast the Kafka `value` column to a string and parse it as JSON with `schema`;
# the parsed struct is kept under the column name "value".
value_df = kafka_df.select(
    from_json(
        col("value").cast("string"),
        schema,
    ).alias("value")
)

In [6]:
# Promote the listed fields of the parsed `value` struct to top-level columns.
exp_df = value_df.selectExpr(
    *(
        f"value.{field}"
        for field in ("id", "name", "age", "address", "email", "CreatedTime")
    )
)

In [7]:
# Print the schema of the flattened DataFrame to check column names and types.
exp_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- email: string (nullable = true)
 |-- CreatedTime: long (nullable = true)



In [8]:
# Shape the rows for the Kafka sink: `key` is the `id` column, and `value` is
# a struct of all columns serialized to Avro.
kafka_target_df = exp_df.select(
    col("id").alias("key"),
    to_avro(struct("*")).alias("value"),
)

In [None]:
# For file sinks the output mode must always be append.
# Start the streaming query that writes the key/value rows to the
# "customer-data-out" Kafka topic, checkpointing under ./checkpoints.
invoice_writer_query = (
    kafka_target_df.writeStream
    .queryName("Flattened Customer Data")
    .format("kafka")
    .outputMode("append")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "customer-data-out")
    .option("checkpointLocation", "./checkpoints/customer-data-avro")
    .start()
)

In [None]:
# Block this cell until the streaming query terminates (stopped or failed).
invoice_writer_query.awaitTermination()