Consumindo o conteúdo de um tópico do Kafka com Spark Streaming

In [0]:
# Open a streaming read on the Kafka topic "primeiro_topico" via the broker
# at localhost:9092. "startingOffsets" = "earliest" asks the source to begin
# from the oldest offsets available rather than only new messages.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "primeiro_topico")
    .option("startingOffsets", "earliest")
    .load()
)

# Show the raw stream in the notebook; no decoding is applied to the
# key/value columns at this point.
display(df)

key,value,topic,partition,offset,timestamp,timestampType
,eyJpZCI6IDEsICJub21lIjogIk1hdGhldXMiLCAiaWRhZGUiOiAyN30=,primeiro_topico,0,0,2023-10-04T00:15:11.192+0000,0
,eyJpZCI6IDEsICJub21lIjogIk1hdGhldXMiLCAiaWRhZGUiOiAzMH0=,primeiro_topico,0,1,2023-10-04T00:16:20.777+0000,0
,eyJpZCI6IDEsICJub21lIjogIk1hdGhldXMiLCAiaWRhZGUiOiAzMH0=,primeiro_topico,0,2,2023-10-04T00:22:44.013+0000,0
,eyJpZCI6IDEsICJub21lIjogIk1hdGhldXMiLCAiaWRhZGUiOiAxMH0=,primeiro_topico,0,3,2023-10-04T00:55:43.096+0000,0
,eyJpZCI6IDEsICJub21lIjogIkRhbmlsbyIsICJpZGFkZSI6IDEwfQ==,primeiro_topico,0,4,2023-10-04T01:03:40.677+0000,0
,WyJ7XCJpZFwiOlwiSmVzc2ljYSBGcmVlYm9yblwiLFwibmFtZVwiOlwiU2hvcnQgYnVyc3RzIG9mIGV4ZXJjaXNlIG1heSBsb3dlciB0aGUgcmlzayBvZiBoZWFydCBhdHRhY2sgYW5kIHN0cm9rZVwiLFwiYXV0aG9yXCI6XCI= (truncated),primeiro_topico,0,5,2023-10-04T01:42:16.044+0000,0
,WyJ7XCJpZFwiOlwiSmVzc2ljYSBGcmVlYm9yblwiLFwibmFtZVwiOlwiU2hvcnQgYnVyc3RzIG9mIGV4ZXJjaXNlIG1heSBsb3dlciB0aGUgcmlzayBvZiBoZWFydCBhdHRhY2sgYW5kIHN0cm9rZVwiLFwiYXV0aG9yXCI6XCI= (truncated),primeiro_topico,0,6,2023-10-04T01:46:06.741+0000,0
,WyJ7XCJpZFwiOlwiSmVzc2ljYSBGcmVlYm9yblwiLFwibmFtZVwiOlwiU2hvcnQgYnVyc3RzIG9mIGV4ZXJjaXNlIG1heSBsb3dlciB0aGUgcmlzayBvZiBoZWFydCBhdHRhY2sgYW5kIHN0cm9rZVwiLFwiYXV0aG9yXCI6XCI= (truncated),primeiro_topico,0,7,2023-10-04T01:47:11.613+0000,0
,WyJ7XCJpZFwiOlwiSmVzc2ljYSBGcmVlYm9yblwiLFwibmFtZVwiOlwiU2hvcnQgYnVyc3RzIG9mIGV4ZXJjaXNlIG1heSBsb3dlciB0aGUgcmlzayBvZiBoZWFydCBhdHRhY2sgYW5kIHN0cm9rZVwiLFwiYXV0aG9yXCI6XCI= (truncated),primeiro_topico,0,8,2023-10-04T01:49:28.779+0000,0
,IntcImlkXCI6XCJKZXNzaWNhIEZyZWVib3JuXCIsXCJuYW1lXCI6XCJTaG9ydCBidXJzdHMgb2YgZXhlcmNpc2UgbWF5IGxvd2VyIHRoZSByaXNrIG9mIGhlYXJ0IGF0dGFjayBhbmQgc3Ryb2tlXCIsXCJhdXRob3JcIjpcIkE= (truncated),primeiro_topico,0,9,2023-10-04T01:52:14.473+0000,0


Consumindo e transformando o conteúdo de um tópico do Kafka com o Spark Streaming

In [0]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Streaming source: subscribe to the Kafka topic "primeiro_topico" on the
# local broker, starting from the earliest available offsets. All source
# settings are passed in a single options() call.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "primeiro_topico",
        "startingOffsets": "earliest",
    })
    .load()
)

# Schema applied when parsing the JSON payload: every field is declared as a
# nullable string, in the order listed here.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "id",
        "name",
        "author",
        "title",
        "description",
        "url",
        "urlToImage",
        "publishedAt",
        "content",
    )
])

# 1. Cast the Kafka `value` column to a string.
# 2. Parse that string as JSON using `schema`, keeping the result as a single
#    struct column named "data".
# 3. Expand the struct so each schema field becomes a top-level column.
df_res = (
    df
    .selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
)

# Sink: write the parsed rows as parquet files under
# /FileStore/streaming_data/data, with the checkpoint kept in
# /FileStore/checkpoint_data/_checkpoint.
# awaitTermination() is chained onto start(), so this cell waits on the
# running query instead of returning right away.
(
    df_res.writeStream
    .format("parquet")
    .option("path", "/FileStore/streaming_data/data")
    .option("checkpointLocation", "/FileStore/checkpoint_data/_checkpoint")
    .start()
    .awaitTermination()
)