In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType
from pyspark.sql.functions import from_json, col, lit, current_timestamp

In [None]:
# Build (or reuse) the notebook's Spark session. The Kafka source for
# Structured Streaming is not bundled with Spark, so it is requested as Maven
# coordinates through spark.jars.packages.
# NOTE(review): kafka-clients is pinned explicitly next to the connector;
# confirm this version is the one intended for connector 3.5.5.
spark = (
    SparkSession.builder
    .appName("KafkaSpark")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.5,org.apache.kafka:kafka-clients:3.9.1",
    )
    .master("yarn")
    .getOrCreate()
)

# Reduce driver log output to warnings and above.
spark.sparkContext.setLogLevel("WARN")

In [None]:
# Options passed to the Kafka source: the broker to connect to, the topic to
# subscribe to, and where to begin reading ("latest" skips records that were
# already in the topic when the query starts).
kafka_options = dict([
    ("kafka.bootstrap.servers", "kafka:9092"),
    ("subscribe", "wiki"),
    ("startingOffsets", "latest"),
])

In [None]:
# Open a streaming DataFrame over the Kafka topic described by kafka_options.
# Nothing is consumed until a streaming query is started on it.
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .load()
)

In [None]:
# Fields extracted from the JSON document carried in each record's value.
# NOTE(review): IntegerType is a 32-bit integer; confirm the producer's
# "timestamp" values always fit (e.g. epoch seconds rather than milliseconds).
schema = (
    StructType()
    .add("id", StringType())
    .add("wiki", StringType())
    .add("timestamp", IntegerType())
    .add("bot", BooleanType())
)

# Cast the Kafka key and value to strings, then parse the value as JSON into a
# single struct column named "data". Only "value" is used by the second select;
# the casted key is not carried forward.
parsed_df = (
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
)

In [None]:
# Lift each parsed field out of the nested "data" struct so it becomes a
# top-level column carrying the field's own name.
final_df = parsed_df.select(
    *[
        col(f"data.{field}").alias(field)
        for field in ("id", "wiki", "timestamp", "bot")
    ]
)

In [None]:
# Start a streaming query that prints every new row to the console sink,
# without truncating long column values.
query = (
    final_df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

try:
    # Blocks this cell until the query stops or fails.
    query.awaitTermination()
finally:
    # Interrupting the kernel (or a query failure) leaves awaitTermination()
    # via an exception; without an explicit stop the query would keep running
    # in the background. Calling stop() on a query that is no longer active is
    # harmless.
    query.stop()