In [55]:
from pyspark.sql.functions import explode, split, from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

In [28]:
import os
from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook.
# The spark.jars.packages setting requests the Kafka source connector
# (spark-sql-kafka-0-10, Scala 2.12 build, version 3.5.0).
spark = (
    SparkSession.builder
    .appName("StreamingTest")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    )
    .getOrCreate()
)

In [53]:
# Kafka-backed streaming source: one subscribed topic, record headers requested.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "bluesky-a1cb100f")
    .option("includeHeaders", "true")
    .load()
)

In [56]:
# Schema of the JSON document carried in the Kafka message value.
# It is assembled inside-out: record -> commit -> top-level event.
# Every field is declared nullable.

# Innermost object: value.commit.record
record_schema = StructType([
    StructField("$type", StringType(), True),
    StructField("createdAt", StringType(), True),
    StructField("text", StringType(), True),
])

# Middle object: value.commit
commit_schema = StructType([
    StructField("rev", StringType(), True),
    StructField("operation", StringType(), True),
    StructField("collection", StringType(), True),
    StructField("rkey", StringType(), True),
    StructField("record", record_schema, True),
    StructField("cid", StringType(), True),
])

# Top-level event
schema = StructType([
    StructField("did", StringType(), True),
    StructField("time_us", LongType(), True),
    StructField("kind", StringType(), True),
    StructField("commit", commit_schema, True),
])

# Cast the Kafka `value` column to a string and parse it with the schema above;
# the parsed struct keeps the column name "value".
json_payload = col("value").cast("string")
df = df.select(from_json(json_payload, schema).alias("value"))

In [57]:
# Flatten the parsed struct into top-level columns.
# Each output column is named after the last segment of its path
# (e.g. "value.commit.record.text" -> "text"); order here is the output order.
nested_paths = [
    "value.did",
    "value.time_us",
    "value.kind",
    "value.commit.rev",
    "value.commit.operation",
    "value.commit.collection",
    "value.commit.rkey",
    "value.commit.record.$type",
    "value.commit.record.createdAt",
    "value.commit.record.text",
    "value.commit.cid",
]

df = df.select(
    *[col(path).alias(path.rsplit(".", 1)[-1]) for path in nested_paths]
)

In [None]:
# Stream the flattened rows to the console sink in append mode.
# The StreamingQuery handle is kept so the query can be shut down explicitly;
# discarding it (as a one-line chained call does) leaves no direct way to stop
# the query once this cell is interrupted.
query = (
    df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Runs on normal termination, on query failure, and on KeyboardInterrupt,
    # so the streaming query does not keep running after the cell has exited.
    query.stop()