In [22]:
from pathlib import Path
from IPython.display import display
import nbformat as nbf

# Define the cells of the notebook
cells = []


In [23]:
# Cell 1: Start Spark session with Kafka packages
cells.append(nbf.v4.new_code_cell("""\
from pyspark.sql import SparkSession

spark = SparkSession.builder \\
    .appName("KafkaStreamToParquet") \\
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0") \\
    .getOrCreate()
"""))


In [24]:
# Cell 2: Define schema
cells.append(nbf.v4.new_code_cell("""\
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType

schema = StructType() \\
    .add("user_id", IntegerType()) \\
    .add("event_type", StringType()) \\
    .add("target_id", IntegerType()) \\
    .add("timestamp", StringType()) \\
    .add("device", StringType()) \\
    .add("location", StringType())
"""))

In [25]:
# Cell 3: Read from Kafka
cells.append(nbf.v4.new_code_cell("""\
df_raw = spark.readStream \\
    .format("kafka") \\
    .option("kafka.bootstrap.servers", "host.docker.internal:9092") \\
    .option("subscribe", "social_events") \\
    .option("startingOffsets", "latest") \\
    .load()

df_json = df_raw.selectExpr("CAST(value AS STRING)") \\
    .select(from_json(col("value"), schema).alias("data")) \\
    .select("data.*")
"""))

In [27]:
# Cell 4: Write to Parquet
cells.append(nbf.v4.new_code_cell("""\
query = df_json.writeStream \\
    .format("parquet") \\
    .option("path", "/home/jovyan/work/parquet_output/") \\
    .option("checkpointLocation", "/home/jovyan/work/parquet_output/checkpoint/") \\
    .outputMode("append") \\
    .start()

query.awaitTermination()
"""))

# Assemble the notebook
nb = nbf.v4.new_notebook(cells=cells, metadata={"kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
}})

# Fallback: Save to mounted /mnt/data for direct download
download_path = "../stream-submit.ipynb"
with open(download_path, "w") as f:
    nbf.write(nb, f)

download_path


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/stream-submit.ipynb'