In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
import pyspark.sql.functions as F
import pyspark
import socket
import yaml
import sys
import os

# S3 location used by the read cells in this notebook.
REGION = "us-east-1"
BUCKET = "minikube-jupyterhub-data"

# Extra jars handed to Spark via `spark.jars` (comma-separated list).
# All three are referenced from the same /home/jovyan directory; the
# hadoop-aws entry used to read /home/jovan (missing "y"), which did not
# match the other two paths.
# NOTE(review): confirm the jars are actually present at these paths in the image.
SPARK_JARS = ",".join(
    [
        "/home/jovyan/hadoop-aws-3.4.1.jar",
        "/home/jovyan/bundle-2.24.6.jar",
        "/home/jovyan/openssl_wildfly-openssl-1.1.3.Final.jar",
    ]
)

print(f"Running Spark {pyspark.__version__} on Python {sys.version}")

# Build (or reuse) the session against the standalone master.
# Raises KeyError if AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are not set
# in the environment, because they are read with os.environ[...].
spark = (
    SparkSession.builder
    .appName("spark-jupyterhub")
    .master("spark://master.spark.svc.cluster.local:7077")
    # Driver networking: advertise this host's resolved IP, pin the driver and
    # block-manager ports to fixed values, and bind on all interfaces.
    .config("spark.driver.host", socket.gethostbyname(socket.gethostname()))
    .config("spark.driver.port", 2222)
    .config("spark.driver.blockManager.port", 7777)
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.jars", SPARK_JARS)
    # sparkmonitor: register its listener and put the listener jar on the
    # driver classpath.
    .config("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")
    .config("spark.driver.extraClassPath", "/opt/conda/lib/python3.13/site-packages/sparkmonitor/listener_2.13.jar")
    # Arrow-based PySpark conversions, with fallback enabled.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")
    # S3A filesystem with static credentials taken from the environment.
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.endpoint.region", REGION)
    .config("spark.ui.reverseProxy", "true")
    .getOrCreate()
)

Running Spark 4.0.0 on Python 3.13.5 | packaged by conda-forge | (main, Jun 16 2025, 08:27:50) [GCC 13.3.0]


---
#### NDJSON

In [3]:
# Explicit schema for the NDJSON records, written as YAML. Once parsed it is
# passed to StructType.fromJson, so it follows that struct/fields layout.
# Columns: apiVersion, kind, and a nested `metadata` struct holding name,
# namespace and a `labels` struct with a single `name` field. All nullable.
yaml_schema = """
fields:
- metadata: {}
  name: apiVersion
  nullable: true
  type: string
- metadata: {}
  name: kind
  nullable: true
  type: string
- metadata: {}
  name: metadata
  nullable: true
  type:
    fields:
    - metadata: {}
      name: name
      nullable: true
      type: string
    - metadata: {}
      name: namespace
      nullable: true
      type: string
    - metadata: {}
      name: labels
      nullable: true
      type:
        fields:
        - metadata: {}
          name: name
          nullable: true
          type: string
        type: struct
    type: struct
type: struct
"""

# Parse the YAML text into a dict and build the Spark schema from it.
schema = StructType.fromJson(
    yaml.safe_load(yaml_schema)
)

# Read every gzipped NDJSON object at the bucket root using the explicit
# schema rather than letting Spark infer one.
df = (
    spark.read
    .schema(schema)
    .json(f"s3a://{BUCKET}/*.ndjson.gz")
)

# Show up to three records whose nested metadata.name is "snowflake".
(
    df
    .where(F.col("metadata.name") == "snowflake")
    .show(3)
)

+----------+----------+--------------------+
|apiVersion|      kind|            metadata|
+----------+----------+--------------------+
|        v1| Namespace|{snowflake, NULL,...|
|   apps/v1|Deployment|{snowflake, snowf...|
+----------+----------+--------------------+



---
#### S3 Parquet

In [4]:
import pyspark.sql.functions as F

# Load every Parquet object at the bucket root; this rebinds `df`, replacing
# the NDJSON frame from the previous section.
df = spark.read.parquet(
    f"s3a://{BUCKET}/*.parquet"
)

# Mean passenger_count per VendorID, printed as a table.
(
    df
    .groupBy("VendorID")
    .agg({"passenger_count": "mean"})
    .show()
)

+--------+--------------------+
|VendorID|avg(passenger_count)|
+--------+--------------------+
|       1|  1.1323442533986385|
|       7|  1.2969443342111908|
|       2|  1.3644769487032042|
|       6|                NULL|
+--------+--------------------+



In [5]:
from pyspark.sql.functions import udf

# Python UDF returning the length of the value's string representation.
# The return type is declared explicitly: udf() defaults to StringType when
# no returnType is given, which would make `slen` a string column even
# though len() produces an integer.
slen = udf(lambda s: len(str(s)), returnType="int")

# Apply the UDF to fare_amount and display the first three results.
df.select(slen("fare_amount").alias("slen")).show(3)

+----+
|slen|
+----+
|   4|
|   4|
|   4|
+----+
only showing top 3 rows
