In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
import pyspark.sql.functions as F
import pyspark
import socket
import yaml
import sys
import os

# Report which PySpark and Python versions this kernel is running.
print(f"Running Spark {pyspark.__version__} on Python {sys.version}")

# Start from the application name and the Kubernetes API server master URL.
session_builder = (
    SparkSession.builder
    .appName("spark-jupyterhub")
    .master("k8s://https://kubernetes.default.svc.cluster.local")
)

# Every remaining setting, kept in one mapping and applied in insertion order.
spark_settings = {
    # Kubernetes: executor image, namespace and service accounts.
    "spark.kubernetes.container.image": "erikperkins/spark:4.0.0",
    "spark.kubernetes.container.image.pullPolicy": "Always",
    "spark.kubernetes.namespace": "jupyterhub",
    "spark.kubernetes.authenticate.driver.serviceAccountName": "singleuser",
    "spark.kubernetes.authenticate.serviceAccountName": "singleuser",
    # S3A: connector packages plus static credentials read from the environment
    # (a missing variable raises KeyError before any session is created).
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.4.1,com.amazonaws:aws-java-sdk:1.12.788,software.amazon.awssdk:bundle:2.24.6",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    "spark.hadoop.fs.s3a.access.key": os.environ["AWS_ACCESS_KEY_ID"],
    "spark.hadoop.fs.s3a.secret.key": os.environ["AWS_SECRET_ACCESS_KEY"],
    "spark.hadoop.fs.s3a.endpoint.region": "us-east-1",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Driver networking: advertise the IP this hostname resolves to, on fixed
    # ports, while binding on all interfaces.
    "spark.driver.host": socket.gethostbyname(socket.gethostname()),
    "spark.driver.port": 2222,
    "spark.driver.blockManager.port": 7777,
    "spark.driver.bindAddress": "0.0.0.0",
    # Executors.
    "spark.executor.instances": 2,
}

for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Running Spark 4.0.0 on Python 3.13.5 | packaged by conda-forge | (main, Jun 16 2025, 08:27:50) [GCC 13.3.0]


---
#### S3 Parquet

In [5]:
import pyspark.sql.functions as F

# Read every object matching *.parquet at the root of the bucket.
df = spark.read.parquet("s3a://minikube-jupyterhub-data/*.parquet")

# Average passenger_count for each VendorID.
mean_passengers_by_vendor = df.groupBy("VendorID").agg(F.avg("passenger_count"))
mean_passengers_by_vendor.show()

+--------+--------------------+
|VendorID|avg(passenger_count)|
+--------+--------------------+
|       1|  1.1323442533986385|
|       7|  1.2969443342111908|
|       2|  1.3644769487032042|
|       6|                NULL|
+--------+--------------------+



In [6]:
from pyspark.sql.functions import udf

# UDF returning the length of each value's string representation.
# The return type is declared explicitly: without it udf() defaults to
# StringType and the lengths would come back as strings instead of integers.
# Note: str(None) is "None", so a null input yields 4 rather than null.
slen = udf(lambda s: len(str(s)), "int")

# Show the string length of the first three fare_amount values.
df.select(slen("fare_amount").alias("slen")).show(3)

+----+
|slen|
+----+
|   4|
|   4|
|   4|
+----+
only showing top 3 rows


---
#### NDJSON

In [6]:
# Spark schema written as YAML, in the nested fields/name/nullable/type layout
# that StructType.fromJson consumes once the text is parsed into a dict.
# Shape: apiVersion and kind are nullable strings; metadata is a nullable struct
# holding name, namespace and a nested labels struct with a single name field.
yaml_schema = """
fields:
- metadata: {}
  name: apiVersion
  nullable: true
  type: string
- metadata: {}
  name: kind
  nullable: true
  type: string
- metadata: {}
  name: metadata
  nullable: true
  type:
    fields:
    - metadata: {}
      name: name
      nullable: true
      type: string
    - metadata: {}
      name: namespace
      nullable: true
      type: string
    - metadata: {}
      name: labels
      nullable: true
      type:
        fields:
        - metadata: {}
          name: name
          nullable: true
          type: string
        type: struct
    type: struct
type: struct
"""

# Parse the YAML text into a dict, then build the Spark schema from it.
schema_spec = yaml.safe_load(yaml_schema)
schema = StructType.fromJson(schema_spec)
print(schema.treeString())

# Read the NDJSON objects with the explicit schema rather than an inferred one.
ndjson_reader = spark.read.schema(schema)
df = ndjson_reader.json("s3a://minikube-jupyterhub-data/*.ndjson")

# Keep only records whose nested metadata.name equals "snowflake".
is_snowflake = F.col("metadata.name") == "snowflake"
df.filter(is_snowflake).show(3)

Running Spark 4.0.0 on Python 3.13.5 | packaged by conda-forge | (main, Jun 16 2025, 08:27:50) [GCC 13.3.0]
root
 |-- apiVersion: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- metadata: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- namespace: string (nullable = true)
 |    |-- labels: struct (nullable = true)
 |    |    |-- name: string (nullable = true)

+----------+----------+--------------------+
|apiVersion|      kind|            metadata|
+----------+----------+--------------------+
|        v1| Namespace|{snowflake, NULL,...|
|   apps/v1|Deployment|{snowflake, snowf...|
+----------+----------+--------------------+

