In [27]:
from pyspark.sql.types import *
import multiprocessing

from pyspark.sql.functions import from_json
from pyspark.sql.functions import get_json_object

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [28]:
# --- Local Spark session setup ------------------------------------------------
# When True, a SparkSession left in the notebook namespace by an earlier run of
# this cell is stopped first. getOrCreate() hands back an already-running
# session if there is one, so stopping it is what makes the builder below start
# a fresh session.
STOP_SPARK_CONTEXT = True
# Upper bound on local worker threads; capped at the machine's CPU count when
# the master URL is built below.
NUM_CORES = 8

# NOTE(review): this list is not referenced anywhere else in the visible
# notebook (it is never handed to the session config, e.g. through
# `spark.jars.packages`), so these artifacts are presumably not loaded.
# Confirm whether it should be wired in or removed.
maven_artifacts = [
    "io.delta:delta-core_2.12:1.0.0",
    "org.apache.hadoop:hadoop-aws:3.2.0",
]

spark_config = {
    # Timezone
    "spark.sql.session.timeZone": "UTC",
    # Spark config
    "spark.sql.shuffle.partitions": "4",
    "spark.sql.parquet.compression.codec": "uncompressed",
    "spark.sql.sources.parallelPartitionDiscovery.parallelism": "4",
}

# Look the previous session up in the module namespace explicitly: `locals()`
# only happens to be that namespace while this code runs at the top level of a
# cell. The isinstance guard avoids calling .stop() on an unrelated object that
# merely happens to be bound to the name `spark`.
if STOP_SPARK_CONTEXT and isinstance(globals().get("spark"), SparkSession):
    globals()["spark"].stop()
    print("Spark Context stopped")

spark = (
    SparkSession.builder.appName("jupyter_pyspark")
    .master(f"local[{min(NUM_CORES, multiprocessing.cpu_count())}]")
    .config(conf=SparkConf().setAll(spark_config.items()))
    .getOrCreate()
)

# Last expression of the cell: renders the session summary in the notebook.
spark

Spark Context stopped


22/01/25 20:36:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [15]:
# One sample record: an integer key paired with a raw JSON document string.
data = [
    (1, '{"a": 1}'),
]

In [16]:
# Schema used to parse the JSON string column: a single nullable integer
# field named "a".
schema = StructType(
    fields=[
        StructField(name="a", dataType=IntegerType(), nullable=True),
    ]
)

In [17]:
# Build the demo DataFrame from the sample records; only column names are
# given here, so Spark infers the column types from the Python values.
df = spark.createDataFrame(
    data,
    schema=("key", "value"),
)

In [18]:
# Inspect the inferred schema, then preview the rows. The arguments to show()
# are its defaults, spelled out for the reader.
df.printSchema()
df.show(n=20, truncate=True)

root
 |-- key: long (nullable = true)
 |-- value: string (nullable = true)

+---+--------+
|key|   value|
+---+--------+
|  1|{"a": 1}|
+---+--------+



In [19]:
# Parse the JSON string column with the explicit schema and pull the resulting
# struct column (named "json") back to the driver as a list of Rows.
(
    df
    .select(from_json(df["value"], schema).alias("json"))
    .collect()
)

[Row(json=Row(a=1))]

In [24]:
# withColumn() already names the new column "json_col", so an extra alias on
# the parsed expression would have no effect on the output and is omitted.
# The result is only displayed, not assigned, so `df` itself is left unchanged.
df.withColumn("json_col", from_json(df.value, schema)).show()

+---+--------+--------+
|key|   value|json_col|
+---+--------+--------+
|  1|{"a": 1}|     {1}|
+---+--------+--------+



In [25]:
# Re-inspect `df`: the withColumn() result in the previous cell was displayed
# but never assigned, so `df` still has only its original two columns.
# The arguments to show() are its defaults, spelled out for the reader.
df.printSchema()
df.show(n=20, truncate=True)

root
 |-- key: long (nullable = true)
 |-- value: string (nullable = true)

+---+--------+
|key|   value|
+---+--------+
|  1|{"a": 1}|
+---+--------+



In [30]:
# Two sample records keyed by string ids; the second JSON document has no
# "f2" field.
# NOTE(review): this rebinds `data` from the earlier example cell.
data = [
    ("1", '{"f1": "value1", "f2": "value2"}'),
    ("2", '{"f1": "value12"}'),
]
# NOTE(review): this rebinds `df`, replacing the DataFrame from the from_json
# example; cells below operate on this key/jstring frame.
df = spark.createDataFrame(
    data,
    schema=("key", "jstring"),
)

In [31]:
# Inspect the schema and preview the rows. The arguments to show() are its
# defaults, spelled out for the reader; with truncation on, the longer JSON
# string is cut short in the rendered table.
df.printSchema()
df.show(n=20, truncate=True)

root
 |-- key: string (nullable = true)
 |-- jstring: string (nullable = true)

+---+--------------------+
|key|             jstring|
+---+--------------------+
|  1|{"f1": "value1", ...|
|  2|   {"f1": "value12"}|
+---+--------------------+



In [34]:
# Extract individual fields from the JSON string by path. Each
# (output column, JSON path) pair becomes one projected column, and a row
# whose document lacks the field shows null for it.
df.select(
    df["key"],
    *(
        get_json_object(df["jstring"], json_path).alias(out_name)
        for out_name, json_path in (("c0", "$.f1"), ("c1", "$.f2"))
    ),
).show()

+---+-------+------+
|key|     c0|    c1|
+---+-------+------+
|  1| value1|value2|
|  2|value12|  null|
+---+-------+------+

