In [1]:
import glob
import os
import sys

# Cluster-specific launch settings for PySpark. They are assigned unconditionally,
# so they override whatever the kernel inherited from its parent environment.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

# SPARK_HOME was just set above, so an "is it set?" check can never fail.
# What can go wrong is that the directory is missing on this host, so check that.
spark_home = os.environ["SPARK_HOME"]
if not os.path.isdir(spark_home):
    raise ValueError(f'SPARK_HOME does not point to an existing directory: {spark_home}')

# Make the PySpark sources bundled with the Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
# Pick up whichever py4j version ships with this Spark build instead of pinning
# one file name; if no archive is found, nothing is added to sys.path.
for py4j_zip in sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# These imports must come after the sys.path changes above.
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

conf = SparkConf()
conf.set("spark.app.name", "example_spark")

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [2]:
# Sample payload: a JSON document held as plain text, as it would appear in a
# single string column. It has one top-level key "lvl1" holding a list of objects.
t = (
    '{"lvl1":[{"col1":"BLOCKED","col2":123,"col3":null,'
    '"col4":456,"col5":"Text2 (Text3)"}]}'
)
print(t)

{"lvl1":[{"col1":"BLOCKED","col2":123,"col3":null,"col4":456,"col5":"Text2 (Text3)"}]}


In [3]:
import pandas as pd

# Wrap the sample JSON text in a one-row pandas frame with a single column "js".
df = pd.DataFrame({'js': [t]})
df

Unnamed: 0,js
0,"{""lvl1"":[{""col1"":""BLOCKED"",""col2"":123,""col3"":n..."


In [4]:
# Turn the pandas frame into a Spark DataFrame and print the column types
# Spark derived for it.
sdf = spark.createDataFrame(df)
sdf.printSchema()

root
 |-- js: string (nullable = true)



In [5]:
# Show the first row vertically, cutting cell text at 100 characters.
sdf.show(n=1, truncate=100, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------
 js  | {"lvl1":[{"col1":"BLOCKED","col2":123,"col3":null,"col4":456,"col5":"Text2 (Text3)"}]} 



In [6]:
# Infer the JSON schema from the text stored in the "js" column.
# The column is passed to the JSON reader as a distributed RDD of strings, so the
# values are not collected to the driver and re-parallelized just to infer a schema.
params_rdd = sdf.select("js").rdd.map(lambda row: row[0])
schema = spark.read.json(params_rdd).schema
schema

StructType(List(StructField(lvl1,ArrayType(StructType(List(StructField(col1,StringType,true),StructField(col2,LongType,true),StructField(col3,StringType,true),StructField(col4,LongType,true),StructField(col5,StringType,true))),true),true)))

In [7]:
# Parse the raw JSON text with the inferred schema and attach the result as a
# struct column named "lvl2" next to the original "js" column.
parsed_json = F.from_json(F.col("js"), schema)
sdf2 = sdf.withColumn("lvl2", parsed_json)
sdf2.printSchema()

root
 |-- js: string (nullable = true)
 |-- lvl2: struct (nullable = true)
 |    |-- lvl1: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- col1: string (nullable = true)
 |    |    |    |-- col2: long (nullable = true)
 |    |    |    |-- col3: string (nullable = true)
 |    |    |    |-- col4: long (nullable = true)
 |    |    |    |-- col5: string (nullable = true)



In [8]:
# Flatten the parsed JSON: one output row per element of the "lvl1" array, with
# the element's fields promoted to top-level columns.
#
# The array of structs is exploded exactly once. Exploding each field's array
# separately (lvl2.lvl1.col1, lvl2.lvl1.col2, ...) multiplies the rows at every
# step, so an array with N elements would yield N**5 rows with fields from
# different elements mixed together. It only looks right when N == 1.
sdf3 = (sdf
    .withColumn("lvl2", F.from_json(F.col("js"), schema))
    .withColumn("_lvl1_item", F.explode(F.col("lvl2.lvl1")))
    # Keep every existing column and append all fields of the exploded struct.
    .select("*", "_lvl1_item.*")
    # Remove the intermediate helper columns; only "js" and the fields remain.
    .drop("lvl2", "_lvl1_item")
)
sdf3.printSchema()

root
 |-- js: string (nullable = true)
 |-- col1: string (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: string (nullable = true)
 |-- col4: long (nullable = true)
 |-- col5: string (nullable = true)



In [9]:
# Convert the flattened Spark DataFrame to a pandas frame so the notebook renders it.
sdf3.toPandas()

Unnamed: 0,js,col1,col2,col3,col4,col5
0,"{""lvl1"":[{""col1"":""BLOCKED"",""col2"":123,""col3"":n...",BLOCKED,123,,456,Text2 (Text3)
