# Import Required Libraries

In [2]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one machine; fall back to the original local install path.
# findspark.init() puts pyspark on sys.path, so it has to run before the pyspark
# imports below.
findspark.init(
    os.environ.get("SPARK_HOME") or "/Users/DOU2274/spark/spark-3.1.1-bin-hadoop2.7"
)

from pyspark.sql import SparkSession
# NOTE: the wildcard imports are kept because later cells call functions such as
# explode() unqualified. Be aware that pyspark.sql.functions shadows Python builtins
# with the same names (e.g. sum, min, max, round).
from pyspark.sql.types import *
from pyspark.sql.functions import *



### Create a Spark session

In [None]:
# Reuse the active SparkSession if one exists; otherwise start a new one
# under the application name below.
spark = (
    SparkSession.builder
    .appName("Working with json")
    .getOrCreate()
)

### Read the JSON file

In [40]:
# Input document, resolved relative to the notebook's working directory.
path = 'data.json'

# multiLine=True lets Spark parse a JSON record that spans several lines;
# without it the reader expects one complete JSON object per line.
rawDF = spark.read.json(path, multiLine=True)

### Explore DataFrame schema

In [6]:
# Print the schema as a tree to see which columns are nested structs/arrays.
rawDF.printSchema()

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



## Convert a nested structure (array) to a simple DataFrame

### Rename the id column

In [7]:
# Rename the top-level "id" column to "key": each element of batters.batter (and of
# topping) also has an "id" field, which would clash once those structs are flattened.
sampleDF = rawDF.withColumnRenamed("id", "key")

### Select the batter column

In [8]:
# Keep the key and pull the nested "batter" array out of the "batters" struct.
batDF = sampleDF.select(
    "key",
    "batters.batter",
)
batDF.printSchema()

root
 |-- key: string (nullable = true)
 |-- batter: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)



In [10]:
# Default display: long cell values are cut off, as the "..." in the output shows.
batDF.show()

+----+--------------------+
| key|              batter|
+----+--------------------+
|0001|[{1001, Regular},...|
+----+--------------------+



In [11]:
# Show up to 10 rows without truncating cell contents, so the whole array is visible.
batDF.show(n=10, truncate=False)

+----+-----------------------------------------------------------------------------+
|key |batter                                                                       |
+----+-----------------------------------------------------------------------------+
|0001|[{1001, Regular}, {1002, Chocolate}, {1003, Blueberry}, {1004, Devil's Food}]|
+----+-----------------------------------------------------------------------------+



### Creating a row for each element: explode

All the batter details are packed into a single row because `batter` is an array of structs. Let's create a separate row for each batter by exploding that array.


In [13]:
# Explode the "batter" array: each array element becomes its own row,
# exposed as a struct column named "new_batter".
bat2DF = batDF.select(
    "key",
    explode("batter").alias("new_batter"),
)
bat2DF.show()

+----+--------------------+
| key|          new_batter|
+----+--------------------+
|0001|     {1001, Regular}|
|0001|   {1002, Chocolate}|
|0001|   {1003, Blueberry}|
|0001|{1004, Devil's Food}|
+----+--------------------+



In [14]:
# After the explode, new_batter is a single struct per row rather than an array.
bat2DF.printSchema()

root
 |-- key: string (nullable = true)
 |-- new_batter: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- type: string (nullable = true)



In [15]:
# "new_batter.*" expands every field of the struct into its own top-level column.
(
    bat2DF
    .select("key", "new_batter.*")
    .show()
)

+----+----+------------+
| key|  id|        type|
+----+----+------------+
|0001|1001|     Regular|
|0001|1002|   Chocolate|
|0001|1003|   Blueberry|
|0001|1004|Devil's Food|
+----+----+------------+



In [16]:
# Pick only the "id" field of the struct using dot notation.
(
    bat2DF
    .select("key", "new_batter.id")
    .show()
)

+----+----+
| key|  id|
+----+----+
|0001|1001|
|0001|1002|
|0001|1003|
|0001|1004|
+----+----+



In [17]:
# Pick only the "type" field of the struct using dot notation.
(
    bat2DF
    .select("key", "new_batter.type")
    .show()
)

+----+------------+
| key|        type|
+----+------------+
|0001|     Regular|
|0001|   Chocolate|
|0001|   Blueberry|
|0001|Devil's Food|
+----+------------+



### Creating a row for each struct element: explode and flatten in a single chain


In [18]:
# All the previous steps in one chain, starting from the renamed frame:
#   1. explode the nested batters.batter array into one row per batter,
#   2. expand the struct's fields into top-level columns,
#   3. rename the generic "id"/"type" columns with a "bat_" prefix.
finalBatDF = (
    sampleDF
    .select("key", explode("batters.batter").alias("new_batter"))
    .select("key", "new_batter.*")
    .withColumnRenamed("id", "bat_id")
    .withColumnRenamed("type", "bat_type")
)
finalBatDF.show()

+----+------+------------+
| key|bat_id|    bat_type|
+----+------+------------+
|0001|  1001|     Regular|
|0001|  1002|   Chocolate|
|0001|  1003|   Blueberry|
|0001|  1004|Devil's Food|
+----+------+------------+



### Convert the nested “topping” structure (array) to a simple DataFrame

In [19]:
# Recall the raw schema: "topping" is a top-level array of structs (not wrapped in a
# parent struct the way batters.batter is).
rawDF.printSchema()

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



In [36]:
# Keep only the key and the "topping" array for the flattening steps that follow.
topDF = sampleDF.select(
    "key",
    "topping",
)
topDF.printSchema()

root
 |-- key: string (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)



In [37]:
# Explode the "topping" array into one row per topping, exposed as the struct
# column "new_topping", then inspect both the rows and the resulting schema.
top2DF = topDF.select(
    "key",
    explode("topping").alias("new_topping"),
)
top2DF.show()
top2DF.printSchema()

+----+--------------------+
| key|         new_topping|
+----+--------------------+
|0001|        {5001, None}|
|0001|      {5002, Glazed}|
|0001|       {5005, Sugar}|
|0001|{5007, Powdered S...|
|0001|{5006, Chocolate ...|
|0001|   {5003, Chocolate}|
|0001|       {5004, Maple}|
+----+--------------------+

root
 |-- key: string (nullable = true)
 |-- new_topping: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- type: string (nullable = true)



In [38]:
# Expand the fields of the new_topping struct into top-level columns and
# display them without truncating the longer topping names.
top3DF = top2DF.select(
    "key",
    "new_topping.*",
)
top3DF.show(truncate=False)

+----+----+------------------------+
|key |id  |type                    |
+----+----+------------------------+
|0001|5001|None                    |
|0001|5002|Glazed                  |
|0001|5005|Sugar                   |
|0001|5007|Powdered Sugar          |
|0001|5006|Chocolate with Sprinkles|
|0001|5003|Chocolate               |
|0001|5004|Maple                   |
+----+----+------------------------+



In [39]:
# The topping steps in one chain:
#   1. explode the array into one row per topping,
#   2. expand the struct's fields into top-level columns,
#   3. rename the generic "id"/"type" columns with a "top_" prefix.
finalTopDF = (
    topDF
    .select("key", explode("topping").alias("new_topping"))
    .select("key", "new_topping.*")
    .withColumnRenamed("id", "top_id")
    .withColumnRenamed("type", "top_type")
)
finalTopDF.show(truncate=False)

+----+------+------------------------+
|key |top_id|top_type                |
+----+------+------------------------+
|0001|5001  |None                    |
|0001|5002  |Glazed                  |
|0001|5005  |Sugar                   |
|0001|5007  |Powdered Sugar          |
|0001|5006  |Chocolate with Sprinkles|
|0001|5003  |Chocolate               |
|0001|5004  |Maple                   |
+----+------+------------------------+

