In [1]:
from pyspark.sql import SparkSession

# Start (or reuse, via getOrCreate) a Spark session in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Flatten_Json")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook renders its summary.
spark

In [2]:
# All the required imports.
from pyspark.sql.functions import col , explode 


In [3]:
# Load the JSON file with the reader's "multiline" option switched on.
# No schema is supplied, so Spark infers one from the data.
df_json = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load("json_data.json")
)


In [4]:
# Print the schema of the loaded JSON as a tree, to see which columns are
# nested (arrays / structs) and therefore need flattening.
df_json.printSchema()

root
 |-- Course_type: string (nullable = true)
 |-- Head_Office_Contact: long (nullable = true)
 |-- Institute_Name: string (nullable = true)
 |-- branches: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- City: string (nullable = true)
 |    |    |-- State: string (nullable = true)
 |    |    |-- address: string (nullable = true)



In [5]:
# Show the raw rows without truncating cell contents, so the nested
# `branches` array is fully visible.
df_json.show(truncate=False)

+-----------+-------------------+-------------------+---------------------------------------------------+
|Course_type|Head_Office_Contact|Institute_Name     |branches                                           |
+-----------+-------------------+-------------------+---------------------------------------------------+
|Best_seller|8787878787         |ABC_Coaching_Center|[{Mumbai, Maharashtra, XYZ}, {Surat, Gujrat, PQRX}]|
+-----------+-------------------+-------------------+---------------------------------------------------+



## Flattening rule: array --> `explode()`, struct --> select `column_name.*`

In [6]:
# Explode the `branches` array: each array element becomes its own row,
# with the top-level columns carried along next to it.
df_exploded = df_json.select(
    "Course_type",
    "Head_Office_Contact",
    "Institute_Name",
    explode(col("branches")).alias("branches"),
)

In [8]:
# Inspect the exploded rows. show() is called with its defaults here, so the
# long struct values in `branches` appear shortened in the output.
df_exploded.show()

+-----------+-------------------+-------------------+--------------------+
|Course_type|Head_Office_Contact|     Institute_Name|            branches|
+-----------+-------------------+-------------------+--------------------+
|Best_seller|         8787878787|ABC_Coaching_Center|{Mumbai, Maharash...|
|Best_seller|         8787878787|ABC_Coaching_Center|{Surat, Gujrat, P...|
+-----------+-------------------+-------------------+--------------------+



In [9]:
# Flatten the struct: selecting "branches.*" promotes every field of the
# `branches` struct to its own top-level column.
#
# The pipeline is rebuilt from df_json instead of re-selecting from
# df_exploded itself. That keeps this cell idempotent: selecting "branches.*"
# from an already-flattened df_exploded would fail on a second run because
# the `branches` column no longer exists.
df_exploded = (
    df_json
    .select(
        "Course_type",
        "Head_Office_Contact",
        "Institute_Name",
        explode(col("branches")).alias("branches"),
    )
    .select("Course_type", "Head_Office_Contact", "Institute_Name", "branches.*")
)

In [10]:
# Display the flattened result: the struct's fields now appear as ordinary
# top-level columns next to the original ones.
df_exploded.show()

+-----------+-------------------+-------------------+------+-----------+-------+
|Course_type|Head_Office_Contact|     Institute_Name|  City|      State|address|
+-----------+-------------------+-------------------+------+-----------+-------+
|Best_seller|         8787878787|ABC_Coaching_Center|Mumbai|Maharashtra|    XYZ|
|Best_seller|         8787878787|ABC_Coaching_Center| Surat|     Gujrat|   PQRX|
+-----------+-------------------+-------------------+------+-----------+-------+

