# Pyspark - json flattening

In [4]:
import findspark
findspark.init()
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
import pandas as pd
sc = spark.sparkContext

In [9]:
import random
import datetime
def getCartItems():
    x = random.randint(1,2)
    x = x*100 
    y = x+10 
    data = {}
    now = datetime.datetime.now()
    
    data['item_number'] = random.randint(x,y)
    data['device_id'] = random.choice(['mobile','computer', 'tablet', 'mobile','computer'])
    data['click_event'] = random.choice(['search_jam','jam_checkout','jam_detail', 'food_products','jam_selection','add_to_cart'])
    return data

def getAddress():

    x = random.randint(1,2)
    x = x*100 
    y = x+10 
    customer = {}
    customer['customer_id'] =  'cust_' + str(random.randint(100,200))
    customer['age'] = random.randint(20,50)
    
    address = {}
    house_details = {}
    house_details['house_numer'] = random.randint(100,900)
    house_details['street'] = random.choice(['High st','Melibu drive', 'Ocean road', 'sunrise street','st kilda road'])
    address['house_details'] = house_details
    address['city'] = random.choice(['Melbourne','Sydney', 'Brisbane', 'Perth','Howbart'])
    customer['address'] = address
     
    return customer

def getOrder():
    customer_order = getAddress()
    cart_list = []
    for i in range(0,random.randint(1,3)):
        cart_list.append(getCartItems())

    order_details = {}

    now = datetime.datetime.now()
    str_now = now.isoformat()
    order_details['order_id'] = str_now
    order_details['discount'] = random.randint(5,15)
    order_details['items'] = cart_list    
    customer_order['order_details'] = order_details
    
    return customer_order    

In [10]:
json_data = str(getOrder())
json_data

"{'customer_id': 'cust_117', 'age': 31, 'address': {'house_details': {'house_numer': 528, 'street': 'Ocean road'}, 'city': 'Melbourne'}, 'order_details': {'order_id': '2022-10-02T13:15:06.569650', 'discount': 5, 'items': [{'item_number': 205, 'device_id': 'mobile', 'click_event': 'search_jam'}]}}"

In [18]:
data_rdd = sc.parallelize([json_data])
json_data_df = spark.read.option("multiline","true").json(data_rdd)
json_data_df.show()
json_data_df.printSchema()

+--------------------+---+-----------+--------------------+
|             address|age|customer_id|       order_details|
+--------------------+---+-----------+--------------------+
|[Melbourne, [528,...| 31|   cust_117|[5, [[search_jam,...|
+--------------------+---+-----------+--------------------+

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- house_details: struct (nullable = true)
 |    |    |-- house_numer: long (nullable = true)
 |    |    |-- street: string (nullable = true)
 |-- age: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_details: struct (nullable = true)
 |    |-- discount: long (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- click_event: string (nullable = true)
 |    |    |    |-- device_id: string (nullable = true)
 |    |    |    |-- item_number: long (nullable = true)
 |    |-- order_id: string (nullable 

### Write schema into json file from sample and read the schema for parsing multiple json files or json string column

In [34]:
# store the json pyspark schema from the sample json data into schema file
schema_json = json_data_df.schema.jsonValue()
import json
with open('./zschema1.json', 'w') as f:
    json.dump(schema_json, f)
    
import json
  
# load the json spark schema from the schema file and convert int pyspark schema object
zfile_schema = open('./zschema1.json')
data_json_schema = json.load(zfile_schema)
print(data_json_schema)
spark_json_schema = StructType.fromJson(data_json_schema)
print("\n",spark_json_schema)

{'type': 'struct', 'fields': [{'name': 'address', 'type': {'type': 'struct', 'fields': [{'name': 'city', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'house_details', 'type': {'type': 'struct', 'fields': [{'name': 'house_numer', 'type': 'long', 'nullable': True, 'metadata': {}}, {'name': 'street', 'type': 'string', 'nullable': True, 'metadata': {}}]}, 'nullable': True, 'metadata': {}}]}, 'nullable': True, 'metadata': {}}, {'name': 'age', 'type': 'long', 'nullable': True, 'metadata': {}}, {'name': 'customer_id', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'order_details', 'type': {'type': 'struct', 'fields': [{'name': 'discount', 'type': 'long', 'nullable': True, 'metadata': {}}, {'name': 'items', 'type': {'type': 'array', 'elementType': {'type': 'struct', 'fields': [{'name': 'click_event', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'device_id', 'type': 'string', 'nullable': True, 'metadata': {}}, {'name': 'item_number', 'type': 'lon

In [36]:
data_list = [(str(getOrder()), 1),(str(getOrder()), 2),(str(getOrder()), 3)]
data_df = spark.createDataFrame(data_list, ['value', 'record'])
data_df.show()
data_df.printSchema()

+--------------------+------+
|               value|record|
+--------------------+------+
|{'customer_id': '...|     1|
|{'customer_id': '...|     2|
|{'customer_id': '...|     3|
+--------------------+------+

root
 |-- value: string (nullable = true)
 |-- record: long (nullable = true)



### convert json string column into pyspark nested object

In [37]:
from pyspark.sql.functions import col, from_json
nested_schema_from_file  = spark_json_schema
parsedEventsDF = data_df.select(
  from_json(col("value"), nested_schema_from_file).alias("json_data")).select("json_data.*")

parsedEventsDF.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- house_details: struct (nullable = true)
 |    |    |-- house_numer: long (nullable = true)
 |    |    |-- street: string (nullable = true)
 |-- age: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_details: struct (nullable = true)
 |    |-- discount: long (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- click_event: string (nullable = true)
 |    |    |    |-- device_id: string (nullable = true)
 |    |    |    |-- item_number: long (nullable = true)
 |    |-- order_id: string (nullable = true)



In [58]:
from pyspark.sql import functions as F

renamed_df = parsedEventsDF.select([F.col(col).alias(col.replace('_', '')) for col in parsedEventsDF.columns])
renamed_df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- house_details: struct (nullable = true)
 |    |    |-- house_numer: long (nullable = true)
 |    |    |-- street: string (nullable = true)
 |-- age: long (nullable = true)
 |-- customerid: string (nullable = true)
 |-- orderdetails: struct (nullable = true)
 |    |-- discount: long (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- click_event: string (nullable = true)
 |    |    |    |-- device_id: string (nullable = true)
 |    |    |    |-- item_number: long (nullable = true)
 |    |-- order_id: string (nullable = true)



### flatten pyspark json dataframe into multiple columns

In [38]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

#Flatten array of structs and structs
def flatten_json(df):
    # compute Complex Fields (Lists and Structs) in Schema   
    complex_fields = dict([(field.name, field.dataType)
                             for field in df.schema.fields
                             if type(field.dataType) == ArrayType or  type(field.dataType) == StructType])
    
    while len(complex_fields)!=0:
        col_name=list(complex_fields.keys())[0]
#         print ("Processing :"+col_name+" Type : "+str(type(complex_fields[col_name])))

        # if StructType then convert all sub element to columns.
        # i.e. flatten structs
        if (type(complex_fields[col_name]) == StructType):
            expanded = [col(col_name+'.'+k).alias(col_name+'_'+k) for k in [ n.name for n in  complex_fields[col_name]]]
            df=df.select("*", *expanded).drop(col_name)

        # if ArrayType then add the Array Elements as Rows using the explode function
        # i.e. explode Arrays
        elif (type(complex_fields[col_name]) == ArrayType):    
            df=df.withColumn(col_name,explode_outer(col_name))

        # recompute remaining Complex Fields in Schema       
        complex_fields = dict([(field.name, field.dataType)
                             for field in df.schema.fields
                             if type(field.dataType) == ArrayType or  type(field.dataType) == StructType])
    return df

In [39]:
parsedEventsDF.show()

+--------------------+---+-----------+--------------------+
|             address|age|customer_id|       order_details|
+--------------------+---+-----------+--------------------+
|[Melbourne, [377,...| 23|   cust_135|[10, [[jam_checko...|
|[Perth, [276, st ...| 35|   cust_159|[9, [[search_jam,...|
|[Howbart, [765, s...| 39|   cust_189|[12, [[add_to_car...|
+--------------------+---+-----------+--------------------+



In [41]:
data_final = flatten_json(parsedEventsDF)

data_final.toPandas().head(5).T

Unnamed: 0,0,1,2,3,4
age,23,35,35,39,39
customer_id,cust_135,cust_159,cust_159,cust_189,cust_189
address_city,Melbourne,Perth,Perth,Howbart,Howbart
order_details_discount,10,9,9,12,12
order_details_order_id,2022-10-02T13:36:36.256751,2022-10-02T13:36:36.256751,2022-10-02T13:36:36.256751,2022-10-02T13:36:36.256751,2022-10-02T13:36:36.256751
address_house_details_house_numer,377,276,276,765,765
address_house_details_street,st kilda road,st kilda road,st kilda road,sunrise street,sunrise street
order_details_items_click_event,jam_checkout,search_jam,jam_selection,add_to_cart,jam_detail
order_details_items_device_id,mobile,tablet,tablet,tablet,computer
order_details_items_item_number,106,101,110,110,206


In [5]:
from pyspark.sql.types import *
nested_schema2 = StructType([
  StructField("id", StringType()),
  StructField("type", StringType()),
  StructField("name", StringType()),
  StructField("ppu", StringType()),   
  StructField("batters", StructType([  StructField("batter", ArrayType(
      StructType([
          StructField("id", StringType()),
          StructField("type", StringType())
      ])
   ))  ])),   
  StructField("type1", StringType()),
  StructField("name1", StringType()),
  StructField("ppu1", StringType()),   
  StructField("batters1", StructType([  StructField("batter2", ArrayType(
      StructType([
          StructField("id", StringType()),
          StructField("type", StringType())
      ])
   ))  ])),   

  StructField("topping", ArrayType(
      StructType([
          StructField("id", StringType()),
          StructField("type", StringType())
      ])
   ))                          
])

### json flattening example2

In [63]:
json_str = """

{
	"programs": [{
		"id": "program_101",
		"programID": "10254",
		"orchestra": "New York Symphony",
		"season": "1926-27",
		"location": [{
			"eventType": "Subscription Season",
			"address": "Manhattan, NY",
			"Venue": "Carnegie Hall",
			"Date": "1926-10-02T05:00:00Z",
			"Time": "8:30PM"
		}],
		"childworkitems": [{
				"ID": "ITEM10",
				"composerName": "Low,",
				"workTitle": "INTERNATIONAL",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			},
			{
				"ID": "ITEM20",
				"composerName": "Edelshtadt,",
				"workTitle": "VACHT UF",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			}
		]
	},
    {
		"id": "program_201",
		"programID": "10254",
		"orchestra": "New York Symphony",
		"season": "1926-27",
		"location": [{
			"eventType": "Subscription Season",
			"address": "Manhattan, NY",
			"Venue": "Carnegie Hall",
			"Date": "1926-10-02T05:00:00Z",
			"Time": "8:30PM"
		}],
		"childworkitems": [{
				"ID": "ITEM30",
				"composerName": "Low,",
				"workTitle": "INTERNATIONAL",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			},
			{
				"ID": "ITEM40",
				"composerName": "Edelshtadt,",
				"workTitle": "VACHT UF",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			}
		]
	}]
}

"""

In [64]:
data_rdd = sc.parallelize([json_str])
df1 = spark.read.option("multiline","true").json(data_rdd)
df1.show()
df1.printSchema()

+--------------------+
|            programs|
+--------------------+
|[[[[ITEM10, Low,,...|
+--------------------+

root
 |-- programs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- childworkitems: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- ID: string (nullable = true)
 |    |    |    |    |-- composerName: string (nullable = true)
 |    |    |    |    |-- conductorName: string (nullable = true)
 |    |    |    |    |-- soloists: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- soloistInstrument: string (nullable = true)
 |    |    |    |    |    |    |-- soloistName: string (nullable = true)
 |    |    |    |    |    |    |-- soloistRoles: string (nullable = true)
 |    |    |    |    |-- workTitle: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- location: array (nullab

In [65]:
data = flatten_json(df1)
data.toPandas().head().T

Unnamed: 0,0,1,2,3
programs_id,program_101,program_101,program_201,program_201
programs_orchestra,New York Symphony,New York Symphony,New York Symphony,New York Symphony
programs_programID,10254,10254,10254,10254
programs_season,1926-27,1926-27,1926-27,1926-27
programs_childworkitems_ID,ITEM10,ITEM20,ITEM30,ITEM40
programs_childworkitems_composerName,"Low,","Edelshtadt,","Low,","Edelshtadt,"
programs_childworkitems_conductorName,"Shaeffer, Joseph","Shaeffer, Joseph","Shaeffer, Joseph","Shaeffer, Joseph"
programs_childworkitems_workTitle,INTERNATIONAL,VACHT UF,INTERNATIONAL,VACHT UF
programs_location_Date,1926-10-02T05:00:00Z,1926-10-02T05:00:00Z,1926-10-02T05:00:00Z,1926-10-02T05:00:00Z
programs_location_Time,8:30PM,8:30PM,8:30PM,8:30PM


### read new json data with partial schema matching - 
#### missing fields with be populated with None and extra fields will be ignored

In [66]:
new_json_data = """

{
	"programs": [{
		"id": "program_101",
		"programID": "10254",
		"orchestra": "New York Symphony",
		"season": "1926-27",
		"location": [{
			"eventType": "Subscription Season",
			"address": "Manhattan, NY",
			"Venue": "Carnegie Hall",
			"Date": "1926-10-02T05:00:00Z",
			"Time": "8:30PM"
		}],
		"New_childworkitems": [{
				"ID": "ITEM10",
				"composerName": "Low,",
				"workTitle": "INTERNATIONAL",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			},
			{
				"ID": "ITEM20",
				"composerName": "Edelshtadt,",
				"workTitle": "VACHT UF",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistInstrument": "Chorus",
					"soloistRoles": "S"
				}]
			}
		]
	},
    {
		"id": "program_201",
		"programID": "10254",
		"orchestra": "New York Symphony",
		"season": "1926-27",
		"Invalid_location": [{
			"eventType": "Subscription Season",
			"address": "Manhattan, NY",
			"Venue": "Carnegie Hall",
			"Date": "1926-10-02T05:00:00Z",
			"Time": "8:30PM"
		}],
		"childworkitems": [{
				"ID": "ITEM30",
				"composerName": "Low,",
				"workTitle": "INTERNATIONAL",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistRoles": "S"
				}]
			},
			{
				"ID": "ITEM40",
				"composerName": "Edelshtadt,",
				"workTitle": "VACHT UF",
				"conductorName": "Shaeffer, Joseph",
				"soloists": [{
					"soloistName": "Freiheit Gesangs Farein of New York",
					"soloistRoles": "S"
				}]
			}
		]
	}]
}

"""

In [68]:
new_data_rdd = sc.parallelize([new_json_data])
# df2 = spark.read.option("multiline","true").json(new_data_rdd)
df2 = spark.read.schema(df1.schema).option("multiline","true").json(new_data_rdd)
df2data = flatten_json(df2)
df2data.toPandas().head().T

Unnamed: 0,0,1,2
programs_id,program_101,program_201,program_201
programs_orchestra,New York Symphony,New York Symphony,New York Symphony
programs_programID,10254,10254,10254
programs_season,1926-27,1926-27,1926-27
programs_childworkitems_ID,,ITEM30,ITEM40
programs_childworkitems_composerName,,"Low,","Edelshtadt,"
programs_childworkitems_conductorName,,"Shaeffer, Joseph","Shaeffer, Joseph"
programs_childworkitems_workTitle,,INTERNATIONAL,VACHT UF
programs_location_Date,1926-10-02T05:00:00Z,,
programs_location_Time,8:30PM,,
