In [6]:
import sys
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.transforms import *
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.types import *
from pyspark.sql import Row
# Build the GlueContext on top of the active SparkContext (reused if one already exists).
glueContext = GlueContext(SparkContext.getOrCreate())
# Bind `spark` explicitly: a later cell calls spark.createDataFrame(...), and relying on
# the kernel to pre-inject that name breaks the notebook on a kernel that does not.
spark = glueContext.spark_session


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Sample order rows. Field order per row:
# order_id, customer_id, essential_item, timestamp, zipcode, details (JSON text).
order_list = [
    [
        '1005', '623', 'YES', '1418901234', '75091',
        '{ "zip": 90093, "customers": [{ "id": 932, "address": "708 Fed Ln, CA"}, { "id": 102, "address": "807 Deccan Dr, CA" }]}',
    ],
    [
        '1006', '547', 'NO', '1418901256', '75034',
        '{ "zip": 75023, "customers": [{ "id": 343, "address": "66 P Street, NY" }]}',
    ],
    [
        '1007', '823', 'YES', '1418901300', '75023',
        '{ "zip": 82091, "customers": [{ "id": 201, "address": "771 Peek Pkwy, GA" }]}',
    ],
    [
        '1008', '912', 'NO', '1418901400', '82091',
        '{ "zip": 75091, "customers": [{ "id": 623, "address": "108 Park Street, TX"}, { "id": 231, "address": "763 Marsh Ln, TX" }]}',
    ],
    [
        '1009', '321', 'YES', '1418902000', '90093',
        '{ "zip": 75091, "customers": [{ "id": 624, "address": "108 Park Street, TX"}, { "id": 231, "address": "763 Marsh Ln, TX" }]}',
    ],
]


# Schema for order_list: every column, including the JSON "details" text, is a string.
order_schema = StructType([
    StructField(column_name, StringType())
    for column_name in (
        "order_id",
        "customer_id",
        "essential_item",
        "timestamp",
        "zipcode",
        "details",
    )
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Turn the sample rows into a Spark DataFrame using the explicit schema, then preview it.
df_orders = spark.createDataFrame(data=order_list, schema=order_schema)
df_orders.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+-----------+--------------+----------+-------+--------------------+
|order_id|customer_id|essential_item| timestamp|zipcode|             details|
+--------+-----------+--------------+----------+-------+--------------------+
|    1005|        623|           YES|1418901234|  75091|{ "zip": 90093, "...|
|    1006|        547|            NO|1418901256|  75034|{ "zip": 75023, "...|
|    1007|        823|           YES|1418901300|  75023|{ "zip": 82091, "...|
|    1008|        912|            NO|1418901400|  82091|{ "zip": 75091, "...|
|    1009|        321|           YES|1418902000|  90093|{ "zip": 75091, "...|
+--------+-----------+--------------+----------+-------+--------------------+

In [11]:
# Wrap the Spark DataFrame as a Glue DynamicFrame named "dyf" and print its schema.
dyf_orders = DynamicFrame.fromDF(dataframe=df_orders, glue_ctx=glueContext, name="dyf")
dyf_orders.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
|-- order_id: string
|-- customer_id: string
|-- essential_item: string
|-- timestamp: string
|-- zipcode: string
|-- details: string

In [12]:
# Parse the JSON text held in the "details" column, then print the resulting schema.
unbox_orders = Unbox.apply(
    frame=dyf_orders,
    path="details",
    format="json",
    transformation_ctx="unbox_orders",
)
unbox_orders.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
|-- order_id: string
|-- customer_id: string
|-- essential_item: string
|-- timestamp: string
|-- zipcode: string
|-- details: struct
|    |-- zip: int
|    |-- customers: array
|    |    |-- element: struct
|    |    |    |-- id: int
|    |    |    |-- address: string

In [20]:
def addDetails(row):
    """Copy the first customer's address from the nested details onto the record.

    Args:
        row: dict-like record whose "details" field is expected to hold a
            "customers" list of entries, each with an "address" key.

    Returns:
        The same record object with a top-level "address" field added. The
        field is None when there is no first-customer address to copy.
    """
    try:
        first_address = row["details"]["customers"][0]["address"]
    except (KeyError, IndexError, TypeError):
        # Missing or None "details", an empty "customers" list, or a customer
        # entry without "address": keep the record rather than raising.
        first_address = None
    row["address"] = first_address
    return row

# Run addDetails over every record, then print the new schema and show the rows.
map_orders = Map.apply(
    frame=unbox_orders,
    f=addDetails,
    transformation_ctx="map_orders",
)
map_orders.printSchema()
map_orders.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
|-- zipcode: string
|-- address: string
|-- essential_item: string
|-- customer_id: string
|-- details: struct
|    |-- customers: array
|    |    |-- element: struct
|    |    |    |-- address: string
|    |    |    |-- id: int
|    |-- zip: int
|-- timestamp: string
|-- order_id: string

+-------+-------------------+--------------+-----------+--------------------+----------+--------+
|zipcode|            address|essential_item|customer_id|             details| timestamp|order_id|
+-------+-------------------+--------------+-----------+--------------------+----------+--------+
|  75091|     708 Fed Ln, CA|           YES|        623|{[{708 Fed Ln, CA...|1418901234|    1005|
|  75034|    66 P Street, NY|            NO|        547|{[{66 P Street, N...|1418901256|    1006|
|  75023|  771 Peek Pkwy, GA|           YES|        823|{[{771 Peek Pkwy,...|1418901300|    1007|
|  82091|108 Park Street, TX|            NO|        912|{[{108 Park Stree...|1418901400|    1008|
|  90093|108 Park 