In [0]:
from pyspark.sql.functions import explode, col

In [0]:
# In-memory fixture: two customers, each with a nested list of purchase records.
# Every purchase record carries the same four keys (purchase_id, item, amount, date).
data = [
    # Customer 1 has two purchases.
    {
        "customer_id": 1,
        "name": "John Doe",
        "email": "john.doe@example.com",
        "purchases": [
            {
                "purchase_id": 101,
                "item": "Laptop",
                "amount": 1200.00,
                "date": "2023-01-15",
            },
            {
                "purchase_id": 102,
                "item": "Mouse",
                "amount": 25.00,
                "date": "2023-01-20",
            },
        ],
    },
    # Customer 2 has a single purchase.
    {
        "customer_id": 2,
        "name": "Jane Smith",
        "email": "jane.smith@example.com",
        "purchases": [
            {
                "purchase_id": 103,
                "item": "Keyboard",
                "amount": 75.00,
                "date": "2023-02-10",
            },
        ],
    },
]

In [0]:
# Create the DataFrame from the list of dictionaries with an EXPLICIT schema.
#
# Why not rely on inference: by default PySpark infers a nested dict as a
# MapType (struct inference is opt-in through
# spark.sql.pyspark.inferNestedDictAsStruct.enabled). Each purchase record mixes
# int, str and float values, which a single map value type cannot represent, so
# inferred results are unreliable (null fields or a type-merge error, depending
# on the Spark version). Declaring every purchase as a STRUCT keeps each field
# with its own type.
#
# The top-level columns are listed as customer_id, email, name, purchases, which
# is the same order the query on `customers_json` uses.
# NOTE(review): if `customers_json` already exists with a different `purchases`
# type, the overwrite below may need a schema overwrite - confirm on first run.
customers_schema = (
    "customer_id BIGINT, "
    "email STRING, "
    "name STRING, "
    "purchases ARRAY<STRUCT<"
    "`purchase_id`: BIGINT, `item`: STRING, `amount`: DOUBLE, `date`: STRING"
    ">>"
)
df = spark.createDataFrame(data, schema=customers_schema)
df.display()


In [0]:
# Persist the nested DataFrame as the table `customers_json`, replacing any
# existing contents. Per the PySpark docs saveAsTable returns None, so `result`
# carries no useful value; the name is kept only for compatibility.
result = df.write.saveAsTable("customers_json", mode="overwrite")

In [0]:
%sql
-- One output row per element of `purchases`: the customer columns, the
-- original `purchases` array, and the individual fields of the exploded
-- element `p`.
SELECT
  customer_id,
  email,
  name,
  purchases,
  p.purchase_id,
  p.item,
  p.amount,
  p.date
FROM customers_json
LATERAL VIEW explode(purchases) AS p

In [0]:

# Step 1: turn every element of `purchases` into its own row, held in a
# temporary column named `purchase`, next to the customer columns.
exploded_df = df.select(
    "customer_id",
    "name",
    "email",
    explode("purchases").alias("purchase"),
)

# Step 2: project the customer columns together with the individual purchase
# fields, so the result has one flat row per purchase.
flattened_df = exploded_df.select(
    "customer_id",
    "name",
    "email",
    "purchase.purchase_id",
    "purchase.item",
    "purchase.amount",
    "purchase.date",
)

# Render the flattened rows in the notebook.
flattened_df.display()

In [0]:
# Persist the flattened DataFrame as the table `customers_flattened`, replacing
# any existing contents. Per the PySpark docs saveAsTable returns None, so
# `result` carries no useful value; the name is kept only for compatibility.
result = flattened_df.write.saveAsTable("customers_flattened", mode="overwrite")

In [0]:
%sql
-- Show every row and column of the flattened table.
SELECT *
FROM customers_flattened