In [1]:
import pathlib
from datetime import datetime
from typing import List, Tuple, Union, Dict

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Column

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window


In [2]:
# master configuration to use only 4 CPU cores
spark = SparkSession.builder.master("local[4]").getOrCreate()

# basic configuration to use only a reasonable number of partitions
spark.conf.set("spark.sql.shuffle.partition", 4)

# configuration to work in UTC
spark.conf.set("spark.sql.session.timeZone", "UTC")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/12 11:54:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/12 11:54:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/12 11:54:19 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [35]:
path = "/Users/emilianofrigo/Downloads/1689044938226_0.jsonl"

# Read JSONL file
df = (
    spark
    .read
    # .option("multiline", "true")
    .json(path)
)

In [36]:
df.printSchema()

root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_data: struct (nullable = true)
 |    |-- AdjustmentEventList: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- AdjustmentAmount: struct (nullable = true)
 |    |    |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |    |    |-- CurrencyCode: string (nullable = true)
 |    |    |    |-- AdjustmentItemList: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- PerUnitAmount: struct (nullable = true)
 |    |    |    |    |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |    |    |    |    |-- CurrencyCode: string (nullable = true)
 |    |    |    |    |    |-- ProductDescription: string (nullable = true)
 |    |    |    |    |    |-- Quantity: string (nullable = true)
 |    |    |    |    |    |-- SellerSKU: string (nullable = true)
 |    |    |    |    |    |-- TotalAmount: struct 

In [5]:
df.show(truncate=False)

                                                                                

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [66]:
def explode(df: DataFrame, config: dict) -> DataFrame:
    columns = config["columns"]
    
    if not isinstance(columns, list):
        columns = [columns]
        print(columns)

    for column in columns:
        df.withColumn(column, F.explode(column))
    
    return df

In [56]:
def unnest(df: DataFrame, config: dict) -> DataFrame:
    for column in config["columns"]:
        column_to_extract = column["column"]
        new_name = column["rename_to"]
        df = df.withColumn(new_name, F.col(column_to_extract))
    return df

In [65]:
explode_dict = {
    "columns":"_airbyte_data.ShipmentEventList",
}

In [63]:
df_exploded = explode(df=df, config=explode_dict)

#df_exploded.printSchema()
df_exploded.show()

+--------------------+--------------------+-------------------+
|      _airbyte_ab_id|       _airbyte_data|_airbyte_emitted_at|
+--------------------+--------------------+-------------------+
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|
+--------------------+--------------------+-------------------+



In [68]:
df_manual_explode = (
    df
    .withColumn("ShipmentEventList", F.explode("_airbyte_data.ShipmentEventList"))
)

In [69]:
df_manual_explode.printSchema()

root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_data: struct (nullable = true)
 |    |-- AdjustmentEventList: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- AdjustmentAmount: struct (nullable = true)
 |    |    |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |    |    |-- CurrencyCode: string (nullable = true)
 |    |    |    |-- AdjustmentItemList: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- PerUnitAmount: struct (nullable = true)
 |    |    |    |    |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |    |    |    |    |-- CurrencyCode: string (nullable = true)
 |    |    |    |    |    |-- ProductDescription: string (nullable = true)
 |    |    |    |    |    |-- Quantity: string (nullable = true)
 |    |    |    |    |    |-- SellerSKU: string (nullable = true)
 |    |    |    |    |    |-- TotalAmount: struct 

In [73]:
unnest_cols = {
    "columns": [
        {"column": "ShipmentEventList.AmazonOrderId", "rename_to": "amazon_order_id"},
    ]
}

# df_unnested = unnest(df_manual_explode, unnest_cols)

In [76]:
for column in unnest_cols["columns"]:
    print(column)
    column_to_extract = column["column"]
    new_name = column["rename_to"]
    df_manual_explode2 = df_manual_explode.withColumn(new_name, F.col(column_to_extract))


{'column': 'ShipmentEventList.AmazonOrderId', 'rename_to': 'amazon_order_id'}


In [78]:
df_manual_explode2.show()

+--------------------+--------------------+-------------------+--------------------+-------------------+
|      _airbyte_ab_id|       _airbyte_data|_airbyte_emitted_at|   ShipmentEventList|    amazon_order_id|
+--------------------+--------------------+-------------------+--------------------+-------------------+
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{202-7551542-5478...|202-7551542-5478708|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{026-0479204-0943...|026-0479204-0943514|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{204-6668810-1251...|204-6668810-1251519|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{171-8643008-7651...|171-8643008-7651515|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{206-1458711-5286...|206-1458711-5286725|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      1689044967267|{202-4247892-6654...|202-4247892-6654724|
|ed84cf8d-05ce-4ae...|{[{{-13.49, EUR},...|      168904