In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, explode
import pyspark.sql.utils
import json

In [0]:
# Root location of the event data explored in this notebook; the cells below
# address daily partitions beneath it as <source_path>/dt=YYYY-MM-DD.
source_path = (
    "s3a://zalando-saiki-datalake-eu-central-1"
    "/data/eventqueue/customer.behaviour.tracking-events.test"
)

In [0]:
# Show the second-to-last entry of the listing under source_path.
# The [-2:-1] slice returns a list holding at most that single entry
# (an empty list when the listing has fewer than two entries).
dbutils.fs.ls(source_path)[-2:-1]

[FileInfo(path='s3a://zalando-saiki-datalake-eu-central-1/data/eventqueue/customer.behaviour.tracking-events.test/dt=2024-09-16/', name='dt=2024-09-16/', size=0, modificationTime=1726597629078)]

In [None]:
# Load the JSON events stored under the dt=2024-09-15 folder and keep only
# a 5-row sample, which is enough to explore the nested schema.
nested_df = spark.read.json(f"{source_path}/dt=2024-09-15").limit(5)

nested_df.display()

In [0]:
# Report every column of nested_df whose type string starts with "struct"
for column_name, dtype in nested_df.dtypes:
    if dtype.startswith('struct'):
        print("column:", column_name, "dtype:", dtype[:6])

column: consent dtype: struct
column: frontend dtype: struct
column: metadata dtype: struct


In [None]:
# Expand each struct column into its sub-fields, keeping the sub-field names
# unchanged (no aliases), and drop the struct column once it has been expanded.
unnested_df_all = (
    nested_df
    .select("*", col('consent.*'))
    .drop('consent')
    .select("*", col('frontend.*'))
    .drop('frontend')
    .select("*", col('metadata.*'))
    .drop('metadata')
)

unnested_df_all.display()

In [None]:
# Expand the struct columns field by field, giving each selected sub-field an
# explicit alias, and drop every struct column after its fields are pulled out.
unnested_df_aliases = (
    nested_df
    .select(
        '*',
        col('consent.consented_categories').alias('consented_categories'),
    )
    .drop('consent')
    .select(
        '*',
        col('frontend.app').alias('app'),
        col('frontend.device').alias('device'),
        col('frontend.event_name').alias('event_name'),
    )
    .drop('frontend')
    .select(
        '*',
        col('metadata.event_type').alias('event_type'),
        col('metadata.flow_id').alias('flow_id'),
    )
    .drop('metadata')
)

unnested_df_aliases.display()

In [0]:
# Create a flattening function
def flatten_df(nested_df: DataFrame) -> DataFrame:
    """Expand every top-level struct column of ``nested_df`` by one level.

    Non-struct columns are kept as they are and come first in the result.
    Each struct column ``s`` is then replaced by one column per sub-field
    ``f``, named ``s_f``. Only one nesting level is removed per call; call
    the function repeatedly to flatten deeper structures.

    Column and sub-field names are backtick-quoted when referenced, so names
    containing dots (for example an alias such as ``metadata_a.b`` produced by
    a previous pass) are treated as a single identifier rather than as a
    nested-field path.

    Args:
        nested_df: DataFrame that may contain struct-typed columns.

    Returns:
        A new DataFrame with the same rows, where top-level struct columns
        are replaced by their sub-fields.

    Note:
        Generated names (``s_f``) are not checked for collisions with
        existing column names.
    """

    def _quote(identifier: str) -> str:
        # Backtick-quote a single identifier; an embedded backtick is escaped by doubling it.
        return "`" + identifier.replace("`", "``") + "`"

    flat_cols = []
    nested_fields = []

    # Separate flat columns from struct columns using the schema itself,
    # which avoids an extra select() per struct just to learn its sub-field names.
    for field in nested_df.schema.fields:
        if field.dataType.typeName() == "struct":
            nested_fields.append(field)
        else:
            flat_cols.append(field.name)

    # Flat columns first, in their original order
    selected_cols = [col(_quote(column)) for column in flat_cols]

    # Then every sub-field of every struct column, aliased as <struct>_<subfield>
    for field in nested_fields:
        selected_cols.extend(
            col(f"{_quote(field.name)}.{_quote(subfield)}").alias(f"{field.name}_{subfield}")
            for subfield in field.dataType.fieldNames()
        )

    # Return the new DataFrame
    return nested_df.select(*selected_cols)

In [None]:
# Unnest until all struct type fields are flattened.
# flatten_df removes one nesting level per call, so keep calling it while any
# struct column remains instead of assuming a fixed nesting depth.
flattened_df = flatten_df(nested_df)
while any(dtype.startswith("struct") for _name, dtype in flattened_df.dtypes):
    next_df = flatten_df(flattened_df)
    if next_df.columns == flattened_df.columns:
        # The last pass expanded nothing, so another pass would not help:
        # stop here rather than loop forever.
        break
    flattened_df = next_df

flattened_df.display()

In [0]:
# Report any column of flattened_df whose type string still starts with "struct"
for column_name, dtype in flattened_df.dtypes:
    if dtype.startswith('struct'):
        print("column:", column_name, "dtype:", dtype[:6])