In [0]:
from pyspark.sql.functions import col, lit, explode, transform
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql import DataFrame
import pyspark.sql.utils
import pandas as pd
import json
from datetime import timedelta, datetime

In [0]:

# Root of the raw event data; the cells below read day partitions from
# "<source_path>/dt=YYYY-MM-DD".
# This must be a plain string: a trailing comma after the literal would turn it
# into a 1-tuple and corrupt every f-string path built from it.
source_path = "s3a://saiki-datalake-eu-central-1/data/eventqueue/customer.behaviour.tracking-events.test"
# Passed as `cloudFiles.schemaLocation` to the streaming read.
source_checkpoint = "s3a://....."

In [None]:
%pip install s3fs --quiet

In [0]:
import s3fs

# Infer the JSON schema from a single day's partition and append the `dt`
# partition field to it as a nullable string.
source_schema = (
    spark.read
    .format("s3selectJSON")
    .json(f"{source_path}/dt=2024-09-16")
    .schema
    .add(StructField('dt', StringType(), True))
)

In [None]:
# `cloudFiles` streaming read of one day's partition, using the schema computed
# above. Only five rows are kept, so this is a preview of the stream.
source_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.schemaLocation": source_checkpoint,
    })
    .schema(source_schema)
    .load(f"{source_path}/dt=2024-09-17")
    .limit(5)
)

source_df.display()

In [0]:
# List every top-level column whose dtype string marks it as a struct
for name, dtype in source_df.dtypes:
    if dtype.startswith('struct'):
        print("column:", name, "dtype:", dtype[:6])

column: consent dtype: struct
column: frontend dtype: struct
column: metadata dtype: struct


In [0]:
def flatten_df(nested_df):
    """Unnest one level of struct columns.

    Every top-level struct column ``parent`` is replaced by one column per
    subfield, named ``parent_subfield``. All other columns are kept as they
    are and come first in the result, followed by the expanded subfields.
    Only one nesting level is removed per call, so deeper structures need
    repeated calls.

    Struct columns whose own name contains a dot are not expanded; they are
    passed through unchanged together with the non-struct columns.

    Args:
        nested_df: DataFrame to flatten.

    Returns:
        A new DataFrame produced by a single ``select`` on ``nested_df``.
    """
    def _quoted(identifier):
        # Backtick-quote a name so that a dot inside it is taken literally
        # instead of being parsed as struct-field access. Embedded backticks
        # are escaped by doubling them.
        return "`" + identifier.replace("`", "``") + "`"

    passthrough = []
    expanded = []

    for field in nested_df.schema.fields:
        if "." not in field.name and isinstance(field.dataType, StructType):
            # Subfield names come straight from the schema, so no extra
            # select("parent.*") is needed just to list them.
            expanded.extend(
                col(f"{_quoted(field.name)}.{_quoted(sub.name)}").alias(f"{field.name}_{sub.name}")
                for sub in field.dataType.fields
            )
        else:
            passthrough.append(col(_quoted(field.name)))

    return nested_df.select(*passthrough, *expanded)

In [None]:
# Flatten until all struct dtype fields are unnested.
# flatten_df removes one nesting level per call, so keep calling it while an
# expandable struct column remains instead of assuming a fixed depth of three.
# Struct columns with a dot in their name are not expanded by flatten_df, so
# they are excluded from the condition; otherwise the loop could never finish.
flattened_source_df = source_df
while any(
    "." not in column_name and dtype.startswith("struct")
    for column_name, dtype in flattened_source_df.dtypes
):
    flattened_source_df = flatten_df(flattened_source_df)

flattened_source_df.display()

In [0]:
# Promote the fields of the first element of the "skus" array to separate
# top-level columns, then drop the array column itself.
splitted_df = (
    flattened_source_df
    .select(
        '*',
        *[
            col('frontend_event_params_skus')[0][field_name].alias(column_alias)
            for field_name, column_alias in (
                ("discount", "sku_discount"),
                ("sku", "sku"),
                ("sku_brand", "sku_brand"),
                ("sku_name", "sku_name"),
            )
        ]
    )
    .drop('frontend_event_params_skus')
)

splitted_df.display()

In [0]:
def convert_array_to_string_array(df: DataFrame) -> DataFrame:
    """Cast the elements of every array-typed column to string.

    Args:
        df: Input DataFrame. Columns whose type is not ``ArrayType`` are left
            untouched.

    Returns:
        DataFrame in which each top-level array column has been replaced,
        under the same name, by an array of its elements cast to string.
    """
    result = df
    # The set of array columns is taken from the input schema, not from the
    # intermediate results.
    for field in df.schema.fields:
        if not isinstance(field.dataType, ArrayType):
            continue
        result = result.withColumn(
            field.name,
            transform(col(field.name), lambda element: element.cast("string")),
        )
    return result

In [0]:
# Cast the elements of every array column to string, then render the result.
string_df = convert_array_to_string_array(splitted_df)
string_df.display()

In [0]:
def clean_column_names(df: DataFrame)-> DataFrame:
    """Strip the known flattening prefixes from the column names.

    The prefixes are tried in a fixed order (``metadata_``,
    ``frontend_event_params_``, ``frontend_device_``, ``frontend_``) and are
    removed only when they occur at the start of a name. A token that appears
    in the middle of a name (e.g. ``consent_metadata_flag``) is left alone.

    Args:
        df: DataFrame whose columns should be renamed.

    Returns:
        A DataFrame with the same columns in the same order, renamed via
        ``df.toDF``.
    """
    # Longer `frontend_*` prefixes come before the bare `frontend_` so the most
    # specific one is removed first.
    prefixes = ("metadata_", "frontend_event_params_", "frontend_device_", "frontend_")

    def _strip_prefixes(name):
        # Check each prefix in turn against the (possibly already shortened) name.
        for prefix in prefixes:
            if name.startswith(prefix):
                name = name[len(prefix):]
        return name

    return df.toDF(*[_strip_prefixes(col_name) for col_name in df.columns])

In [None]:
# Shorten the column names with clean_column_names and show the result.
cleaned_df = clean_column_names(string_df)
cleaned_df.display()

In [0]:
def rename_columns(df: DataFrame)-> DataFrame:
    """Rename the amount columns to their ``_eur`` names.

    ``shipping_fee_amount`` becomes ``shipping_fee_eur`` and ``tax_amount``
    becomes ``tax_eur``.

    Args:
        df: DataFrame to rename columns on.

    Returns:
        The DataFrame obtained by applying both renames in order.
    """
    renames = (
        ("shipping_fee_amount", "shipping_fee_eur"),
        ("tax_amount", "tax_eur"),
    )

    renamed = df
    for old_name, new_name in renames:
        renamed = renamed.withColumnRenamed(old_name, new_name)
    return renamed

In [None]:
# Apply the `*_amount` -> `*_eur` column renames and show the result.
renamed_df = rename_columns(cleaned_df)
renamed_df.display()