- This script reads JSON-format currency data containing exchange rates for 31 different currency codes relative to EUR.
- Weekend data is not available in the source.
- Yesterday's records generally arrive late, and we may also need today's records for use in streaming pipelines.
- That's why the functions below fill in the weekends and also add buffer days for today and tomorrow.
- The script then flattens the data and updates the columns.

In [0]:
import time
from datetime import datetime, timedelta
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, to_timestamp, date_sub, current_date, to_date, date_format, when, min, max
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DateType, DoubleType

In [0]:
# Lower bound of the load window: seven days before the current date.
# NOTE(review): the name is misspelled ("curreny"); it is kept as-is because
# the read of the currency data below filters on this exact name.
curreny_from = date_sub(current_date(), 7)

In [0]:
# Storage location of the exchange-rate-change events; it is loaded below
# with the "delta" format.
currency_source_path = "s3://zalando-datalake-binary/event-types/data/exchange-rate-service.exchange-rate-change"

In [0]:
# Load the currency events, keeping only the `dt` column parsed as a date
# (exposed as `date`) and the `data` payload, restricted to rows dated on or
# after `curreny_from`.
currency_bulk_df = (
    spark.read.load(currency_source_path, format="delta")
    .select(to_date("dt", "yyyy-MM-dd").alias("date"), "data")
    .where(col("date") >= curreny_from)
)

In [0]:
# Spot check: show the loaded rows for one specific date.
currency_bulk_df.filter(col('date') == '2024-12-24').display()

date,data
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, HRK, 7.5345, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, USD, 1.0395, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, JPY, 163.25, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, BGN, 1.9558, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, CZK, 25.135, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, DKK, 7.4608, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, GBP, 0.82805, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, HUF, 411.73, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, PLN, 4.2715, null)"
2024-12-24,"List(2024-12-24T14:59:00.002Z, EUR, 1.0, RON, 4.9745, null)"


In [0]:
def flatten_df(spark, nested_df: DataFrame, max_depth: int = 4) -> DataFrame:
    """Flatten struct columns into top-level columns named ``<parent>_<field>``.

    Each pass expands one level of struct nesting.  At most ``max_depth``
    passes are made, and the loop stops early as soon as no struct column is
    left.  In the result of a pass the non-struct columns come first, followed
    by the expanded struct fields.

    Args:
        spark: Accepted for signature symmetry with the other helpers; unused.
        nested_df: DataFrame that may contain struct columns.
        max_depth: Upper bound on the number of flattening passes.

    Returns:
        The DataFrame after the flattening passes.
    """

    def is_struct(dtype: str) -> bool:
        # `DataFrame.dtypes` reports struct columns as "struct<...>".
        return dtype.startswith("struct")

    def expand_one_level(df: DataFrame) -> DataFrame:
        plain_names = [name for name, dtype in df.dtypes if not is_struct(dtype)]
        struct_names = [name for name, dtype in df.dtypes if is_struct(dtype)]

        # Plain columns are kept as they are and placed first.
        projection = [col(name) for name in plain_names]

        # Every field of every struct becomes its own prefixed column.
        for parent in struct_names:
            for child in df.select(f"{parent}.*").columns:
                projection.append(col(f"{parent}.{child}").alias(f"{parent}_{child}"))

        return df.select(*projection)

    result = nested_df
    for _pass in range(max_depth):
        result = expand_one_level(result)

        # Nothing left to expand: no need to use up the remaining passes.
        if not any(is_struct(dtype) for _name, dtype in result.dtypes):
            break

    return result

In [0]:
def fill_currency_df(spark, currency_bulk_df:DataFrame) -> DataFrame:
    """Fill missing Saturdays and Sundays with the related Friday's rows.

    Every calendar day between the earliest and latest ``date`` in the input
    is generated.  Days that are absent from the input and fall on a Saturday
    or Sunday are added, each one carrying a copy of the rows of the Friday
    just before it (Saturday -> 1 day back, Sunday -> 2 days back).  Missing
    weekdays are not filled.

    The Friday lookup is a left join, so a missing weekend day whose Friday is
    not present in the input is still added, with its non-date columns null.

    Args:
        spark: Active SparkSession, used to build the calendar of dates.
        currency_bulk_df: Input rows with a ``date`` column.

    Returns:
        The input rows plus the generated weekend rows.  An empty input is
        returned without any generated rows.
    """

    currency_bulk_df = currency_bulk_df.withColumn("day_of_week", date_format(col("date"), "EEEE"))
    friday_df = currency_bulk_df.filter(col("day_of_week") == "Friday").drop("day_of_week")  # Drop day_of_week from Friday data to avoid conflict

    # Fetch both bounds with a single aggregation (one Spark job instead of two).
    # `min` / `max` are the pyspark.sql.functions imported at file level, not the builtins.
    bounds = currency_bulk_df.agg(
        min("date").alias("min_date"),
        max("date").alias("max_date"),
    ).collect()[0]
    min_date = bounds["min_date"]
    max_date = bounds["max_date"]

    # An empty input has no date range; without this guard the literal text
    # 'None' would be interpolated into the SQL below.
    if min_date is None or max_date is None:
        return currency_bulk_df.drop('day_of_week')

    date_range_df = spark.sql(f"SELECT explode(sequence(to_date('{min_date}'), to_date('{max_date}'), interval 1 day)) as date")

    # Calendar days that have no row at all in the input.
    existing_dates_df = currency_bulk_df.select("date").distinct()
    missing_dates_df = date_range_df.join(existing_dates_df, "date", "leftanti")

    # Identify missing Saturdays and Sundays
    missing_weekends_df = missing_dates_df.withColumn("day_of_week", date_format(col("date"), "EEEE")) \
                                     .filter((col("day_of_week") == "Saturday") | (col("day_of_week") == "Sunday"))

    # Find the corresponding Friday's data for each missing weekend date
    weekend_filled_df = missing_weekends_df \
                .withColumn("related_friday", date_sub(col("date"), when(col("day_of_week") == "Saturday", 1).otherwise(2))) \
                .join(friday_df.withColumnRenamed("date", "friday_date"), col("related_friday") == col("friday_date"), "left") \
                .drop("related_friday", "day_of_week", "friday_date")

    # Remove the helper column again so both sides of the union share the same columns.
    currency_bulk_df = currency_bulk_df.drop('day_of_week')
    filled_df = currency_bulk_df.unionByName(weekend_filled_df)

    return filled_df

In [0]:
def add_currency_buffer(spark, filled_df:DataFrame) -> DataFrame:
    """Extend the data through tomorrow by repeating the latest date's rows.

    If the latest ``date`` in ``filled_df`` is earlier than tomorrow, every
    day from the day after it up to and including tomorrow is added, each one
    carrying a copy of the rows of that latest date.  This also covers the
    case where the data already reaches today: tomorrow is still added, so the
    buffer always ends on the same day regardless of when the source delivers.

    "Today" is taken from the local clock of the Python process
    (``datetime.today()``).

    Args:
        spark: Active SparkSession, used to build the DataFrame of buffer dates.
        filled_df: Input rows with a ``date`` column; assumed to hold
            ``datetime.date`` values (DateType) so they can be compared with
            today's date.

    Returns:
        ``filled_df`` plus the buffer rows, or ``filled_df`` unchanged when it
        is empty or already reaches tomorrow.
    """

    # Get today's and tomorrow's dates
    today = datetime.today().date()
    tomorrow = today + timedelta(days=1)

    # Get the max date in the source data
    max_date = filled_df.agg({"date": "max"}).collect()[0][0]

    # Empty input has no rows to repeat; data that already reaches tomorrow
    # needs no buffer.
    if max_date is None or max_date >= tomorrow:
        return filled_df

    # Every day from max_date + 1 up to and including tomorrow
    missing_dates = [
        max_date + timedelta(days=offset)
        for offset in range(1, (tomorrow - max_date).days + 1)
    ]
    buffer_dates_df = spark.createDataFrame(
        [(missing_date,) for missing_date in missing_dates],
        StructType([StructField("date", DateType(), True)]),
    )

    # Repeat the latest date's rows once per buffer date.  Doing this as a
    # cross join keeps the work in Spark (no per-date collect to the driver)
    # and reuses the input's own column types instead of a hand-written schema.
    latest_rows_df = filled_df.filter(filled_df["date"] == max_date).drop("date")
    new_rows_df = latest_rows_df.crossJoin(buffer_dates_df)

    # Match columns by name, as the weekend fill does, rather than by position.
    return filled_df.unionByName(new_rows_df)

In [0]:
def update_currency(spark, df: DataFrame)-> DataFrame:
    """Rename the flattened columns to their output names and keep only those.

    Renames ``data_target`` to ``currency``, ``data_target_amount`` to
    ``currency_amount`` and ``date`` to ``currency_date``, then selects
    ``currency_date``, ``currency`` and ``currency_amount`` in that order.

    Args:
        spark: Accepted for signature symmetry with the other helpers; unused.
        df: Flattened currency DataFrame.

    Returns:
        A DataFrame with exactly the three output columns.
    """
    column_renames = (
        ("data_target", "currency"),
        ("data_target_amount", "currency_amount"),
        ("date", "currency_date"),
    )

    renamed_df = df
    for old_name, new_name in column_renames:
        renamed_df = renamed_df.withColumnRenamed(old_name, new_name)

    return renamed_df.select('currency_date', 'currency', 'currency_amount')

In [0]:
# Fill the missing weekend dates with the related Friday's data
filled_currency_df = fill_currency_df(spark, currency_bulk_df)

# Add buffer days by repeating the rows of the latest available date
buffered_currency_df = add_currency_buffer(spark,filled_currency_df)

# Flatten the nested struct columns into top-level columns (at most 3 passes)
flattened_currency_df = flatten_df(spark, buffered_currency_df, 3)

# Rename to the final column names and keep only currency_date, currency, currency_amount
final_currency_df = update_currency(spark, flattened_currency_df)

In [0]:
# Spot check: show the final rows for one specific date.
final_currency_df.filter(col('currency_date') == '2024-12-26').display()

currency_date,currency,currency_amount
2024-12-26,HRK,7.5345
2024-12-26,USD,1.0395
2024-12-26,JPY,163.25
2024-12-26,BGN,1.9558
2024-12-26,CZK,25.135
2024-12-26,DKK,7.4608
2024-12-26,GBP,0.82805
2024-12-26,HUF,411.73
2024-12-26,PLN,4.2715
2024-12-26,RON,4.9745
