In [0]:
from pyspark.sql.functions import col, date_sub, current_date, to_date, date_format, when, min, max
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType
from datetime import date, timedelta
from pyspark.sql import DataFrame
from pyspark.sql import Window
from config import get_currency_path

In [0]:
# Declare the "env" notebook widget ("test" is passed as its default value)
# and read whichever value is currently selected.
dbutils.widgets.text("env", "test", "")
env = dbutils.widgets.get("env")

# get_currency_path(env) yields three values, unpacked in this order:
# source path, target path, checkpoint.
currency_paths = get_currency_path(env)
currency_source_path, currency_target_path, currency_checkpoint = currency_paths

In [0]:
# Bulk-read the currency Delta table, keeping only a recent window of days.
LOOKBACK_DAYS = 30  # Update the window size accordingly

currency_source_df = spark.read.format("delta").load(currency_source_path)

# Project the two columns used downstream: `dt` parsed as a yyyy-MM-dd date
# (exposed as `date`) and the `data` column as-is, then apply the window.
currency_bulk_df = (
    currency_source_df
    .select(
        to_date(col("dt"), "yyyy-MM-dd").alias("date"),
        col("data"),
    )
    .where(col("date") >= date_sub(current_date(), LOOKBACK_DAYS))
)

#### Fill in the missing historical weekend data by copying the preceding Friday's values

In [0]:
# Fill weekend gaps: every Saturday/Sunday inside the loaded date range that
# has no rows of its own receives a copy of the preceding Friday's rows.

# Full weekday name of the `date` column (pattern "EEEE", e.g. "Friday").
day_name = date_format(col("date"), "EEEE")

# Friday rows are the source of the values copied onto missing weekend dates.
# Filtering on the expression directly leaves currency_bulk_df untouched, so
# no helper column has to be added to it and dropped again afterwards.
friday_df = currency_bulk_df.filter(day_name == "Friday")

# Get min and max dates with a single aggregation (one Spark job, not two).
# NOTE: `min` / `max` here are the pyspark.sql.functions imported at the top
# of the notebook, not the Python builtins.
date_bounds = currency_bulk_df.agg(
    min("date").alias("min_date"),
    max("date").alias("max_date"),
).first()
min_date, max_date = date_bounds["min_date"], date_bounds["max_date"]

# An aggregation over zero rows returns NULL bounds. Fail here with a clear
# message instead of interpolating 'None' into the SQL below and crashing
# later with an unrelated error.
if min_date is None or max_date is None:
    raise ValueError(
        "No currency rows were loaded for the selected date window; "
        "cannot determine the date range to fill."
    )

# Generate a DataFrame with all dates between min and max date (including weekends)
date_range_df = spark.sql(
    f"SELECT explode(sequence(to_date('{min_date}'), to_date('{max_date}'), interval 1 day)) as date"
)

# Identify dates of the calendar range that are not present in the Delta data
existing_dates_df = currency_bulk_df.select("date").distinct()
missing_dates_df = date_range_df.join(existing_dates_df, "date", "leftanti")

# Keep only the missing Saturdays and Sundays
missing_weekends_df = (
    missing_dates_df
    .withColumn("day_of_week", day_name)
    .filter(col("day_of_week").isin("Saturday", "Sunday"))
)

# Find the corresponding Friday's data for each missing weekend date:
# Saturday -> 1 day back, Sunday -> 2 days back.
# NOTE(review): this is a left join, so a weekend date whose Friday has no
# rows is still emitted, with `data` = NULL -- confirm that is intended.
weekend_filled_df = (
    missing_weekends_df
    .withColumn(
        "related_friday",
        date_sub(col("date"), when(col("day_of_week") == "Saturday", 1).otherwise(2)),
    )
    .join(
        friday_df.withColumnRenamed("date", "friday_date"),
        col("related_friday") == col("friday_date"),
        "left",
    )
    .drop("related_friday", "day_of_week", "friday_date")
)

# Combine the original data and the new weekend data
filled_df = currency_bulk_df.unionByName(weekend_filled_df)

#### Insert one extra day of data as a buffer by copying the rows of the latest (max) date

In [0]:
# Append one buffer day: every row dated max_date is duplicated onto
# max_date + 1 day.

# max_date comes from the aggregation in the previous cell and is None when
# no rows were loaded; stop with a clear message rather than a TypeError.
if max_date is None:
    raise ValueError(
        "max_date is undefined (no currency rows were loaded); "
        "cannot create the buffer day."
    )

# The date which is one day after max_date (the date the buffer rows carry)
new_date = max_date + timedelta(days=1)

# All rows that have the max_date
max_rows = filled_df.filter(col("date") == max_date)

# Build the buffer rows inside Spark by shifting `date` forward one day.
# This avoids collecting rows to the driver and rebuilding them against a
# hand-written struct schema, so `data` keeps exactly the schema it has in
# filled_df.
new_rows_df = max_rows.selectExpr("date_add(`date`, 1) AS `date`", "data")

# Union the original DataFrame with the buffer rows, matching columns by name
final_df = filled_df.unionByName(new_rows_df)

In [0]:
#final_df.orderBy(col('date').desc()).display()

In [0]:
# Persist final_df as a Delta table at the target path.
# Mode "overwrite" replaces the data already stored there.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", currency_target_path)
    .save()
)