# Create Dimensions for Date and Time

## Description
- Create table for date dimension
- Create table for time dimension in minutes
- Create a mapping table and views that combine the date and time-minute dimensions
- Create days_off table
- Flag the holidays from days_off in the date dimension (`IsHoliday`); the `IsCompensation` update is currently commented out

## ToDo: Remove hard-coded values!

## Preparation

In [0]:
%run ../../utilities/altyca_utility

In [0]:
# Instantiate the shared helper object used below to look up the environment.
# NOTE(review): `Utility` is not defined in this notebook; presumably it is
# provided by the `%run ../../utilities/altyca_utility` cell above - confirm.
alu = Utility()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

import json
from datetime import datetime

In [0]:
# Register the notebook parameters as text widgets: (name, default value, label).
for widget_name, widget_default, widget_label in (
    ("catalog", "altyca_demo", "Catalog"),
    ("start_date", "2020-01-01", "Start Date"),
    ("end_date", "2030-12-31", "End Date"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
environment = alu.get_environment()
# NOTE: the catalog name is used exactly as entered in the widget. An
# environment-suffixed variant (f"{catalog}_{environment}") is currently disabled.
catalog = f"{dbutils.widgets.get('catalog')}"


def _read_iso_date_widget(widget_name):
    """
    Return the value of a date widget normalised to 'YYYY-MM-DD'.

    The value is later interpolated into SQL text, so it is validated here:
    anything that is not a calendar date raises ValueError immediately instead
    of failing later inside Spark.
    """
    raw_value = dbutils.widgets.get(widget_name).strip()
    try:
        parsed = datetime.strptime(raw_value, "%Y-%m-%d")
    except ValueError as err:
        raise ValueError(
            f"Widget '{widget_name}' must be a date in YYYY-MM-DD format, got {raw_value!r}"
        ) from err
    return parsed.date().isoformat()


start_date = _read_iso_date_widget("start_date")
end_date = _read_iso_date_widget("end_date")
# ISO formatted dates with 4-digit years compare correctly as plain strings.
if start_date > end_date:
    raise ValueError(
        f"start_date ({start_date}) must not be after end_date ({end_date})"
    )

# Fully qualified names of the objects created by this notebook.
schema = "shared_dimensions"
dim_date_table = f"{catalog}.{schema}.dim_date"
days_off_table = f"{catalog}.{schema}.days_off"
time_minute_table = f"{catalog}.{schema}.dim_time_minute"
mapping_minute_table = f"{catalog}.{schema}.dim_mapping_minute"
# Comment text attached to the tables and views created below.
comment = f"This object is created using the Job Dim Date Deploy Job and contains the date up to date {start_date} to {end_date}"

In [0]:
# Summarise the effective configuration in the cell output, one setting per line.
run_summary = [
    "Create Dimension Tables for Date and Time",
    f"Timerange: {start_date} to {end_date}",
    f"Catalog: {catalog}",
    f"Schema: {schema}",
    f"Dim Date Table: {dim_date_table}",
    f"Days Off Table: {days_off_table}",
    f"Time Minute Table: {time_minute_table}",
    f"Mapping Minute Table: {mapping_minute_table}",
    f"This Notebook runs in Environment: {environment}",
]
print("\n".join(run_summary))

In [0]:
# Translation tables for month and weekday names. Each row is:
#   English name, German, German short, French, French short, Italian, Italian short
# Add further languages by appending columns (and consuming them where the
# dictionaries are used).
_MONTH_ROWS = (
    ("January", "Januar", "Jan.", "janvier", "janv.", "gennaio", "gen."),
    ("February", "Februar", "Feb.", "février", "fév.", "febbraio", "feb."),
    ("March", "März", "Mär.", "mars", "mar.", "marzo", "mar."),
    ("April", "April", "Apr.", "avril", "avr.", "aprile", "apr."),
    ("May", "Mai", "Mai.", "mai", "mai.", "maggio", "mag."),
    ("June", "Juni", "Jun.", "juin", "jui.", "giugno", "giu."),
    ("July", "Juli", "Jul.", "juillet", "juil.", "luglio", "lug."),
    ("August", "August", "Aug.", "août", "aoû.", "agosto", "ago."),
    ("September", "September", "Sep.", "septembre", "sept.", "settembre", "set."),
    ("October", "Oktober", "Okt.", "octobre", "oct.", "ottobre", "ott."),
    ("November", "November", "Nov.", "novembre", "nov.", "novembre", "nov."),
    ("December", "Dezember", "Dez.", "décembre", "déc.", "dicembre", "dic."),
)

_DAY_ROWS = (
    ("Monday", "Montag", "Mo.", "lundi", "lun.", "lunedì", "lun."),
    ("Tuesday", "Dienstag", "Di.", "mardi", "mar.", "martedì", "mar."),
    ("Wednesday", "Mittwoch", "Mi.", "mercredi", "mer.", "mercoledì", "mer."),
    ("Thursday", "Donnerstag", "Do.", "jeudi", "jeu.", "giovedì", "gio."),
    ("Friday", "Freitag", "Fr.", "vendredi", "ven.", "venerdì", "ven."),
    ("Saturday", "Samstag", "Sa.", "samedi", "sam.", "sabato", "sab."),
    ("Sunday", "Sonntag", "So.", "dimanche", "dim.", "domenica", "dom."),
)

# English name -> list of the six translations, in row order.
months_dict = {english: translations for english, *translations in _MONTH_ROWS}
days_dict = {english: translations for english, *translations in _DAY_ROWS}

date_mapping_dict = {"Months": months_dict, "Days": days_dict}


# Nested mapping of window groups: "<size>Min" -> {window id -> "MM-MM"}.
# Window ids start at 1; each label gives the start minute (inclusive) and the
# end minute (exclusive) of the window within the hour, zero-padded to 2 digits.
time_mapping_dict = {
    f"{size}Min": {
        index + 1: f"{index * size:02d}-{(index + 1) * size:02d}"
        for index in range(60 // size)
    }
    for size in (5, 10, 15, 20, 30, 60)
}

## Define function(s)

In [0]:
def _window_group_name(minute, window_type):
    """
    Return the "MM-MM" label of the window that contains ``minute``.

    ``window_type`` is a key of ``time_mapping_dict`` (e.g. "5Min"); an unknown
    key raises KeyError. A NULL input, or a minute outside every window of the
    given type, yields None (SQL NULL) instead of failing the whole job.
    """
    if minute is None or window_type is None:
        return None
    for label in time_mapping_dict[window_type].values():
        lower, upper = label.split("-")
        # Lower bound inclusive, upper bound exclusive.
        if int(lower) <= minute < int(upper):
            return label
    return None


# UDF that maps (minute, window type) to the label of the matching window group.
get_window_group = udf(_window_group_name, StringType())


# Add the window columns for one window size to a time dataframe
def create_time_window(
    df,
    duration,
    timestampCol="TimestampStartUtc",
    minuteCol="TimeMinute",
    secondCol="TimeSecond",
):
    """
    Return ``df`` extended by three columns for windows of ``duration`` minutes:
    - IsWindow<duration>MinuteStart: True when the row is the first second of a window, else False
    - Window<duration>MinuteId: 1-based number of the window within the hour (int)
    - Window<duration>MinuteName: label of the window taken from get_window_group (string)

    ``minuteCol`` and ``secondCol`` name the input columns holding the minute and
    second of the row. ``timestampCol`` is accepted but not used by this function.
    """
    prefix = f"Window{duration}Minute"

    # A window starts on a minute divisible by the duration, at second 0.
    at_window_start = (col(minuteCol) % duration == 0) & (col(secondCol) == 0)
    start_flag = when(at_window_start, True).otherwise(False)
    # Minutes 0..duration-1 belong to window 1, the next `duration` minutes to window 2, ...
    group_id = ceil((col(minuteCol) + 1) / duration).cast("integer")
    group_name = get_window_group(minuteCol, lit(f"{duration}Min"))

    df = df.withColumn(f"Is{prefix}Start", start_flag)
    df = df.withColumn(f"{prefix}Id", group_id)
    return df.withColumn(f"{prefix}Name", group_name)


# Suffixes of the translated-name columns, in the same order as the entries of
# each translation list in date_mapping_dict (German, German short, French,
# French short, Italian, Italian short).
_TRANSLATION_SUFFIXES = ("De", "ShortDe", "Fr", "ShortFr", "It", "ShortIt")


def _translation_case(source_col, translations, position):
    """Build a SQL CASE expression mapping the English name in ``source_col`` to ``translations[name][position]``."""
    branches = " ".join(
        f"WHEN {source_col} == '{english}' THEN '{names[position]}'"
        for english, names in translations.items()
    )
    return f"CASE {branches} END"


def _add_translated_names(df, source_col, translations):
    """Append one column per entry of _TRANSLATION_SUFFIXES, named ``<source_col><suffix>``."""
    for position, suffix in enumerate(_TRANSLATION_SUFFIXES):
        df = df.withColumn(
            f"{source_col}{suffix}",
            expr(_translation_case(source_col, translations, position)),
        )
    return df


def add_date_columns(df):
    """
    Add the date related columns of the date dimension to ``df``.

    ``df`` must contain a ``Date`` column. The columns are appended in a fixed
    order (date, year, half year, quarter, month, week, day).

    Returns a tuple ``(df, date_columns)`` where ``date_columns`` is the list of
    ALL column names of the returned dataframe (columns that were already
    present in the input are included).
    """
    # date
    df = df.withColumn(
        "DatePK", expr("year(Date) * 10000 + month(Date) * 100 + day(Date)")
    ).withColumn("DateString", expr("date_format(Date, 'dd.MM.yyyy')"))

    # year
    # How to calculate leap year: https://learn.microsoft.com/en-us/office/troubleshoot/excel/determine-a-leap-year
    is_leap_year = (
        expr("year(Date) % 4 == 0") & expr("year(Date) % 100 != 0")
    ) | expr("year(Date) % 400 == 0")
    df = df.withColumn("Year", year(col("Date"))).withColumn(
        "IsLeapYear", when(is_leap_year, True).otherwise(False)
    )

    # halfyear
    in_first_half = month(col("Date")).between(1, 6)
    df = (
        df.withColumn("HalfYearId", when(in_first_half, 1).otherwise(2))
        .withColumn("YearHalfYearId", col("Year") * 10 + col("HalfYearId"))
        .withColumn("HalfYearName", when(in_first_half, "H1").otherwise("H2"))
        .withColumn("YearHalfYear", expr("concat(Year,'-', HalfYearName)"))
    )

    # quarter
    df = (
        df.withColumn("Quarter", expr("quarter(Date)"))
        .withColumn("YearQuarterId", expr("year(Date) * 10 + quarter(Date)"))
        .withColumn("YearQuarter", expr("concat(Year,'-Q', quarter(Date))"))
        .withColumn("QuarterShortname", expr("concat('Q', quarter(Date))"))
    )

    # month
    df = (
        df.withColumn("YearMonthId", expr("CAST(date_format(Date, 'yyyyMM') AS INT)"))
        .withColumn("Month", month("Date"))
        .withColumn("MonthName", expr("date_format(Date, 'MMMM')"))
        .withColumn("MonthNameShort", expr("date_format(Date, 'MMM')"))
    )
    # MonthNameDe, MonthNameShortDe, MonthNameFr, MonthNameShortFr, MonthNameIt, MonthNameShortIt
    df = _add_translated_names(df, "MonthName", date_mapping_dict["Months"])

    # week
    # WEEKS is the ISO 8601 week number: weeks start on Monday and week 1 is the
    # first week with more than 3 days in the year. Early-January dates can
    # therefore belong to week 52/53 of the previous year and late-December
    # dates to week 1 of the next year (2005-01-02 is in week 53 of 2004,
    # 2012-12-31 is in week 1 of 2013). The year combined with the week number
    # must be the ISO week-numbering year (YEAROFWEEK), not the calendar year,
    # otherwise e.g. 2021-01-01 would be labelled 2021-W53.
    df = (
        df.withColumn(
            "WeekIdIso",
            expr("extract(YEAROFWEEK FROM Date) * 100 + extract(WEEKS FROM Date)"),
        )
        .withColumn("WeekOfYearIso", expr("extract(WEEKS FROM Date)"))
        .withColumn("WeekOfYearIsoName", expr("concat('W', WeekOfYearIso)"))
        .withColumn(
            "YearWeekOfYearIso",
            expr("concat(extract(YEAROFWEEK FROM Date), '-', WeekOfYearIsoName)"),
        )
    )

    # day
    df = df.withColumn("DayName", expr("date_format(Date, 'EEEE')")).withColumn(
        "DayNameShort", expr("date_format(Date, 'EEE')")
    )
    # DayNameDe, DayNameShortDe, DayNameFr, DayNameShortFr, DayNameIt, DayNameShortIt
    df = _add_translated_names(df, "DayName", date_mapping_dict["Days"])
    df = (
        df.withColumn("DayOfYear", expr("dayofyear(Date)"))
        .withColumn("Day", expr("dayofmonth(Date)"))
        .withColumn("DayOfWeekUs", expr("extract(DOW FROM Date)"))
        .withColumn("DayOfWeekIso", expr("extract(DOW_ISO FROM Date)"))
        .withColumn("IsWeekDay", expr("DayOfWeekIso < 6"))
        .withColumn("IsLastDayOfMonth", expr("Date = last_day(Date)"))
        .withColumn("LastDayOfMonth", expr("last_day(Date)"))
        # 1972 is a leap year, so February 29th can be represented as well.
        .withColumn("MonthDay", expr("make_date(1972, Month, Day)"))
        .withColumn("StartOfWeekUs", expr("date_sub(Date, DayOfWeekUs-1)"))
        .withColumn("StartOfWeekIso", expr("date_sub(Date, DayOfWeekIso-1)"))
        .withColumn("EndOfWeekUs", expr("date_add(Date, 7-DayOfWeekUs)"))
        .withColumn("EndOfWeekIso", expr("date_add(Date, 7-DayOfWeekIso)"))
        # Placeholders; they are created as NULL booleans here.
        .withColumn("IsCompensation", lit(None).cast("boolean"))
        .withColumn("IsHoliday", lit(None).cast("boolean"))
    )

    date_columns = df.columns

    # return results
    return df, date_columns


def add_time_columns(df):
    """
    Add the time related columns to ``df``.

    ``df`` must contain a ``TimestampPK`` column (cast to a timestamp below).
    Returns a tuple ``(df, time_columns)`` where ``time_columns`` is the list of
    all column names of the returned dataframe.
    """
    # (column name, expression) pairs, applied in order because later
    # expressions refer to columns created by earlier ones.
    # NOTE: a CET variant of the timestamp (from_utc_timestamp with
    # "Europe/Zurich") is currently disabled.
    derived_columns = [
        # timestamp and time of day derived from TimestampPK
        ("TimestampStartUtc", col("TimestampPK").cast("timestamp")),
        ("TimePK", expr("TimestampPK%86400")),
        ("Time", col("TimePK").cast("timestamp")),
        ("TimeString", date_format(col("Time"), "HH:mm:ss")),
        # time parts
        ("TimeHour", hour("Time")),
        ("TimeMinute", minute("Time")),
        ("TimeSecond", second("Time")),
        ("AmPm", date_format("Time", "a")),
        # second and minute identifiers (HHMMSS and HHMM as numbers)
        ("TimeSecondId", expr("TimeHour*10000+TimeMinute*100+TimeSecond")),
        ("TimeMinuteId", expr("TimeHour*100+TimeMinute")),
    ]
    for column_name, column_expr in derived_columns:
        df = df.withColumn(column_name, column_expr)

    # window columns for every supported window size (in minutes)
    for window_minutes in (5, 10, 15, 20, 30, 60):
        df = create_time_window(df, window_minutes)

    return df, df.columns


def expand_date_to_datetime_table(df_date, resolution_in_seconds=1):
    """
    Expand the date range of ``df_date`` into a date-time table.

    ``df_date`` must contain a ``Date`` column. One row is generated every
    ``resolution_in_seconds`` seconds, from the start of the earliest date up to
    the end of the latest date (the whole last day is covered).

    Returns a tuple ``(df, time_columns)`` where ``time_columns`` is the column
    list returned by ``add_time_columns``.

    Raises ValueError when ``df_date`` contains no dates.

    NOTE(review): the conversion between dates and epoch seconds goes through
    timestamp casts; this presumably assumes a UTC session time zone - confirm.
    """
    seconds_per_day = 86400

    # `min` / `max` are the Spark aggregate functions here (the notebook
    # star-imports pyspark.sql.functions), not the Python builtins.
    bounds = (
        df_date.select(to_timestamp("Date").cast(LongType()).alias("Seconds"))
        .agg(min("Seconds").alias("min"), max("Seconds").alias("max"))
        .collect()[0]
    )
    first_second, last_day_start = bounds["min"], bounds["max"]
    if first_second is None:
        raise ValueError("df_date contains no dates; cannot build a date-time table")

    # `last_day_start` is the first second of the last date; add a full day so
    # the last date gets all of its rows and not only 00:00:00.
    df = spark.range(
        first_second, last_day_start + seconds_per_day, resolution_in_seconds
    ).select(col("id").cast(LongType()).alias("TimestampPK"))

    # add time columns
    df, time_columns = add_time_columns(df)

    # add date columns
    df, _ = add_date_columns(df.withColumn("Date", to_date("TimestampStartUtc")))

    # in this table the date key is a foreign key to the date dimension
    df = df.withColumnRenamed("DatePK", "DateUtcFK")

    # add week time columns: WeekTimePK is the number of seconds since Monday
    # 00:00:00, WeekTime is that offset applied to Monday 1970-01-05.
    df = (
        df.withColumn("WeekTimePK", (col("DayOfWeekIso") - 1) * 86400 + col("TimePK"))
        .withColumn(
            "WeekTime",
            to_timestamp(
                to_timestamp(lit("1970-01-05")).cast(LongType()) + col("WeekTimePK")
            ),
        )
        # durations expressed as a fraction of days
        .withColumn("WeekTimeDuration", col("WeekTimePK") / 86400.0)
        .withColumn("DayTimeDuration", col("TimePK") / 86400.0)
    )

    # return results
    return df, time_columns


## Create dimension date

In [0]:
# One row per calendar day between start_date and end_date, extended with all
# date attributes; date_columns receives the resulting column list.
date_sequence_sql = (
    "SELECT EXPLODE(SEQUENCE("
    f"to_date('{start_date}'), to_date('{end_date}'), INTERVAL 1 DAY"
    ")) AS Date"
)
df_date_range, date_columns = add_date_columns(spark.sql(date_sequence_sql))

In [0]:
# Visual check of the generated date dimension before it is written.
display(df_date_range)

## Create dimension time in minutes

In [0]:
# Expand the date range twice: once with one row per second and once with one
# row per minute. time_columns ends up holding the list from the second call.
df_time_range_seconds, time_columns = expand_date_to_datetime_table(df_date_range, 1)
df_time_range_minutes, time_columns = expand_date_to_datetime_table(df_date_range, 60)

## Prepare Mapping Table

In [0]:
# The mapping table keeps only the keys: the date key, the timestamp of the
# minute and the week time key.
date_timestamp_columns = ["DateUtcFK", "TimestampStartUtc", "WeekTimePK"]

df_mapping_minutes = df_time_range_minutes.select(*date_timestamp_columns)

## Prepare Time Table


In [0]:
# The time tables hold one row per distinct time of the week, so a single full
# week of data is enough to derive them. Use the first seven days of the range:
# this always contains every weekday (if the range spans at least a week),
# whereas "the month of start_date" misses weekdays when start_date lies near
# the end of a month.
week_time_filter_query = (
    f"Date >= to_date('{start_date}') AND Date < date_add(to_date('{start_date}'), 7)"
)

# Week-time columns plus the time columns, without the per-timestamp keys, so
# that distinct() collapses the rows to one per time of the week.
df_time_seconds = (
    df_time_range_seconds.filter(week_time_filter_query)
    .select(
        ["WeekTimePK", "WeekTime", "WeekTimeDuration", "DayTimeDuration"] + time_columns
    )
    .drop("TimestampPK", "TimestampStartUtc")
    .distinct()
)

# Same columns as the seconds table, at minute resolution.
df_time_minutes = (
    df_time_range_minutes.filter(week_time_filter_query)
    .select(df_time_seconds.columns)
    .distinct()
)

In [0]:
# Recreate the date dimension table from scratch and attach the comment text.
spark.sql(f"DROP TABLE IF EXISTS {dim_date_table}")

dim_date_writer = (
    df_date_range.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("comment", comment)
)
dim_date_writer.saveAsTable(dim_date_table)

In [0]:
# Drop table if exists and create new with comment (same write options as the
# date dimension and the mapping table).
spark.sql(f"DROP TABLE IF EXISTS {time_minute_table}")

df_time_minutes.write.format("delta").mode("overwrite").option(
    "overwriteSchema", "true"
).option("comment", comment).saveAsTable(time_minute_table)

In [0]:
# Recreate the minute mapping table from scratch and attach the comment text.
spark.sql(f"DROP TABLE IF EXISTS {mapping_minute_table}")

mapping_writer = (
    df_mapping_minutes.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("comment", comment)
)
mapping_writer.saveAsTable(mapping_minute_table)

## Create View

In [0]:
# View that combines the date dimension with the minute dimension through the
# mapping table: one row per date and minute, carrying all date attributes (dd)
# and all time attributes (tm). The source tables are referenced through the
# table-name variables defined in the configuration cell so that the view
# always reads the tables this notebook has just written.
sql = f"""
CREATE OR REPLACE VIEW {catalog}.{schema}.vw_dim_date_minute
COMMENT '{comment}' AS
  SELECT
    dd.Date,
    dd.DatePK,
    dd.DateString,
    dd.Year,
    dd.IsLeapYear,
    dd.HalfYearId,
    dd.YearHalfYearId,
    dd.HalfYearName,
    dd.YearHalfYear,
    dd.Quarter,
    dd.YearQuarterId,
    dd.YearQuarter,
    dd.QuarterShortname,
    dd.YearMonthId,
    dd.Month,
    dd.MonthName,
    dd.MonthNameShort,
    dd.MonthNameDe,
    dd.MonthNameShortDe,
    dd.MonthNameFr,
    dd.MonthNameShortFr,
    dd.MonthNameIt,
    dd.MonthNameShortIt,
    dd.WeekIdIso,
    dd.WeekOfYearIso,
    dd.WeekOfYearIsoName,
    dd.YearWeekOfYearIso,
    dd.DayName,
    dd.DayNameShort,
    dd.DayNameDe,
    dd.DayNameShortDe,
    dd.DayNameFr,
    dd.DayNameShortFr,
    dd.DayNameIt,
    dd.DayNameShortIt,
    dd.DayOfYear,
    dd.Day,
    dd.DayOfWeekUs,
    dd.DayOfWeekIso,
    dd.IsWeekDay,
    dd.IsLastDayOfMonth,
    dd.LastDayOfMonth,
    dd.MonthDay,
    dd.StartOfWeekUs,
    dd.StartOfWeekIso,
    dd.EndOfWeekUs,
    dd.EndOfWeekIso,
    dd.IsCompensation,
    dd.IsHoliday,

    tm.WeekTimePK,
    tm.WeekTime,
    tm.WeekTimeDuration,
    tm.DayTimeDuration,
    tm.TimePK,
    tm.Time,
    tm.TimeString,
    tm.TimeHour,
    tm.TimeMinute,
    tm.TimeSecond,
    tm.AmPm,
    tm.TimeSecondId,
    tm.TimeMinuteId,
    tm.IsWindow5MinuteStart,
    tm.Window5MinuteId,
    tm.Window5MinuteName,
    tm.IsWindow10MinuteStart,
    tm.Window10MinuteId,
    tm.Window10MinuteName,
    tm.IsWindow15MinuteStart,
    tm.Window15MinuteId,
    tm.Window15MinuteName,
    tm.IsWindow20MinuteStart,
    tm.Window20MinuteId,
    tm.Window20MinuteName,
    tm.IsWindow30MinuteStart,
    tm.Window30MinuteId,
    tm.Window30MinuteName,
    tm.IsWindow60MinuteStart,
    tm.Window60MinuteId,
    tm.Window60MinuteName

  FROM {dim_date_table} AS dd
  LEFT OUTER JOIN {mapping_minute_table} AS mm ON dd.DatePK = mm.DateUtcFK
  LEFT OUTER JOIN {time_minute_table} AS tm ON mm.WeekTimePK = tm.WeekTimePK
"""
spark.sql(sql).display()

In [0]:
# Create one view per window size on top of vw_dim_date_minute. Each view keeps
# only the rows that start a window of that size (IsWindow<N>MinuteStart):
# vw_dim_date_5_minute, vw_dim_date_10_minute, vw_dim_date_15_minute,
# vw_dim_date_20_minute, vw_dim_date_30_minute and vw_dim_date_60_minute.
for window_minutes in (5, 10, 15, 20, 30, 60):
    sql = f"""
CREATE OR REPLACE VIEW {catalog}.{schema}.vw_dim_date_{window_minutes}_minute
COMMENT '{comment}' AS
SELECT * FROM {catalog}.{schema}.vw_dim_date_minute
WHERE IsWindow{window_minutes}MinuteStart = true
"""
    spark.sql(sql).display()


# Append the days off

In [0]:
# Announce and drop the days-off table so that the next cell recreates it empty.
print(f"Create Days-Off Table: {days_off_table}")

sql = f"""
    DROP TABLE IF EXISTS {days_off_table}
"""
spark.sql(sql).display()

In [0]:
# Create the days-off table: an identity key plus the date, the type and a
# free-text comment of each day off. The table comment reuses `comment`.
sql = f"""
    CREATE TABLE IF NOT EXISTS {days_off_table} (
        Id BIGINT GENERATED ALWAYS AS IDENTITY,
        Date DATE, 
        Type STRING, 
        Comment STRING,
        
        PRIMARY KEY (Id)
    )
    COMMENT '{comment}'
"""
spark.sql(sql).display()

In [0]:
# Load the days off from dates.json. Each entry is read through its "date",
# "type" and "name" keys. JSON text is UTF-8, so the encoding is set explicitly.
with open("dates.json", "r", encoding="utf-8") as file:
    all_holidays = json.load(file)


def _sql_string_literal(value):
    """
    Render ``value`` as a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped so that a value such as
    "New Year's Day" cannot terminate the literal early.
    NOTE(review): this assumes the default Spark SQL literal parsing
    (spark.sql.parser.escapedStringLiterals = false) - confirm.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# One "(date, type, name)" tuple per entry, joined into the VALUES list.
values = ",\n        ".join(
    "("
    + ", ".join(_sql_string_literal(holiday[key]) for key in ("date", "type", "name"))
    + ")"
    for holiday in all_holidays
)

if values:
    # Insert holidays into the table
    sql = f"""
    INSERT INTO {days_off_table} (Date, Type, Comment)
    VALUES 
        {values}
"""
    spark.sql(sql).display()
else:
    # An empty VALUES list is not valid SQL; there is simply nothing to insert.
    print(f"No entries found in dates.json; nothing inserted into {days_off_table}")

In [0]:
# Debug output: show the VALUES list that was built for the insert above.
display(values)

In [0]:
# Debug output: days_off_table is the table NAME (a string), so this shows the
# name only; the table content is displayed by the next cell.
display(days_off_table)

In [0]:
# Show the content of the days-off table.
sql = f"SELECT * FROM {days_off_table}"
spark.sql(sql).display()

In [0]:
# Collect the dates of all days off whose type is not 'Kompensation'; these are
# the dates that get flagged as holidays in the date dimension.
holiday_query = f"SELECT Date FROM {days_off_table} WHERE Type != 'Kompensation'"
holiday_df = spark.sql(holiday_query)
holiday_dates = [holiday_row["Date"] for holiday_row in holiday_df.collect()]
display(holiday_dates)

In [0]:
# Flag the collected holiday dates in the date dimension. NULL dates are
# skipped, and the update is skipped entirely when there is nothing to flag,
# because "IN ()" is not valid SQL.
quoted_holiday_dates = ",".join(
    f"'{holiday_date}'" for holiday_date in holiday_dates if holiday_date is not None
)

if quoted_holiday_dates:
    update_query = f"""
    UPDATE {dim_date_table}
    SET IsHoliday = True
    WHERE Date IN ({quoted_holiday_dates})
"""
    spark.sql(update_query).display()
else:
    print(f"No holiday dates found; IsHoliday in {dim_date_table} left unchanged")

In [0]:
# Collect the dates of all days off of type 'Kompensation'.
compensation_query = f"SELECT Date FROM {days_off_table} WHERE Type = 'Kompensation'"
compensation_df = spark.sql(compensation_query)
compensation_dates = [
    compensation_row["Date"] for compensation_row in compensation_df.collect()
]

In [0]:
# update_query = f"""
#     UPDATE {dim_date_table}
#     SET IsCompensation = True
#     WHERE Date IN ({','.join([f"'{date}'" for date in compensation_dates])})
# """
# spark.sql(update_query).display()

In [0]:
# Read the date dimension sorted by Date and write the result back over the
# same table.
# NOTE(review): this reads from and overwrites the same table, and no format is
# given on the write - presumably this relies on Delta being the default table
# format; confirm. Row order of a stored table is generally not guaranteed to
# be preserved for readers, so consumers should still ORDER BY Date.
reorder_query = f"""
    SELECT * FROM {dim_date_table}
    ORDER BY Date
"""
reordered_df = spark.sql(reorder_query)
reordered_df.write.mode("overwrite").saveAsTable(dim_date_table)

In [0]:
# Finish the notebook run and hand "Success" back as its exit value.
# NOTE(review): cells below this one are presumably only reachable when run
# manually - confirm.
dbutils.notebook.exit("Success")

In [0]:
# Manual check: show the final content of the date dimension table.
sql = f"""SELECT * FROM {dim_date_table}"""
spark.sql(sql).display()