In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
# Project-local helper from ConnectionConfig; must run before any Spark session is created.
# NOTE(review): presumably sets the environment variables / paths Spark needs — confirm in ConnectionConfig.
cc.setupEnvironment()


In [2]:
# Start (or attach to) the local Spark session used by every later cell.
# NOTE(review): the arguments are passed positionally to the project helper; "DATE_DIM"
# looks like the application name and 4 like a parallelism setting — confirm in ConnectionConfig.
spark = cc.startLocalCluster("DATE_DIM", 4)

# Display the active session as the cell output to confirm the session is up.
SparkSession.getActiveSession()

In [3]:
from pyspark.sql.functions import *

# Inclusive bounds (yyyy-MM-dd) of the calendar covered by the date dimension.
beginDate = '2009-01-01'
endDate = '2023-12-31'

# One row per calendar day between beginDate and endDate (inclusive).
# dateSK is the 0-based position of the day within the generated sequence.
# posexplode is used instead of monotonically_increasing_id(), because the latter is only
# guaranteed to be unique and increasing, NOT consecutive or stable: its values depend on
# how the data happens to be partitioned when the plan is evaluated. The position returned
# by posexplode is deterministic; it is cast to bigint so the column keeps the same type
# that monotonically_increasing_id() produced.
df_SQL = spark.sql(f"""
    select
        calendarDate,
        cast(dateSK as bigint) as dateSK
    from (
        select
            posexplode(
                sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)
            ) as (dateSK, calendarDate)
    ) as dayPositions
""")

# Expose the day list to Spark SQL under the name used by the dimension query.
df_SQL.createOrReplaceTempView('neededDates')

spark.sql("select * from neededDates").show()

+------------+------+
|calendarDate|dateSK|
+------------+------+
|  2009-01-01|     0|
|  2009-01-02|     1|
|  2009-01-03|     2|
|  2009-01-04|     3|
|  2009-01-05|     4|
|  2009-01-06|     5|
|  2009-01-07|     6|
|  2009-01-08|     7|
|  2009-01-09|     8|
|  2009-01-10|     9|
|  2009-01-11|    10|
|  2009-01-12|    11|
|  2009-01-13|    12|
|  2009-01-14|    13|
|  2009-01-15|    14|
|  2009-01-16|    15|
|  2009-01-17|    16|
|  2009-01-18|    17|
|  2009-01-19|    18|
|  2009-01-20|    19|
+------------+------+
only showing top 20 rows



In [4]:
# Build the date dimension from the neededDates view using Spark SQL.
# Notes on the derived columns:
#   - dateInt              : the date encoded as the integer yyyyMMdd.
#   - DayOfWeek            : Spark's dayofweek(), 1 = Sunday ... 7 = Saturday.
#   - DayOfWeekStartMonday : weekday() is 0 = Monday ... 6 = Sunday, so +1 gives 1 = Monday ... 7 = Sunday.
#   - IsWeekDay            : 'Y' for Monday-Friday (weekday() < 5), otherwise 'N'.
#   - IsLastDayOfMonth     : 'Y' when the date equals last_day() of its month.
#   - WeekOfYearIso        : weekofyear() follows ISO 8601 week numbering.
# A triple-quoted string replaces the backslash-continued literal (which breaks if any
# whitespace follows a backslash), and CalendarDate is aliased explicitly instead of
# relying on case-insensitive column resolution to produce the capitalised name.
dimDate = spark.sql("""
    select
        dateSK,
        year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as dateInt,
        calendarDate as CalendarDate,
        year(calendarDate) as CalendarYear,
        date_format(calendarDate, 'MMMM') as CalendarMonth,
        month(calendarDate) as MonthOfYear,
        date_format(calendarDate, 'EEEE') as CalendarDay,
        dayofweek(calendarDate) as DayOfWeek,
        weekday(calendarDate) + 1 as DayOfWeekStartMonday,
        case
            when weekday(calendarDate) < 5 then 'Y'
            else 'N'
        end as IsWeekDay,
        dayofmonth(calendarDate) as DayOfMonth,
        case
            when calendarDate = last_day(calendarDate) then 'Y'
            else 'N'
        end as IsLastDayOfMonth,
        dayofyear(calendarDate) as DayOfYear,
        weekofyear(calendarDate) as WeekOfYearIso,
        quarter(calendarDate) as QuarterOfYear
    from
        neededDates
    order by
        calendarDate
""")

dimDate.show()

+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|dateSK| dateInt|CalendarDate|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|     0|20090101|  2009-01-01|        2009|      January|          1|   Thursday|        5|                   4|        Y|         1|               N|        1|            1|            1|
|     1|20090102|  2009-01-02|        2009|      January|          1|     Friday|        6|                   5|        Y|         2|               N|        2|            1|            1|
|     2|20090103|  2009-01-03|        2009|      Januar

In [5]:
# DataFrame-API counterpart of the SQL dimension query: the same derived columns are
# added one by one to df_SQL. The column functions (col, year, when, expr, ...) come from
# the pyspark.sql.functions wildcard import in an earlier cell.
cal = col("calendarDate")  # the source date column every derived attribute is based on

df_SparkSQL = (
    df_SQL
    # Date encoded as the integer yyyyMMdd.
    .withColumn("dateInt", (year(cal) * 10000 + month(cal) * 100 + dayofmonth(cal)))
    # Same values; with Spark's default case-insensitive resolution this renames the column.
    .withColumn("CalendarDate", cal)
    .withColumn("CalendarYear", year(cal))
    .withColumn("CalendarMonth", date_format(cal, 'MMMM'))
    .withColumn("MonthOfYear", month(cal))
    .withColumn("CalendarDay", date_format(cal, 'EEEE'))
    # dayofweek(): 1 = Sunday ... 7 = Saturday.
    .withColumn("DayOfWeek", dayofweek(cal))
    # weekday(): 0 = Monday ... 6 = Sunday, shifted to 1 = Monday ... 7 = Sunday.
    .withColumn("DayOfWeekStartMonday", expr("weekday(calendarDate) + 1"))
    .withColumn("IsWeekDay", when(expr("weekday(calendarDate) < 5"), "Y").otherwise("N"))
    .withColumn("DayOfMonth", dayofmonth(cal))
    .withColumn("IsLastDayOfMonth", when(cal == expr("last_day(calendarDate)"), "Y").otherwise("N"))
    .withColumn("DayOfYear", dayofyear(cal))
    .withColumn("WeekOfYearIso", weekofyear(cal))
    .withColumn("QuarterOfYear", quarter(cal))
    .orderBy(cal)
)
df_SparkSQL.show()

+------------+------+--------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|CalendarDate|dateSK| dateInt|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+------------+------+--------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|  2009-01-01|     0|20090101|        2009|      January|          1|   Thursday|        5|                   4|        Y|         1|               N|        1|            1|            1|
|  2009-01-02|     1|20090102|        2009|      January|          1|     Friday|        6|                   5|        Y|         2|               N|        2|            1|            1|
|  2009-01-03|     2|20090103|        2009|      Januar

In [6]:
# Persist the SQL-built dimension (dimDate, not df_SparkSQL) as a Delta table named
# "dimDate"; "overwrite" replaces whatever the table held from a previous run.
(
    dimDate.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dimDate")
)


In [7]:
# Shut down the Spark session created at the top of the notebook; cells that use `spark`
# cannot be re-run after this without starting the session again.
spark.stop()