In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import *

import ConnectionConfig as cc
# Project-local helper from ConnectionConfig; presumably prepares the
# environment Spark needs before a session is created - TODO confirm.
cc.setupEnvironment()


In [2]:
# Create the Spark session through the project helper.
# NOTE(review): "DATE_DIM" looks like the application name and 4 like the
# degree of parallelism - confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DATE_DIM",4)
# Last expression of the cell: evaluates to the currently active session.
spark.getActiveSession()

In [3]:
# EXTRACT
# Generate one row per calendar day between beginDate and endDate
# (Spark's sequence() includes both end points).
beginDate, endDate = '2009-01-01', '2023-12-31'

# sequence(...) builds an array of dates and explode(...) turns it into rows.
# dateSK comes from monotonically_increasing_id(): unique and increasing, but
# Spark does not guarantee consecutive values. The SQL transform cell further
# down does not select it.
df_SQL = spark.sql(
    f"select explode(sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate, monotonically_increasing_id() as dateSK "
)

# Register the rows as a temp view so later cells can query them by name.
df_SQL.createOrReplaceTempView('neededDates')

# Preview the view (show() prints the first 20 rows).
spark.sql("select * from neededDates").show()

+------------+------+
|calendarDate|dateSK|
+------------+------+
|  2009-01-01|     0|
|  2009-01-02|     1|
|  2009-01-03|     2|
|  2009-01-04|     3|
|  2009-01-05|     4|
|  2009-01-06|     5|
|  2009-01-07|     6|
|  2009-01-08|     7|
|  2009-01-09|     8|
|  2009-01-10|     9|
|  2009-01-11|    10|
|  2009-01-12|    11|
|  2009-01-13|    12|
|  2009-01-14|    13|
|  2009-01-15|    14|
|  2009-01-16|    15|
|  2009-01-17|    16|
|  2009-01-18|    17|
|  2009-01-19|    18|
|  2009-01-20|    19|
+------------+------+
only showing top 20 rows



In [4]:
#TRANSFORM
# Derive the date-dimension attributes from the neededDates temp view.
#   date_sk    - integer key built as year*10000 + month*100 + day (yyyyMMdd)
#   month_name - date_format pattern 'MMMM' (full month name)
#   day_name   - date_format pattern 'EEEE' (full day name)
#   day_nr     - dayofweek(): 1 = Sunday ... 7 = Saturday
#   is_weekday - 'Y' when weekday() < 5; weekday() counts 0 = Monday ... 6 = Sunday,
#                so Monday-Friday are 'Y' and Saturday/Sunday are 'N'
dimDate = spark.sql("""
select
  year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as date_sk,
  calendarDate as date,
  year(calendarDate) as year,
  date_format(calendarDate, 'MMMM') as month_name,
  month(calendarDate) as month_nr,
  date_format(calendarDate, 'EEEE') as day_name,
  dayofweek(calendarDate) as day_nr,
  case
    when weekday(calendarDate) < 5 then 'Y'
    else 'N'
  end as is_weekday,
  quarter(calendarDate) as quarter
from neededDates
order by calendarDate
""")

# Print the column names and types of the resulting dimension.
dimDate.printSchema()

root
 |-- date_sk: integer (nullable = false)
 |-- date: date (nullable = false)
 |-- year: integer (nullable = false)
 |-- month_name: string (nullable = false)
 |-- month_nr: integer (nullable = false)
 |-- day_name: string (nullable = false)
 |-- day_nr: integer (nullable = false)
 |-- is_weekday: string (nullable = false)
 |-- quarter: integer (nullable = false)



In [5]:
# DataFrame-API version of the date-dimension transform. It derives the same
# attributes as the Spark SQL cell above and additionally keeps the two source
# columns of df_SQL (calendarDate, dateSK).
# col, year, month, dayofmonth, date_format, dayofweek, when and quarter all
# come from the wildcard import of pyspark.sql.functions in the first cell.
calendar_date = col("calendarDate")

# Build every derived column in ONE projection instead of a chain of
# withColumn() calls: each withColumn() adds its own projection to the plan,
# and the PySpark documentation recommends a single select() when several
# columns are added. "*" keeps the source columns first, so the resulting
# column order is unchanged.
df_SparkSQL = (
    df_SQL
    .select(
        "*",
        # Integer key in yyyyMMdd form, e.g. 20090101.
        (year(calendar_date) * 10000 + month(calendar_date) * 100 + dayofmonth(calendar_date)).alias("date_sk"),
        calendar_date.alias("date"),
        year(calendar_date).alias("year"),
        date_format(calendar_date, 'MMMM').alias("month_name"),
        month(calendar_date).alias("month_nr"),
        date_format(calendar_date, 'EEEE').alias("day_name"),
        # dayofweek(): 1 = Sunday ... 7 = Saturday.
        dayofweek(calendar_date).alias("day_nr"),
        # 2..6 therefore covers Monday..Friday.
        when(dayofweek(calendar_date).between(2, 6), "Y").otherwise("N").alias("is_weekday"),
        quarter(calendar_date).alias("quarter"),
    )
    .orderBy(col("date"))
)

# Preview the first 20 rows.
df_SparkSQL.show()

+------------+------+--------+----------+----+----------+--------+---------+------+----------+-------+
|calendarDate|dateSK| date_sk|      date|year|month_name|month_nr| day_name|day_nr|is_weekday|quarter|
+------------+------+--------+----------+----+----------+--------+---------+------+----------+-------+
|  2009-01-01|     0|20090101|2009-01-01|2009|   January|       1| Thursday|     5|         Y|      1|
|  2009-01-02|     1|20090102|2009-01-02|2009|   January|       1|   Friday|     6|         Y|      1|
|  2009-01-03|     2|20090103|2009-01-03|2009|   January|       1| Saturday|     7|         N|      1|
|  2009-01-04|     3|20090104|2009-01-04|2009|   January|       1|   Sunday|     1|         N|      1|
|  2009-01-05|     4|20090105|2009-01-05|2009|   January|       1|   Monday|     2|         Y|      1|
|  2009-01-06|     5|20090106|2009-01-06|2009|   January|       1|  Tuesday|     3|         Y|      1|
|  2009-01-07|     6|20090107|2009-01-07|2009|   January|       1|Wednesd

In [6]:
# LOAD
# Persist the SQL-built dimension as the Delta table "date_dim".
# mode("overwrite") replaces any existing table contents, and
# overwriteSchema=true lets the write replace the table schema as well.
(
    dimDate.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("date_dim")
)


In [7]:
# Shut down the Spark session once the date_dim table has been written.
spark.stop()