In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, date_format
import datetime

# Spark entry point for this notebook
spark = SparkSession.builder.appName("DateDimension").getOrCreate()

# Inclusive calendar bounds covered by the dimension
start_date = datetime.date(2016, 1, 1)
end_date = datetime.date(2020, 12, 31)
num_days = (end_date - start_date).days + 1  # +1 so end_date itself gets a row

# One row per day: spark.range yields id = 0 .. num_days - 1, used here as the
# day offset from start_date
calendar_df = spark.range(num_days).select(
    expr(f"date_add('{start_date}', cast(id as int))").alias("Date")
)

# Every output attribute, keyed by column name, expressed in terms of "Date"
date_attributes = {
    # Integer surrogate key in yyyyMMdd form, e.g. 20160101
    "SK_Date": date_format(col("Date"), "yyyyMMdd").cast("int"),
    "Date": col("Date"),
    "Day": expr("day(Date)"),
    "DayName": date_format(col("Date"), "EEEE"),
    # dayofweek numbers Sunday as 1 and Saturday as 7
    "DayOfWeek": expr("dayofweek(Date)"),
    "Month": expr("month(Date)"),
    "MonthName": date_format(col("Date"), "MMMM"),
    "Year": expr("year(Date)"),
    # True for Sunday (1) and Saturday (7)
    "IsWeekend": expr("dayofweek(Date) IN (1, 7)"),
}

# Final column order of the dimension table
columns = [
    "SK_Date", "Date", "Day", "DayName", "DayOfWeek",
    "Month", "MonthName", "Year", "IsWeekend"
]

# Build all attributes in one projection, in the order given by `columns`
date_df = calendar_df.select(
    *(date_attributes[name].alias(name) for name in columns)
)

# Preview the first rows without truncating cell contents
date_df.show(5, truncate=False)


+--------+----------+---+--------+---------+-----+---------+----+---------+
|SK_Date |Date      |Day|DayName |DayOfWeek|Month|MonthName|Year|IsWeekend|
+--------+----------+---+--------+---------+-----+---------+----+---------+
|20160101|2016-01-01|1  |Friday  |6        |1    |January  |2016|false    |
|20160102|2016-01-02|2  |Saturday|7        |1    |January  |2016|true     |
|20160103|2016-01-03|3  |Sunday  |1        |1    |January  |2016|true     |
|20160104|2016-01-04|4  |Monday  |2        |1    |January  |2016|false    |
|20160105|2016-01-05|5  |Tuesday |3        |1    |January  |2016|false    |
+--------+----------+---+--------+---------+-----+---------+----+---------+
only showing top 5 rows



In [2]:
# Print the column names, data types and nullability of the date dimension
date_df.printSchema()

root
 |-- SK_Date: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Day: integer (nullable = true)
 |-- DayName: string (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- MonthName: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- IsWeekend: boolean (nullable = true)

