In [1]:
from pyspark.sql.functions import col, explode, sequence, to_date, year, quarter, month, date_format, dayofweek

print("Calculating date range from Gold data...")
# 1. Get Min and Max dates dynamically from the actual data.
# A global aggregate always yields exactly one row, so .first() is safe here;
# when the table is empty (or every `date` is NULL) both values come back as None.
date_range = spark.sql("SELECT MIN(date) as min_date, MAX(date) as max_date FROM StockMarket_Gold").first()
min_date = date_range['min_date']
max_date = date_range['max_date']

# Fail early with a clear message instead of an opaque
# "'NoneType' object has no attribute 'year'" further down, and before the
# existing DimDate table gets overwritten.
if min_date is None or max_date is None:
    raise ValueError(
        "StockMarket_Gold contains no non-null 'date' values; cannot build DimDate."
    )

# Extend the range to whole calendar years (Jan 1 of the first year through
# Dec 31 of the last year) so the dimension has no partial years.
# NOTE(review): assumes `date` is a DATE/TIMESTAMP column so the collected
# values expose `.year` -- confirm against the Gold table schema.
from datetime import date, timedelta
start_date = date(min_date.year, 1, 1)
end_date = date(max_date.year, 12, 31)
print(f"Generating Date Dimension from {start_date} to {end_date}...")

# 2. Generate one row per calendar day, both bounds inclusive.
# The interpolated values are Python `date` objects (rendered as YYYY-MM-DD),
# not user input, so building the SQL string here is safe.
days_df = spark.sql(f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day)) as date")

# 3. Add all standard calendar columns (the equivalent of the DAX calendar table).
dim_date_df = days_df.select(
    col("date"),
    year("date").alias("Year"),
    quarter("date").alias("QuarterNumber"),
    # NOTE(review): with Spark 3 datetime patterns a single "Q" renders the
    # quarter as a bare digit ("1".."4"), i.e. a string copy of QuarterNumber.
    # If a label such as "Q1" is wanted, the pattern would be "QQQ" -- confirm
    # with report owners before changing, since it alters stored values.
    date_format("date", "Q").alias("Quarter"),
    month("date").alias("MonthNumber"),
    date_format("date", "MMMM").alias("Month"),
    date_format("date", "MMM").alias("MonthShort"),
    # dayofweek numbering is 1 = Sunday ... 7 = Saturday. Adjust if a
    # Monday-based week is needed.
    dayofweek("date").alias("DayOfWeekNumber"),
    date_format("date", "EEEE").alias("DayOfWeek"),
    date_format("date", "E").alias("DayOfWeekShort"),
    date_format("date", "yyyy-MM").alias("YearMonth"),
    date_format("date", "yyyy-'Q'Q").alias("YearQuarter")
)

# 4. Write to the Lakehouse as a physical Delta table.
# mode("overwrite") + overwriteSchema replaces both the data and the schema of
# any existing DimDate table on every run.
dim_date_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("DimDate")

print("Success! 'DimDate' table created in Lakehouse.")

StatementMeta(, 23a34340-3f34-4e26-b079-93acf88b367c, 3, Finished, Available, Finished)

Calculating date range from Gold data...
Generating Date Dimension from 2020-01-01 to 2025-12-31...
Success! 'DimDate' table created in Lakehouse.
