In [0]:
%sql
-- Switch the session to cscie103_catalog.final_project so that unqualified
-- table names used later in this notebook (e.g. bronze_stores) resolve there.
USE cscie103_catalog.final_project

In [0]:
%sql
-- List the tables visible in the current schema.
SHOW tables

In [0]:
%sql
-- Preview every column of the bronze_stores table.
SELECT * FROM bronze_stores;

In [0]:
from pyspark.sql import functions as F

# Read the three source tables through the DataFrameReader.
# gold_df -> daily sales per store and family
gold_df = spark.read.table("cscie103_catalog.final_project.gold_daily_store_family")
# test_df -> model predictions for the test period
test_df = spark.read.table("cscie103_catalog.final_project.silver_test_predictions")
# val_df -> model predictions for the validation period
val_df = spark.read.table("cscie103_catalog.final_project.silver_validation_predictions")

In [0]:

# Actual sales: keep the keys, rename `sales` to `actual_sales`, and tag
# every row with the constant data_type "Actual".
actual_sales_df = gold_df.select(
    "date",
    "store_nbr",
    "family",
    gold_df["sales"].alias("actual_sales"),
    F.lit("Actual").alias("data_type"),
)
actual_sales_df.show()

In [0]:

# Test forecasts: same column layout as the actuals. The predicted value is
# deliberately stored under the name `actual_sales` so the frames can be
# combined by column name, and data_type becomes "Forecast - <scenario>".
# NOTE(review): F.concat yields NULL when `scenario` is NULL, so such rows
# would get a NULL data_type -- confirm `scenario` is always populated.
test_predictions_df = test_df.select(
    "date",
    "store_nbr",
    "family",
    test_df["predicted_sales"].alias("actual_sales"),
    F.concat(F.lit("Forecast - "), test_df["scenario"]).alias("data_type"),
)
test_predictions_df.show()

In [0]:
# Validation forecasts: same column layout as the actuals. The predicted
# value is stored under `actual_sales` and every row is tagged "Validation".
validation_predictions_df = val_df.select(
    "date",
    "store_nbr",
    "family",
    val_df["predicted_sales"].alias("actual_sales"),
    F.lit("Validation").alias("data_type"),
)
validation_predictions_df.show()


In [0]:
# Stack actuals, test forecasts and validation forecasts into one long table
# (columns are matched by name), then sort by date, store, family and type.
combined_sales_df = actual_sales_df.unionByName(test_predictions_df)
combined_sales_df = combined_sales_df.unionByName(validation_predictions_df)
combined_sales_df = combined_sales_df.sort("date", "store_nbr", "family", "data_type")

combined_sales_df.show()


In [0]:
# Date coverage (first and last date) of each data_type.
# The min/max is computed in Spark so that only one row per data_type is
# brought to the driver; converting the whole combined table to pandas first
# would pull every sales row into driver memory just to reduce it to a few rows.
(
    combined_sales_df
    .groupBy("data_type")
    .agg(
        F.min("date").alias("min_date"),
        F.max("date").alias("max_date"),
    )
    .orderBy("data_type")
    .toPandas()
    .set_index("data_type")
)

In [0]:
from pyspark.sql import functions as F

def calculate_ytd_growth_vs_last_year(df, group_cols):
    """
    Calculate year-to-date (YTD) sales growth versus the same window one year earlier.

    The "latest year" is the largest ``year(date)`` found in ``df`` and the
    YTD cut-off is the largest day-of-year observed within that year. The
    same day-of-year cut-off is applied to the previous year, so both windows
    span the same number of days; around leap years they can therefore end on
    different calendar dates.

    Every row of ``df`` inside the window is summed -- no other column is
    used to filter rows -- so the caller is responsible for passing only the
    rows that should count towards the totals.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain ``date``, ``actual_sales`` and every column named in
        ``group_cols``. Existing ``year`` / ``doy`` columns are overwritten
        inside the function.
    group_cols : str or iterable of str
        Column name(s) to aggregate by. A single name may be given as a
        plain string.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per group that has data in the latest year, with columns
        ``group_cols``, ``ytd_sales_current``, ``ytd_sales_prior``,
        ``ytd_growth_abs`` and ``ytd_growth_pct``. The prior-year and growth
        columns are NULL for groups without prior-year data, and
        ``ytd_growth_pct`` is also NULL when the prior-year total is 0.
        If ``df`` has no row with a usable date, the result has these
        columns but no rows.
    """
    # Normalise group_cols: a bare string becomes a one-element list and any
    # other iterable (tuple, list, ...) is copied into a list, so the list
    # concatenation and the join below always work.
    group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Add year and day-of-year
    dated = (
        df
        .withColumn("year", F.year("date"))
        .withColumn("doy", F.dayofyear("date"))
    )

    # 1-2. Latest year and its latest day-of-year, found in a single Spark job.
    # Structs are compared field by field, so max(struct(year, doy)) is the
    # largest doy within the largest year.
    latest = (
        dated
        .filter(F.col("year").isNotNull())
        .agg(F.max(F.struct("year", "doy")).alias("latest"))
        .first()["latest"]
    )

    if latest is None:
        # No row carries a usable date. Every `year` is NULL in that case, so
        # the placeholder values below never match and the function returns
        # an empty result instead of failing on `None - 1`.
        latest_year, latest_doy = 0, 0
    else:
        latest_year, latest_doy = latest["year"], latest["doy"]
    previous_year = latest_year - 1

    # 3. Filter to YTD window (current + prior year)
    ytd_df = dated.filter(
        F.col("year").isin(latest_year, previous_year)
        & (F.col("doy") <= latest_doy)
    )

    # 4. Aggregate YTD sales
    aggregated = (
        ytd_df
        .groupBy(group_cols + ["year"])
        .agg(F.sum("actual_sales").alias("ytd_sales"))
    )

    # 5. Split years
    current_ytd = (
        aggregated
        .filter(F.col("year") == latest_year)
        .select(*group_cols, F.col("ytd_sales").alias("ytd_sales_current"))
    )

    prior_ytd = (
        aggregated
        .filter(F.col("year") == previous_year)
        .select(*group_cols, F.col("ytd_sales").alias("ytd_sales_prior"))
    )

    # 6. Join and calculate growth. The left join keeps groups that only
    # exist in the latest year; their prior-year and growth values are NULL.
    result = (
        current_ytd
        .join(prior_ytd, on=group_cols, how="left")
        .withColumn(
            "ytd_growth_abs",
            F.col("ytd_sales_current") - F.col("ytd_sales_prior")
        )
        .withColumn(
            "ytd_growth_pct",
            # No otherwise(): the percentage stays NULL when the prior-year
            # total is 0 or missing, avoiding a division by zero.
            F.when(
                F.col("ytd_sales_prior") != 0,
                F.col("ytd_growth_abs") / F.col("ytd_sales_prior") * 100
            )
        )
    )

    return result


In [0]:
# YTD growth versus last year, one row per product family.
# NOTE(review): combined_sales_df stacks "Actual", "Forecast - <scenario>" and
# "Validation" rows and the function sums all of them without looking at
# data_type -- confirm that mixing these row types in one total is intended.
family_group_cols = ["family"]
family_ytd_growth = calculate_ytd_growth_vs_last_year(combined_sales_df, family_group_cols)
family_ytd_growth.show()

In [0]:
# YTD growth versus last year, one row per store.
# NOTE(review): combined_sales_df stacks "Actual", "Forecast - <scenario>" and
# "Validation" rows and the function sums all of them without looking at
# data_type -- confirm that mixing these row types in one total is intended.
store_group_cols = ["store_nbr"]
store_ytd_growth = calculate_ytd_growth_vs_last_year(combined_sales_df, store_group_cols)
store_ytd_growth.show()