In [12]:
from pyspark.sql import functions as F
from helper import run_forecast, aggregate_to_granularity, build_features, train_test_split

from pyspark.sql import SparkSession

# Give Spark way more memory since you have 32GB RAM available
spark = SparkSession.builder \
    .appName("TimeSeriesForecast") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.sql.shuffle.partitions", "16") \
    .config("spark.default.parallelism", "8") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Load data (must include columns: date, sales, family, store_nbr)
df = (   
      spark.read.parquet('../notebooks/data/train.parquet')

      .withColumn("date", F.to_date(F.col("date"))))



In [13]:
df.show(2)

+----+------+------+--------------------+----------+---------+-----+-----------+-------+----------+----------+--------------------+----------------+--------------+---------------+--------------+-----------+------------+-----+----+-------------+
|type| state|  city|              family|      date|store_nbr|sales|onpromotion|cluster|is_holiday|dcoilwtico|       hash_features|strIndxer_family|strIndxer_city|strIndxer_state|strIndxer_type|day_of_week|day_of_month|month|year|is_salary_day|
+----+------+------+--------------------+----------+---------+-----+-----------+-------+----------+----------+--------------------+----------------+--------------+---------------+--------------+-----------+------------+-----+----+-------------+
|   D| Azuay|Cuenca|SCHOOL AND OFFICE...|2013-01-01|       37|  0.0|          0|      2|         1|     93.14|(1024,[124,560,79...|              27|            14|             15|             4|          3|           1|    1|2013|            0|
|   C|Guayas|Playas|

In [14]:
# agg_cols = ['family', 'store_nbr', 'state', 'city', 'type', 'cluster'] # there is one store per city and type and cluster so adding those feature is redundant
# agg_cols = ['family', 'store_nbr'] # there is one store per city and type and cluster so adding those feature is redundant

# # agg_cols = ['family']

# df_agg = aggregate_to_granularity(
#     df=df,
#     date_col='date',
#     target_col='sales',
#     group_cols=agg_cols,
#     agg="sum",  # sales are additive
#     # extra_numeric_aggs={"dcoilwtico": "mean"}  # optional
# )
# print(df_agg.count())
# df_agg.show(2)

# df_feat = build_features(
#     df=df_agg,
#     date_col='date',
#     target_col='sales',
#     group_cols=agg_cols,
#     lags=[1, 7, 14, 28],
#     mas=[7, 28],
#     # add_time_signals=cfg["features"]["add_time_signals"]
# )
# df_feat.show(10)
# print(df_feat.count())
# # df_feat.sort(agg_cols + ["date"]).select(agg_cols + ['date', 'sales', 'dcoilwtico', 'lag_1', 'lag_2']).show(10)

In [None]:
df.sample(0.1)

+----+------+------+--------------------+----------+---------+-----+-----------+-------+----------+----------+--------------------+----------------+--------------+---------------+--------------+-----------+------------+-----+----+-------------+
|type| state|  city|              family|      date|store_nbr|sales|onpromotion|cluster|is_holiday|dcoilwtico|       hash_features|strIndxer_family|strIndxer_city|strIndxer_state|strIndxer_type|day_of_week|day_of_month|month|year|is_salary_day|
+----+------+------+--------------------+----------+---------+-----+-----------+-------+----------+----------+--------------------+----------------+--------------+---------------+--------------+-----------+------------+-----+----+-------------+
|   C|Guayas|Playas|SCHOOL AND OFFICE...|2013-01-02|       35|  0.0|          0|      3|         0|     93.14|(1024,[191,373,47...|              27|             9|              7|             1|          4|           2|    1|2013|            0|
|   D|Manabi| Manta|

In [18]:
# Load config from YAML or dict
import yaml, json
cfg = yaml.safe_load(open("forecast_config.yaml"))

out = run_forecast(df.sample(0.1), cfg)

display(out["predictions"])         # per-group predictions on test window

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mhelper[39m[38;5;245m.[39m[38;5;247mpy[39m[38;5;245m:[39m[38;5;36m488[39m[38;5;245m [39m[38;5;100min[39m[38;5;245m [39m[38;5;247mrun_forecast[39m[38;5;245m([39m[38;5;245m)[39m[38;5;245m [39m[38;5;247mat[39m[38;5;245m [39m[38;5;36m23[39m[38;5;245m:[39m[38;5;36m49[39m[38;5;245m:[39m[38;5;36m36.106[39m


+----------+---------+----------+-----+-----+-----+------+------+-----------------+-----------------+---+---+----------+-----+----+
|    family|store_nbr|      date|sales|lag_1|lag_7|lag_14|lag_28|             ma_7|            ma_28|dow|dom|weekofyear|month|year|
+----------+---------+----------+-----+-----+-----+------+------+-----------------+-----------------+---+---+----------+-----+----+
|AUTOMOTIVE|        6|2013-12-23|  7.0|  4.0|  6.0|   1.0|   4.0|6.428571428571429|4.357142857142857|  2| 23|        52|   12|2013|
|AUTOMOTIVE|        6|2014-01-17|  3.0|  7.0| 10.0|   2.0|   6.0|5.428571428571429|             4.25|  6| 17|         3|    1|2014|
+----------+---------+----------+-----+-----+-----+------+------+-----------------+-----------------+---+---+----------+-----+----+
only showing top 2 rows


[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;245m)[39m[38;5;245m:[39m[38;5;245m [39m[38;5;100mNone[39m


+----------+---------+----------+-----+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
|    family|store_nbr|      date|sales|lag_1|lag_7|lag_14|lag_28|ma_7|            ma_28|dow|dom|weekofyear|month|year|
+----------+---------+----------+-----+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
|AUTOMOTIVE|       10|2013-09-07|  4.0|  3.0|  3.0|   2.0|   3.0| 3.0|             3.25|  7|  7|        36|    9|2013|
|AUTOMOTIVE|       10|2013-10-01|  3.0|  4.0|  3.0|   3.0|   6.0| 3.0|3.142857142857143|  3|  1|        40|   10|2013|
+----------+---------+----------+-----+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
only showing top 2 rows


[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;100melse[39m[38;5;245m:[39m[38;5;245m:[39m[38;5;245m [39m[38;5;100mNone[39m


+----------+---------+----------+-----+-----+-----+------+------+-----------------+------------------+---+---+----------+-----+----+
|    family|store_nbr|      date|sales|lag_1|lag_7|lag_14|lag_28|             ma_7|             ma_28|dow|dom|weekofyear|month|year|
+----------+---------+----------+-----+-----+-----+------+------+-----------------+------------------+---+---+----------+-----+----+
|AUTOMOTIVE|       10|2016-10-25|  3.0|  2.0|  6.0|   3.0|   4.0|              2.0|3.3214285714285716|  3| 25|        43|   10|2016|
|AUTOMOTIVE|       10|2016-11-02|  3.0|  3.0|  2.0|   1.0|   3.0|2.142857142857143|3.3214285714285716|  4|  2|        44|   11|2016|
+----------+---------+----------+-----+-----+-----+------+------+-----------------+------------------+---+---+----------+-----+----+
only showing top 2 rows


[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mest[39m[38;5;245m [39m[38;5;245m=[39m[38;5;245m [39m[38;5;247mm[39m[38;5;245m:[39m[38;5;245m [39m[38;5;100mNone[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mac[39m[38;5;245m:[39m[38;5;245m [39m[38;5;247mGBTRegressor_57cf7b8a6b6d[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m-[39m[38;5;245m:[39m[38;5;245m [39m[38;5;245m[[39m[38;5;36m'[39m[38;5;36mlag_1[39m[38;5;36m'[39m[38;5;245m,[39m
[38;5;245m                   [39m[38;5;36m'[39m[38;5;36mlag_7[39m[38;5;36m'[39m[38;5;245m,[39m
[38;5;245m                   [39m[38;5;36m'[39m[38;5;36mlag_14[39m[38;5;36m'[39m[38;5;245m,[39m
[38;5;245m                   [39m[38;5;36m'[39m[38;5;36mlag_28[39m

+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
|lag_1|lag_7|lag_14|lag_28|ma_7|            ma_28|dow|dom|weekofyear|month|year|
+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
|  3.0|  3.0|   2.0|   3.0| 3.0|             3.25|  7|  7|        36|    9|2013|
|  4.0|  3.0|   3.0|   6.0| 3.0|3.142857142857143|  3|  1|        40|   10|2013|
+-----+-----+------+------+----+-----------------+---+---+----------+-----+----+
only showing top 2 rows


[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mtration[39m[38;5;245m:[39m[38;5;245m [39m[38;5;247mtrain[39m[38;5;245m [39m[38;5;166m→[39m[38;5;245m [39m[38;5;247mforecast[39m[38;5;245m [39m[38;5;166m→[39m[38;5;245m [39m[38;5;247mevalua[39m[38;5;245m:[39m[38;5;245m [39m[38;5;100mNone[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mecast[39m[38;5;245m([39m[38;5;247mdf[39m[38;5;245m:[39m[38;5;245m [39m[38;5;247mDataF[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36msales[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;245m[[39m[38;5;36m"[39m[38;5;36mdata[39m[38;5;36m"[39m[38;5;245m][39m
[38;5;245m    [39m[38;5;247ms[39m[38;5;245m:[39m[38;5;245m [39m[38;5;245m[[39m[38;5;36m'[39m[38;5;36mfamily[39m[38;5;36m'[39m[38;5;245m,[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36mstore_nbr[39m[38;5;36m'[39m[38;5;245m][39m
[38;5;247mic[39m[38;5;245m|

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `s` cannot be resolved. Did you mean one of the following? [`y`, `date`, `family`, `store_nbr`, `prediction`]. SQLSTATE: 42703;
'Project [family#1920, store_nbr#1922, date#2091, y#2894, prediction#2890, lag(prediction#2890, -7, null) windowspecdefinition('s, 't, 'o, 'r, 'e, '_, 'n, 'b, 'r, y#2894 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -7, -7)) AS naive#2895]
+- Project [family#1920, store_nbr#1922, date#2091, sales#2068 AS y#2894, prediction#2890]
   +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, prediction#2890]
      +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, family_idx#2847, family_ohe#2855, store_nbr_idx#2865, store_nbr_ohe#2873, features#2886, UDF(features#2886) AS prediction#2890]
         +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, family_idx#2847, family_ohe#2855, store_nbr_idx#2865, store_nbr_ohe#2873, UDF(struct(lag_1, lag_1#2092, lag_7, lag_7#2093, lag_14, lag_14#2094, lag_28, lag_28#2095, ma_7, ma_7#2096, ma_28, ma_28#2098, dow_double_VectorAssembler_c4ff664fe0cc, cast(dow#2100 as double), dom_double_VectorAssembler_c4ff664fe0cc, cast(dom#2101 as double), weekofyear_double_VectorAssembler_c4ff664fe0cc, cast(weekofyear#2102 as double), month_double_VectorAssembler_c4ff664fe0cc, cast(month#2103 as double), year_double_VectorAssembler_c4ff664fe0cc, cast(year#2104 as double), family_ohe, family_ohe#2855, store_nbr_ohe, ... 1 more fields)) AS features#2886]
            +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, family_idx#2847, family_ohe#2855, store_nbr_idx#2865, UDF(cast(store_nbr_idx#2865 as double), 0) AS store_nbr_ohe#2873]
               +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, family_idx#2847, family_ohe#2855, UDF(cast(store_nbr#1922 as string)) AS store_nbr_idx#2865]
                  +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, family_idx#2847, UDF(cast(family_idx#2847 as double), 0) AS family_ohe#2855]
                     +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, UDF(cast(family#1920 as string)) AS family_idx#2847]
                        +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104]
                           +- Filter (rnk#2161 > (max_rnk#2163 - 28))
                              +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, rnk#2161, max_rnk#2163]
                                 +- Join Inner, ((family#1920 = family#2184) AND (store_nbr#1922 = store_nbr#2186))
                                    :- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, rnk#2161]
                                    :  +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, rnk#2161, rnk#2161]
                                    :     +- Window [row_number() windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rnk#2161], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :        +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104]
                                    :           +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104]
                                    :              +- Join Inner, ((family#1920 = family#2127) AND (store_nbr#1922 = store_nbr#2129))
                                    :                 :- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104]
                                    :                 :  +- Filter (_row_num#2105 > 28)
                                    :                 :     +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, _row_num#2105]
                                    :                 :        +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104, _row_num#2105, _row_num#2105]
                                    :                 :           +- Window [row_number() windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _row_num#2105], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :              +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year#2104]
                                    :                 :                 +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month#2103, year(date#2091) AS year#2104]
                                    :                 :                    +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear#2102, month(date#2091) AS month#2103]
                                    :                 :                       +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dom#2101, weekofyear(date#2091) AS weekofyear#2102]
                                    :                 :                          +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dow#2100, dayofmonth(date#2091) AS dom#2101]
                                    :                 :                             +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, dayofweek(date#2091) AS dow#2100]
                                    :                 :                                +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098]
                                    :                 :                                   +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_28#2098, ma_28#2098]
                                    :                 :                                      +- Window [avg(sales#2068) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -27, currentrow$())) AS ma_28#2098], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                         +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096]
                                    :                 :                                            +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096]
                                    :                 :                                               +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, ma_7#2096, ma_7#2096]
                                    :                 :                                                  +- Window [avg(sales#2068) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -6, currentrow$())) AS ma_7#2096], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                                     +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095]
                                    :                 :                                                        +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095]
                                    :                 :                                                           +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_28#2095, lag_28#2095]
                                    :                 :                                                              +- Window [lag(sales#2068, -28, null) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -28, -28)) AS lag_28#2095], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                                                 +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094]
                                    :                 :                                                                    +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094]
                                    :                 :                                                                       +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_14#2094, lag_14#2094]
                                    :                 :                                                                          +- Window [lag(sales#2068, -14, null) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -14, -14)) AS lag_14#2094], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                                                             +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093]
                                    :                 :                                                                                +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093]
                                    :                 :                                                                                   +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_7#2093, lag_7#2093]
                                    :                 :                                                                                      +- Window [lag(sales#2068, -7, null) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -7, -7)) AS lag_7#2093], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                                                                         +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092]
                                    :                 :                                                                                            +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092]
                                    :                 :                                                                                               +- Project [family#1920, store_nbr#1922, date#2091, sales#2068, lag_1#2092, lag_1#2092]
                                    :                 :                                                                                                  +- Window [lag(sales#2068, -1, null) windowspecdefinition(family#1920, store_nbr#1922, date#2091 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag_1#2092], [family#1920, store_nbr#1922], [date#2091 ASC NULLS FIRST]
                                    :                 :                                                                                                     +- Project [family#1920, store_nbr#1922, date#2091, sales#2068]
                                    :                 :                                                                                                        +- Project [family#1920, store_nbr#1922, to_date(date#1939, None, Some(America/New_York), true) AS date#2091, sales#2068]
                                    :                 :                                                                                                           +- Aggregate [family#1920, store_nbr#1922, date#1939], [family#1920, store_nbr#1922, date#1939, sum(sales#1923) AS sales#2068]
                                    :                 :                                                                                                              +- Sample 0.0, 0.1, false, 1324807075557759195
                                    :                 :                                                                                                                 +- Project [type#1917, state#1918, city#1919, family#1920, to_date(date#1921, None, Some(America/New_York), true) AS date#1939, store_nbr#1922, sales#1923, onpromotion#1924, cluster#1925, is_holiday#1926, dcoilwtico#1927, hash_features#1928, strIndxer_family#1929L, strIndxer_city#1930L, strIndxer_state#1931L, strIndxer_type#1932L, day_of_week#1933, day_of_month#1934, month#1935, year#1936, is_salary_day#1937]
                                    :                 :                                                                                                                    +- Relation [type#1917,state#1918,city#1919,family#1920,date#1921,store_nbr#1922,sales#1923,onpromotion#1924,cluster#1925,is_holiday#1926,dcoilwtico#1927,hash_features#1928,strIndxer_family#1929L,strIndxer_city#1930L,strIndxer_state#1931L,strIndxer_type#1932L,day_of_week#1933,day_of_month#1934,month#1935,year#1936,is_salary_day#1937] parquet
                                    :                 +- Project [family#2127, store_nbr#2129]
                                    :                    +- Filter (n#2107L >= cast(84 as bigint))
                                    :                       +- Aggregate [family#2127, store_nbr#2129], [family#2127, store_nbr#2129, count(1) AS n#2107L]
                                    :                          +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month#2157, year#2158]
                                    :                             +- Filter (_row_num#2159 > 28)
                                    :                                +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month#2157, year#2158, _row_num#2159]
                                    :                                   +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month#2157, year#2158, _row_num#2159, _row_num#2159]
                                    :                                      +- Window [row_number() windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _row_num#2159], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                         +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month#2157, year#2158]
                                    :                                            +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month#2157, year(date#2147) AS year#2158]
                                    :                                               +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear#2156, month(date#2147) AS month#2157]
                                    :                                                  +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dom#2155, weekofyear(date#2147) AS weekofyear#2156]
                                    :                                                     +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dow#2154, dayofmonth(date#2147) AS dom#2155]
                                    :                                                        +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, dayofweek(date#2147) AS dow#2154]
                                    :                                                           +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153]
                                    :                                                              +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_28#2153, ma_28#2153]
                                    :                                                                 +- Window [avg(sales#2146) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -27, currentrow$())) AS ma_28#2153], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                    +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152]
                                    :                                                                       +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152]
                                    :                                                                          +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, ma_7#2152, ma_7#2152]
                                    :                                                                             +- Window [avg(sales#2146) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -6, currentrow$())) AS ma_7#2152], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                                +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151]
                                    :                                                                                   +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151]
                                    :                                                                                      +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_28#2151, lag_28#2151]
                                    :                                                                                         +- Window [lag(sales#2146, -28, null) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -28, -28)) AS lag_28#2151], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                                            +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150]
                                    :                                                                                               +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150]
                                    :                                                                                                  +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_14#2150, lag_14#2150]
                                    :                                                                                                     +- Window [lag(sales#2146, -14, null) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -14, -14)) AS lag_14#2150], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                                                        +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149]
                                    :                                                                                                           +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149]
                                    :                                                                                                              +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_7#2149, lag_7#2149]
                                    :                                                                                                                 +- Window [lag(sales#2146, -7, null) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -7, -7)) AS lag_7#2149], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                                                                    +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148]
                                    :                                                                                                                       +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148]
                                    :                                                                                                                          +- Project [family#2127, store_nbr#2129, date#2147, sales#2146, lag_1#2148, lag_1#2148]
                                    :                                                                                                                             +- Window [lag(sales#2146, -1, null) windowspecdefinition(family#2127, store_nbr#2129, date#2147 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag_1#2148], [family#2127, store_nbr#2129], [date#2147 ASC NULLS FIRST]
                                    :                                                                                                                                +- Project [family#2127, store_nbr#2129, date#2147, sales#2146]
                                    :                                                                                                                                   +- Project [family#2127, store_nbr#2129, to_date(date#2145, None, Some(America/New_York), true) AS date#2147, sales#2146]
                                    :                                                                                                                                      +- Aggregate [family#2127, store_nbr#2129, date#2145], [family#2127, store_nbr#2129, date#2145, sum(sales#2130) AS sales#2146]
                                    :                                                                                                                                         +- Sample 0.0, 0.1, false, 1324807075557759195
                                    :                                                                                                                                            +- Project [type#2124, state#2125, city#2126, family#2127, to_date(date#2128, None, Some(America/New_York), true) AS date#2145, store_nbr#2129, sales#2130, onpromotion#2131, cluster#2132, is_holiday#2133, dcoilwtico#2134, hash_features#2135, strIndxer_family#2136L, strIndxer_city#2137L, strIndxer_state#2138L, strIndxer_type#2139L, day_of_week#2140, day_of_month#2141, month#2142, year#2143, is_salary_day#2144]
                                    :                                                                                                                                               +- Relation [type#2124,state#2125,city#2126,family#2127,date#2128,store_nbr#2129,sales#2130,onpromotion#2131,cluster#2132,is_holiday#2133,dcoilwtico#2134,hash_features#2135,strIndxer_family#2136L,strIndxer_city#2137L,strIndxer_state#2138L,strIndxer_type#2139L,day_of_week#2140,day_of_month#2141,month#2142,year#2143,is_salary_day#2144] parquet
                                    +- Aggregate [family#2184, store_nbr#2186], [family#2184, store_nbr#2186, max(rnk#2254) AS max_rnk#2163]
                                       +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215, rnk#2254]
                                          +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215, rnk#2254, rnk#2254]
                                             +- Window [row_number() windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rnk#2254], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215]
                                                   +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215]
                                                      +- Join Inner, ((family#2184 = family#2220) AND (store_nbr#2186 = store_nbr#2222))
                                                         :- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215]
                                                         :  +- Filter (_row_num#2216 > 28)
                                                         :     +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215, _row_num#2216]
                                                         :        +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215, _row_num#2216, _row_num#2216]
                                                         :           +- Window [row_number() windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _row_num#2216], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :              +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year#2215]
                                                         :                 +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month#2214, year(date#2204) AS year#2215]
                                                         :                    +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear#2213, month(date#2204) AS month#2214]
                                                         :                       +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dom#2212, weekofyear(date#2204) AS weekofyear#2213]
                                                         :                          +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dow#2211, dayofmonth(date#2204) AS dom#2212]
                                                         :                             +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, dayofweek(date#2204) AS dow#2211]
                                                         :                                +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210]
                                                         :                                   +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_28#2210, ma_28#2210]
                                                         :                                      +- Window [avg(sales#2203) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -27, currentrow$())) AS ma_28#2210], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                         +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209]
                                                         :                                            +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209]
                                                         :                                               +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, ma_7#2209, ma_7#2209]
                                                         :                                                  +- Window [avg(sales#2203) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -6, currentrow$())) AS ma_7#2209], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                                     +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208]
                                                         :                                                        +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208]
                                                         :                                                           +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_28#2208, lag_28#2208]
                                                         :                                                              +- Window [lag(sales#2203, -28, null) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -28, -28)) AS lag_28#2208], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                                                 +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207]
                                                         :                                                                    +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207]
                                                         :                                                                       +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_14#2207, lag_14#2207]
                                                         :                                                                          +- Window [lag(sales#2203, -14, null) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -14, -14)) AS lag_14#2207], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                                                             +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206]
                                                         :                                                                                +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206]
                                                         :                                                                                   +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_7#2206, lag_7#2206]
                                                         :                                                                                      +- Window [lag(sales#2203, -7, null) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -7, -7)) AS lag_7#2206], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                                                                         +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205]
                                                         :                                                                                            +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205]
                                                         :                                                                                               +- Project [family#2184, store_nbr#2186, date#2204, sales#2203, lag_1#2205, lag_1#2205]
                                                         :                                                                                                  +- Window [lag(sales#2203, -1, null) windowspecdefinition(family#2184, store_nbr#2186, date#2204 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag_1#2205], [family#2184, store_nbr#2186], [date#2204 ASC NULLS FIRST]
                                                         :                                                                                                     +- Project [family#2184, store_nbr#2186, date#2204, sales#2203]
                                                         :                                                                                                        +- Project [family#2184, store_nbr#2186, to_date(date#2202, None, Some(America/New_York), true) AS date#2204, sales#2203]
                                                         :                                                                                                           +- Aggregate [family#2184, store_nbr#2186, date#2202], [family#2184, store_nbr#2186, date#2202, sum(sales#2187) AS sales#2203]
                                                         :                                                                                                              +- Sample 0.0, 0.1, false, 1324807075557759195
                                                         :                                                                                                                 +- Project [type#2181, state#2182, city#2183, family#2184, to_date(date#2185, None, Some(America/New_York), true) AS date#2202, store_nbr#2186, sales#2187, onpromotion#2188, cluster#2189, is_holiday#2190, dcoilwtico#2191, hash_features#2192, strIndxer_family#2193L, strIndxer_city#2194L, strIndxer_state#2195L, strIndxer_type#2196L, day_of_week#2197, day_of_month#2198, month#2199, year#2200, is_salary_day#2201]
                                                         :                                                                                                                    +- Relation [type#2181,state#2182,city#2183,family#2184,date#2185,store_nbr#2186,sales#2187,onpromotion#2188,cluster#2189,is_holiday#2190,dcoilwtico#2191,hash_features#2192,strIndxer_family#2193L,strIndxer_city#2194L,strIndxer_state#2195L,strIndxer_type#2196L,day_of_week#2197,day_of_month#2198,month#2199,year#2200,is_salary_day#2201] parquet
                                                         +- Project [family#2220, store_nbr#2222]
                                                            +- Filter (n#2253L >= cast(84 as bigint))
                                                               +- Aggregate [family#2220, store_nbr#2222], [family#2220, store_nbr#2222, count(1) AS n#2253L]
                                                                  +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month#2250, year#2251]
                                                                     +- Filter (_row_num#2252 > 28)
                                                                        +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month#2250, year#2251, _row_num#2252]
                                                                           +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month#2250, year#2251, _row_num#2252, _row_num#2252]
                                                                              +- Window [row_number() windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS _row_num#2252], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                 +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month#2250, year#2251]
                                                                                    +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month#2250, year(date#2240) AS year#2251]
                                                                                       +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear#2249, month(date#2240) AS month#2250]
                                                                                          +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dom#2248, weekofyear(date#2240) AS weekofyear#2249]
                                                                                             +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dow#2247, dayofmonth(date#2240) AS dom#2248]
                                                                                                +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, dayofweek(date#2240) AS dow#2247]
                                                                                                   +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246]
                                                                                                      +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_28#2246, ma_28#2246]
                                                                                                         +- Window [avg(sales#2239) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -27, currentrow$())) AS ma_28#2246], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                            +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245]
                                                                                                               +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245]
                                                                                                                  +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, ma_7#2245, ma_7#2245]
                                                                                                                     +- Window [avg(sales#2239) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -6, currentrow$())) AS ma_7#2245], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                                        +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244]
                                                                                                                           +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244]
                                                                                                                              +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_28#2244, lag_28#2244]
                                                                                                                                 +- Window [lag(sales#2239, -28, null) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -28, -28)) AS lag_28#2244], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                                                    +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243]
                                                                                                                                       +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243]
                                                                                                                                          +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_14#2243, lag_14#2243]
                                                                                                                                             +- Window [lag(sales#2239, -14, null) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -14, -14)) AS lag_14#2243], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                                                                +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242]
                                                                                                                                                   +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242]
                                                                                                                                                      +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_7#2242, lag_7#2242]
                                                                                                                                                         +- Window [lag(sales#2239, -7, null) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -7, -7)) AS lag_7#2242], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                                                                            +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241]
                                                                                                                                                               +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241]
                                                                                                                                                                  +- Project [family#2220, store_nbr#2222, date#2240, sales#2239, lag_1#2241, lag_1#2241]
                                                                                                                                                                     +- Window [lag(sales#2239, -1, null) windowspecdefinition(family#2220, store_nbr#2222, date#2240 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag_1#2241], [family#2220, store_nbr#2222], [date#2240 ASC NULLS FIRST]
                                                                                                                                                                        +- Project [family#2220, store_nbr#2222, date#2240, sales#2239]
                                                                                                                                                                           +- Project [family#2220, store_nbr#2222, to_date(date#2238, None, Some(America/New_York), true) AS date#2240, sales#2239]
                                                                                                                                                                              +- Aggregate [family#2220, store_nbr#2222, date#2238], [family#2220, store_nbr#2222, date#2238, sum(sales#2223) AS sales#2239]
                                                                                                                                                                                 +- Sample 0.0, 0.1, false, 1324807075557759195
                                                                                                                                                                                    +- Project [type#2217, state#2218, city#2219, family#2220, to_date(date#2221, None, Some(America/New_York), true) AS date#2238, store_nbr#2222, sales#2223, onpromotion#2224, cluster#2225, is_holiday#2226, dcoilwtico#2227, hash_features#2228, strIndxer_family#2229L, strIndxer_city#2230L, strIndxer_state#2231L, strIndxer_type#2232L, day_of_week#2233, day_of_month#2234, month#2235, year#2236, is_salary_day#2237]
                                                                                                                                                                                       +- Relation [type#2217,state#2218,city#2219,family#2220,date#2221,store_nbr#2222,sales#2223,onpromotion#2224,cluster#2225,is_holiday#2226,dcoilwtico#2227,hash_features#2228,strIndxer_family#2229L,strIndxer_city#2230L,strIndxer_state#2231L,strIndxer_type#2232L,day_of_week#2233,day_of_month#2234,month#2235,year#2236,is_salary_day#2237] parquet


In [31]:
# agg_cols = ['family', 'store_nbr']
agg_cols = ['family']

df_agg = aggregate_to_granularity(
    df=df,
    date_col='date',
    target_col='sales',
    group_cols=agg_cols,
    agg="sum",  # sales are additive
    # extra_numeric_aggs={"price": "mean", "promo_spend": "sum"}  # optional
)

df_feat = build_features(
    df=df_agg,
    date_col='date',
    target_col='sales',
    group_cols=agg_cols,
    lags=[1,2],
    mas=[1,2],
    # add_time_signals=cfg["features"]["add_time_signals"]
)
print(df_feat.count())
df_feat.sort(agg_cols + ["date"]).select(agg_cols + ['date', 'sales', 'lag_1', 'lag_2']).show(10)
df_feat.show(10)

55506
+----------+----------+-----+-----+-----+
|    family|      date|sales|lag_1|lag_2|
+----------+----------+-----+-----+-----+
|AUTOMOTIVE|2013-01-03|161.0|255.0|  0.0|
|AUTOMOTIVE|2013-01-04|169.0|161.0|255.0|
|AUTOMOTIVE|2013-01-05|342.0|169.0|161.0|
|AUTOMOTIVE|2013-01-06|360.0|342.0|169.0|
|AUTOMOTIVE|2013-01-07|189.0|360.0|342.0|
|AUTOMOTIVE|2013-01-08|229.0|189.0|360.0|
|AUTOMOTIVE|2013-01-09|164.0|229.0|189.0|
|AUTOMOTIVE|2013-01-10|164.0|164.0|229.0|
|AUTOMOTIVE|2013-01-11|162.0|164.0|164.0|
|AUTOMOTIVE|2013-01-12|280.0|162.0|164.0|
+----------+----------+-----+-----+-----+
only showing top 10 rows
+----------+----------+-----+-----+-----+-----+-----+---+---+----------+-----+----+
|    family|      date|sales|lag_1|lag_2| ma_1| ma_2|dow|dom|weekofyear|month|year|
+----------+----------+-----+-----+-----+-----+-----+---+---+----------+-----+----+
|AUTOMOTIVE|2013-01-03|161.0|255.0|  0.0|161.0|208.0|  5|  3|         1|    1|2013|
|AUTOMOTIVE|2013-01-04|169.0|161.0|255.0|169.

In [36]:
train, test = train_test_split(df=df_feat, date_col='date', group_cols=agg_cols, mode='horizon', test_horizon=28)
train.show(2)
test.show(2)

                                                                                

+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
|    family|store_nbr|      date|sales|dcoilwtico|lag_1|lag_2|ma_1|ma_2|dow|dom|weekofyear|month|year|
+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
|AUTOMOTIVE|       10|2013-01-03|  2.0|     92.97|  3.0|  0.0| 2.0| 2.5|  5|  3|         1|    1|2013|
|AUTOMOTIVE|       10|2013-01-04|  2.0|     93.12|  2.0|  3.0| 2.0| 2.0|  6|  4|         1|    1|2013|
+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
only showing top 2 rows


[Stage 520:>                                                        (0 + 1) / 1]

+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
|    family|store_nbr|      date|sales|dcoilwtico|lag_1|lag_2|ma_1|ma_2|dow|dom|weekofyear|month|year|
+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
|AUTOMOTIVE|       10|2017-07-19|  0.0|      47.1|  0.0|  0.0| 0.0| 0.0|  4| 19|        29|    7|2017|
|AUTOMOTIVE|       10|2017-07-20|  1.0|     46.73|  0.0|  0.0| 1.0| 0.5|  5| 20|        29|    7|2017|
+----------+---------+----------+-----+----------+-----+-----+----+----+---+---+----------+-----+----+
only showing top 2 rows


                                                                                

In [22]:
agg_cols = ['family', 'family']
# agg_cols = ['family']

df_agg = aggregate_to_granularity(
    df=df,
    date_col='date',
    target_col='sales',
    group_cols=agg_cols,
    agg="sum",  # sales are additive
    # extra_numeric_aggs={"price": "mean", "promo_spend": "sum"}  # optional
)

df_feat = build_features(
    df=df_agg,
    date_col='date',
    target_col='sales',
    group_cols=agg_cols,
    lags=[1,2],
    mas=[1,2],
    # add_time_signals=cfg["features"]["add_time_signals"]
)
print(df_feat.count())
df_feat.sort(agg_cols + ["date"]).select(agg_cols + ['date', 'sales', 'lag_1', 'lag_2']).show(10)

55506
+----------+----------+----------+-----+-----+-----+
|    family|    family|      date|sales|lag_1|lag_2|
+----------+----------+----------+-----+-----+-----+
|AUTOMOTIVE|AUTOMOTIVE|2013-01-03|161.0|255.0|  0.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-04|169.0|161.0|255.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-05|342.0|169.0|161.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-06|360.0|342.0|169.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-07|189.0|360.0|342.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-08|229.0|189.0|360.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-09|164.0|229.0|189.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-10|164.0|164.0|229.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-11|162.0|164.0|164.0|
|AUTOMOTIVE|AUTOMOTIVE|2013-01-12|280.0|162.0|164.0|
+----------+----------+----------+-----+-----+-----+
only showing top 10 rows


In [None]:
dfx = build_features(df, "date", "sales", ["family", "store_nbr"], lags=[1, 2], mas=[1])
print(dfx.count())
dfx.sort(["family", "store_nbr", "date"]).select(['family', 'store_nbr', 'date', 'sales', 'lag_1', 'lag_2']).show(10)

                                                                                

2997324




+----------+---------+----------+-----+-----+-----+
|    family|store_nbr|      date|sales|lag_1|lag_2|
+----------+---------+----------+-----+-----+-----+
|AUTOMOTIVE|        1|2013-01-03|  3.0|  2.0|  0.0|
|AUTOMOTIVE|        1|2013-01-04|  3.0|  3.0|  2.0|
|AUTOMOTIVE|        1|2013-01-05|  5.0|  3.0|  3.0|
|AUTOMOTIVE|        1|2013-01-06|  2.0|  5.0|  3.0|
|AUTOMOTIVE|        1|2013-01-07|  0.0|  2.0|  5.0|
|AUTOMOTIVE|        1|2013-01-08|  2.0|  0.0|  2.0|
|AUTOMOTIVE|        1|2013-01-09|  2.0|  2.0|  0.0|
|AUTOMOTIVE|        1|2013-01-10|  2.0|  2.0|  2.0|
|AUTOMOTIVE|        1|2013-01-11|  3.0|  2.0|  2.0|
|AUTOMOTIVE|        1|2013-01-12|  2.0|  3.0|  2.0|
+----------+---------+----------+-----+-----+-----+
only showing top 10 rows


                                                                                

In [23]:
dfx = build_features(df, "date", "sales", ["family"], lags=[1, 2], mas=[1])
print(dfx.count())
dfx.sort(["family", "store_nbr", "date"]).select(['family', 'store_nbr', 'date', 'sales', 'lag_1', 'lag_2']).show(10)

3000822




+----------+---------+----------+-----+-----+-----+
|    family|store_nbr|      date|sales|lag_1|lag_2|
+----------+---------+----------+-----+-----+-----+
|AUTOMOTIVE|        1|2013-01-01|  0.0|  0.0|  0.0|
|AUTOMOTIVE|        1|2013-01-02|  2.0| 11.0|  0.0|
|AUTOMOTIVE|        1|2013-01-03|  3.0|  0.0|  5.0|
|AUTOMOTIVE|        1|2013-01-04|  3.0|  0.0|  1.0|
|AUTOMOTIVE|        1|2013-01-05|  5.0|  0.0|  6.0|
|AUTOMOTIVE|        1|2013-01-06|  2.0|  7.0|  5.0|
|AUTOMOTIVE|        1|2013-01-07|  0.0|  5.0|  7.0|
|AUTOMOTIVE|        1|2013-01-08|  2.0|  2.0| 14.0|
|AUTOMOTIVE|        1|2013-01-09|  2.0|  7.0|  0.0|
|AUTOMOTIVE|        1|2013-01-10|  2.0| 12.0|  3.0|
+----------+---------+----------+-----+-----+-----+
only showing top 10 rows


                                                                                

In [None]:
# Load config from YAML or dict
import yaml, json
cfg = yaml.safe_load(open("forecast_config.yaml"))

out = run_forecast(df_feat, cfg)

display(out["predictions"])         # per-group predictions on test window
# display(out["metrics_portfolio"])   # wMAPE, sMAPE, MASE overall
# display(out["metrics_by_series"])   # same metrics by series
# display(out["backtest"])            # rolling-origin backtest summary (if enabled)


