# Module 1.08: Preparing Data for Forecasting - Timeline Engineering

> **Goal:** Fill gaps, impute missing values, merge calendar → forecast-ready dataset.

---
## 1. Setup

In [1]:
# --- Imports ---
import sys
import os
from pathlib import Path
import pandas as pd
from utilsforecast.preprocessing import fill_gaps
from dtype_diet import optimize_dtypes, report_on_dataframe
import forecast_foundations as ff
import tsforge as tsf

# --- Settings ---

# Project Root Setup
# Walk upward from the current working directory and stop at the first folder
# that contains any of the marker files/directories listed below.
markers = ('.git', 'pyproject.toml', '.project-root')
p = Path.cwd().resolve()


def _has_marker(folder):
    """Return True when `folder` directly contains at least one root marker."""
    return any(folder.joinpath(name).exists() for name in markers)


# Fall back to the starting directory when no ancestor carries a marker.
PROJECT_ROOT = next(filter(_has_marker, (p, *p.parents)), p)

# Run the rest of the notebook relative to the project root and put the root
# at the front of the import path (only once).
os.chdir(PROJECT_ROOT)
root_str = str(PROJECT_ROOT)
if root_str not in sys.path:
    sys.path.insert(0, root_str)

# Data Directory
DATA_DIR = PROJECT_ROOT.joinpath('data')
OUTPUT_DIR = DATA_DIR.joinpath('output')

---
## 2. Load Data

### 2.1 Load from 1.06

In [2]:
# What did we get from first contact?
# Read the parquet file produced by the 1.06 notebook from the output directory.
first_contact_path = OUTPUT_DIR.joinpath('1.06_first_contact_output.parquet')
weekly_sales = pd.read_parquet(first_contact_path)
weekly_sales

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,unique_id,ds,y
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_1_001_CA_1,2013-07-14,1.0
1,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_1_001_CA_1,2013-07-28,2.0
2,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_1_001_CA_1,2013-08-04,2.0
3,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_1_001_CA_1,2013-08-11,6.0
4,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,HOBBIES_1_001_CA_1,2013-08-18,1.0
...,...,...,...,...,...,...,...,...
6312400,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,FOODS_3_827_WI_3,2016-05-22,9.0
6312401,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,FOODS_3_827_WI_3,2016-05-29,11.0
6312402,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,FOODS_3_827_WI_3,2016-06-05,10.0
6312403,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,FOODS_3_827_WI_3,2016-06-12,19.0


### 2.2 Preserve metadata

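Only the core columns (`unique_id`, `ds`, `y`) are passed to `fill_gaps`, and the rows it inserts would carry no hierarchy values in any case. Store the hierarchy now, rejoin it later.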

In [3]:
# What columns will we lose?
# Keep only the hierarchy columns that are actually present in weekly_sales.
candidate_cols = ['item_id', 'store_id', 'dept_id', 'cat_id', 'state_id']
hierarchy_cols = [col for col in candidate_cols if col in weekly_sales.columns]

# One lookup row per series: the first occurrence of each unique_id is kept.
lookup_cols = ['unique_id', *hierarchy_cols]
hierarchy_df = weekly_sales[lookup_cols].drop_duplicates(subset='unique_id')
hierarchy_df.head()

Unnamed: 0,unique_id,item_id,store_id,dept_id,cat_id,state_id
0,HOBBIES_1_001_CA_1,HOBBIES_1_001,CA_1,HOBBIES_1,HOBBIES,CA
151,HOBBIES_1_001_CA_2,HOBBIES_1_001,CA_2,HOBBIES_1,HOBBIES,CA
301,HOBBIES_1_001_CA_3,HOBBIES_1_001,CA_3,HOBBIES_1,HOBBIES,CA
451,HOBBIES_1_001_CA_4,HOBBIES_1_001,CA_4,HOBBIES_1,HOBBIES,CA
604,HOBBIES_1_001_TX_1,HOBBIES_1_001,TX_1,HOBBIES_1,HOBBIES,TX


---

<div style="text-align: center;">

## 3. `Q3: Cadence` — Defines the Time Grid

<div style="background: linear-gradient(135deg, #1d1f56 0%, #2d42a7 100%); color: white; padding: 12px 20px; border-radius: 8px; margin: 10px auto; max-width: 600px;">
<strong>Are the time intervals regular and complete?</strong><br>
<em>Gaps break lag features and corrupt rolling calculations.</em>
</div>

</div>

### 3.1 Diagnose gaps

In [4]:
# How many series have gaps?
# Run the per-series date diagnostics, then count the series whose n_gaps is positive.
diag_args = dict(df=weekly_sales, id_col="unique_id", date_col="ds", target_col="y")
date_diag = tsf.datetime_diagnostics(**diag_args)

has_gaps = date_diag['n_gaps'] > 0
n_series_with_gaps = has_gaps.sum()
n_series_with_gaps

0

### 3.2 Detect frequency

In [5]:
# What frequency should we fill at?
# Infer it from the sorted set of distinct timestamps across all series.
unique_dates = weekly_sales['ds'].drop_duplicates().sort_values()
freq = pd.infer_freq(unique_dates)

# pd.infer_freq returns None when the dates do not follow one regular frequency.
# Fail here with a clear message instead of handing None to the gap-filling step.
if freq is None:
    raise ValueError(
        "Could not infer a regular frequency from weekly_sales['ds']; "
        "set `freq` explicitly (e.g. 'W-SUN') before filling gaps."
    )
freq

'W-SUN'

### 3.3 Fill gaps

In [6]:
# Row count before fill_gaps (baseline for the post-fill comparison)
len(weekly_sales)

6312405

In [7]:
# Only the id, timestamp and target columns are handed to fill_gaps.
core_cols = ['unique_id', 'ds', 'y']
weekly_filled = fill_gaps(weekly_sales[core_cols], freq=freq)

# Row count after filling gaps; compare with the pre-fill count.
len(weekly_filled)

6848887

In [8]:
# Order rows by series then date, and renumber the index from 0.
weekly_filled = weekly_filled.sort_values(by=['unique_id', 'ds'], ignore_index=True)

In [9]:
# Flag gaps before imputation (for traceability)
# A row is flagged (1) when its target y is NaN at this point, 0 otherwise.
# Guard on the column: re-running this cell after y has been zero-filled would
# otherwise overwrite the flags with all zeros.
if 'is_gap' not in weekly_filled.columns:
    weekly_filled['is_gap'] = weekly_filled['y'].isna().astype(int)

# Total number of flagged rows, displayed so the count is visible in the notebook.
n_gaps = weekly_filled['is_gap'].sum()
n_gaps

### 3.4 Restore metadata

In [10]:
# Rejoin hierarchy columns
# Drop any hierarchy columns left from a previous run of this cell so the merge
# does not create *_x / *_y duplicates. validate='many_to_one' makes pandas raise
# if hierarchy_df has more than one row per unique_id (which would fan out rows).
weekly_filled = (
    weekly_filled
    .drop(columns=hierarchy_cols, errors='ignore')
    .merge(hierarchy_df, on='unique_id', how='left', validate='many_to_one')
)
weekly_filled.head()

Unnamed: 0,unique_id,ds,y,is_gap,item_id,store_id,dept_id,cat_id,state_id
0,FOODS_1_001_CA_1,2011-01-23,3.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA
1,FOODS_1_001_CA_1,2011-01-30,9.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA
2,FOODS_1_001_CA_1,2011-02-06,7.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA
3,FOODS_1_001_CA_1,2011-02-13,8.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA
4,FOODS_1_001_CA_1,2011-02-20,14.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA


---

<div style="text-align: center;">

## 4. `Q1: Target` — Defines What We're Predicting

<div style="background: linear-gradient(135deg, #2d42a7 0%, #3a2f7e 100%); color: white; padding: 12px 20px; border-radius: 8px; margin: 10px auto; max-width: 600px;">
<strong>How do we treat missing target values?</strong><br>
<em>Imputation strategy depends on domain knowledge and business context.</em>
</div>

</div>

Now that we have a complete time grid with `y = NaN` for gap rows, we need to decide **how to fill those NaNs**. This is a business decision, not a technical one.

### 4.1 Check remaining NAs

In [11]:
# How many NAs do we need to fill?
# Counts the rows whose target y is NaN.
weekly_filled['y'].isna().sum()

536482

### 4.2 Impute missing values

See slides for imputation strategies. For retail: missing weeks typically mean zero sales.

In [12]:
# Apply zero fill
# Replace every NaN target with 0; non-missing values are left untouched.
y_missing = weekly_filled['y'].isna()
weekly_filled['y'] = weekly_filled['y'].mask(y_missing, 0)

In [13]:
# Verify no NAs remain
# Same NaN count as before imputation; the expected result here is 0.
weekly_filled['y'].isna().sum()

0

---

<div style="text-align: center;">

## 5. `Q4: Data` — Defines What the Model Learns

<div style="background: linear-gradient(135deg, #1d1f56 0%, #2d42a7 100%); color: white; padding: 12px 20px; border-radius: 8px; margin: 10px auto; max-width: 600px;">
<strong>What features can we safely add without leakage?</strong><br>
<em>Calendar features are known in advance — safe to use for any forecast date.</em>
</div>

</div>

The `calendar.csv` file is **daily**, but our sales are **weekly**. We need to:
1. Determine week alignment (start vs end)
2. Create a matching week column in calendar
3. Aggregate daily features to weekly

### 5.1 Load calendar

In [14]:
# Read the daily calendar file, then parse its date strings into timestamps.
calendar_path = DATA_DIR.joinpath('m5/datasets/calendar.csv')
calendar = pd.read_csv(calendar_path)
calendar['date'] = pd.to_datetime(calendar['date'])

calendar.shape

(1969, 13)

In [15]:
# Peek at the first daily calendar rows.
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,,,,,1,0,1


### 5.2 Align daily to weekly

In [16]:
# How do we match calendar dates to our ds?
# ds falls on Sundays, so map every calendar day back to the Sunday on or before it.
# dt.dayofweek counts Monday=0 ... Sunday=6, hence (dayofweek + 1) % 7 is the
# number of days elapsed since that Sunday (0 for a Sunday itself).
# NOTE(review): calendar.csv's wm_yr_wk appears to start on Saturday (wday=1),
# so this Sunday-based grid may be offset by one day from wm_yr_wk — confirm.
days_since_sunday = (calendar['date'].dt.dayofweek + 1) % 7
calendar['week_start'] = calendar['date'] - pd.to_timedelta(days_since_sunday, unit='D')

calendar[['date', 'week_start', 'weekday']].head(14)

Unnamed: 0,date,week_start,weekday
0,2011-01-29,2011-01-23,Saturday
1,2011-01-30,2011-01-30,Sunday
2,2011-01-31,2011-01-30,Monday
3,2011-02-01,2011-01-30,Tuesday
4,2011-02-02,2011-01-30,Wednesday
5,2011-02-03,2011-01-30,Thursday
6,2011-02-04,2011-01-30,Friday
7,2011-02-05,2011-01-30,Saturday
8,2011-02-06,2011-02-06,Sunday
9,2011-02-07,2011-02-06,Monday


### 5.3 Aggregate calendar

In [17]:
# How do we roll up daily features to weekly?
# Delegates to the project helper, which returns one calendar row per weekly ds.
# NOTE(review): presumably groups on the week_start column added above — confirm
# against ff.aggregate_calendar_to_weekly.
weekly_calendar = ff.aggregate_calendar_to_weekly(calendar)
weekly_calendar.head()

Unnamed: 0,ds,wm_yr_wk,month,year,snap_CA,snap_TX,snap_WI,event_name_1,event_name_2,event_name_3,event_type_1,event_type_2,event_type_3
0,2011-01-23,11101,1,2011,0,0,0,,,,,,
1,2011-01-30,11101,1,2011,1,1,1,,,,,,
2,2011-02-06,11102,2,2011,1,1,1,SuperBowl,,,Sporting,,
3,2011-02-13,11103,2,2011,0,1,1,ValentinesDay,,,Cultural,,
4,2011-02-20,11104,2,2011,0,0,0,PresidentsDay,,,National,,


### 5.4 Merge into sales

In [18]:
# Shape before the merge, for comparison with the shape after it
weekly_filled.shape

(6848887, 9)

In [19]:
# merge sales with calendar on ds col
# Left join keeps every sales row. validate='many_to_one' makes pandas raise if
# weekly_calendar has duplicate ds values, which would otherwise silently
# multiply sales rows.
weekly_df = weekly_filled.merge(
    weekly_calendar, on='ds', how='left', validate='many_to_one'
)

In [20]:
# Did the join work?
# The row count should match the pre-merge shape; only the column count grows.
weekly_df.shape

(6848887, 21)

In [21]:
# Inspect the first merged rows (sales columns followed by calendar columns).
weekly_df.head()

Unnamed: 0,unique_id,ds,y,is_gap,item_id,store_id,dept_id,cat_id,state_id,wm_yr_wk,...,year,snap_CA,snap_TX,snap_WI,event_name_1,event_name_2,event_name_3,event_type_1,event_type_2,event_type_3
0,FOODS_1_001_CA_1,2011-01-23,3.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,0,0,0,,,,,,
1,FOODS_1_001_CA_1,2011-01-30,9.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,1,1,1,,,,,,
2,FOODS_1_001_CA_1,2011-02-06,7.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11102,...,2011,1,1,1,SuperBowl,,,Sporting,,
3,FOODS_1_001_CA_1,2011-02-13,8.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11103,...,2011,0,1,1,ValentinesDay,,,Cultural,,
4,FOODS_1_001_CA_1,2011-02-20,14.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11104,...,2011,0,0,0,PresidentsDay,,,National,,


In [22]:
# Final column check
# Lists every column present in the merged frame.
weekly_df.columns

Index(['unique_id', 'ds', 'y', 'is_gap', 'item_id', 'store_id', 'dept_id',
       'cat_id', 'state_id', 'wm_yr_wk', 'month', 'year', 'snap_CA', 'snap_TX',
       'snap_WI', 'event_name_1', 'event_name_2', 'event_name_3',
       'event_type_1', 'event_type_2', 'event_type_3'],
      dtype='object')

### 5.5 Optimize dtypes

In [23]:
# Deep memory footprint of weekly_df in MB (bytes / 1e6), before dtype optimization
weekly_df.memory_usage(deep=True).sum() / 1e6

1765.16698

In [24]:
# Downcast dtypes after merging to reduce memory usage:
# build the dtype_diet report for the frame first, then apply it.
dtype_report = report_on_dataframe(weekly_df)
weekly_df = optimize_dtypes(weekly_df, dtype_report)

In [25]:
# Deep memory footprint of weekly_df in MB (bytes / 1e6), after dtype optimization
weekly_df.memory_usage(deep=True).sum() / 1e6

263.611824

## 6. Save Output

In [None]:
# Write the prepared table to the output directory, omitting the DataFrame index.
prepared_path = OUTPUT_DIR.joinpath('1.08_data_preparation_output.parquet')
weekly_df.to_parquet(prepared_path, index=False)

: 