# M3 Monthly Data: Load + Initial Cleaning

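This notebook loads the M3 monthly actual values (`TSTS`) and forecaster values (`FTS`) with the project loading module, then runs a minimal first-pass cleaning: it keeps only the MACRO series, aligns actuals with forecasts by horizon, and builds a sample series-by-horizon matrix.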

In [1]:
import sys
from pathlib import Path

import pandas as pd

# Make the project root importable so the `src.*` imports below resolve.
# When the kernel's working directory is named 'analyses', the root is taken
# to be its parent; otherwise the working directory itself is used.
PROJECT_ROOT = Path.cwd().resolve()
if Path.cwd().name == 'analyses':
    PROJECT_ROOT = PROJECT_ROOT.parent

# Prepend only once so re-running this cell does not grow sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.data.loading import (
    DEFAULT_M3_QUARTERLY_ACTUALS_CSV_PATH,
    DEFAULT_M3_QUARTERLY_FORECASTS_CSV_PATH,
    load_or_download_m3_quarterly_actuals,
    load_or_download_m3_quarterly_forecasts,
)
from src.data.cleaning import (
    align_m3_monthly_actuals_and_forecasts,
    build_m3_series_horizon_matrix,
    prepare_m3_monthly_data,
)

# The loader keeps its QUARTERLY-named constants for backward compatibility;
# rebind them under monthly names so the rest of this notebook reads clearly.
M3_MONTHLY_ACTUALS_CSV_PATH, M3_MONTHLY_FORECASTS_CSV_PATH = (
    DEFAULT_M3_QUARTERLY_ACTUALS_CSV_PATH,
    DEFAULT_M3_QUARTERLY_FORECASTS_CSV_PATH,
)


In [2]:
# Read the raw actuals and forecasts tables through the project loaders
# (the function names say "quarterly", but the cached files are the monthly
# CSVs printed below).
actuals_raw = load_or_download_m3_quarterly_actuals()
forecasts_raw = load_or_download_m3_quarterly_forecasts()

# Report where the cached CSVs live and how large each table is.
print('Cached actuals path:', M3_MONTHLY_ACTUALS_CSV_PATH)
print('Cached forecasts path:', M3_MONTHLY_FORECASTS_CSV_PATH)
print('actuals_raw shape:', actuals_raw.shape)
print('forecasts_raw shape:', forecasts_raw.shape)

# Preview the first rows of each table, actuals first.
display(actuals_raw.head(), forecasts_raw.head())


Cached actuals path: /home/clayt/Ensemble-Forecasting/data/M3_monthly_TSTS.csv
Cached forecasts path: /home/clayt/Ensemble-Forecasting/data/M3_monthly_FTS.csv
actuals_raw shape: (167562, 4)
forecasts_raw shape: (616896, 6)


Unnamed: 0,series_id,category,value,timestamp
0,M1,MICRO,2640.0,1990-01
1,M1,MICRO,2640.0,1990-02
2,M1,MICRO,2160.0,1990-03
3,M1,MICRO,4200.0,1990-04
4,M1,MICRO,3360.0,1990-05


Unnamed: 0,series_id,method_id,forecast,horizon,timestamp,origin_timestamp
0,M1,NAIVE2,2400.0,1,1994-03,1994-02
1,M1,NAIVE2,2400.0,2,1994-04,1994-02
2,M1,NAIVE2,2400.0,3,1994-05,1994-02
3,M1,NAIVE2,2400.0,4,1994-06,1994-02
4,M1,NAIVE2,2400.0,5,1994-07,1994-02


In [3]:
# Restrict both tables to the MACRO category, then pair every forecast with
# its matching actual value in a single long-format frame.
actuals_macro, forecasts_macro, macro_series_ids = prepare_m3_monthly_data(
    actuals_raw,
    forecasts_raw,
    category='MACRO',
)
aligned_long = align_m3_monthly_actuals_and_forecasts(
    actuals_macro,
    forecasts_macro,
)

# Size summary for each stage of the pipeline.
print('Macro series count:', len(macro_series_ids))
print('actuals_macro shape:', actuals_macro.shape)
print('forecasts_macro shape:', forecasts_macro.shape)
print('aligned_long shape:', aligned_long.shape)
# Mean of the boolean `horizon_consistent` column, i.e. the share of rows
# flagged True; 1.0 means every row is flagged consistent.
print('Horizon consistency share:', aligned_long['horizon_consistent'].mean())

# Preview each frame in pipeline order.
display(actuals_macro.head(), forecasts_macro.head(), aligned_long.head())


Macro series count: 312
actuals_macro shape: (40835, 6)
forecasts_macro shape: (134784, 8)
aligned_long shape: (134784, 9)
Horizon consistency share: 1.0


Unnamed: 0,series_id,category,value,timestamp,actual,period
115764,M1000,MACRO,3705.4,1983-01,3705.4,1983-01
115765,M1000,MACRO,3726.0,1983-02,3726.0,1983-02
115766,M1000,MACRO,3692.0,1983-03,3692.0,1983-03
115767,M1000,MACRO,3721.6,1983-04,3721.6,1983-04
115768,M1000,MACRO,3681.0,1983-05,3681.0,1983-05


Unnamed: 0,series_id,method_id,forecast,horizon,timestamp,origin_timestamp,target_period,origin_period
431964,M1000,AAM1,4556.58,1,1992-09,1992-08,1992-09,1992-08
431965,M1000,AAM1,4571.48,2,1992-10,1992-08,1992-10,1992-08
431966,M1000,AAM1,4577.8,3,1992-11,1992-08,1992-11,1992-08
431967,M1000,AAM1,4586.75,4,1992-12,1992-08,1992-12,1992-08
431968,M1000,AAM1,4594.23,5,1993-01,1992-08,1993-01,1992-08


Unnamed: 0,series_id,method_id,horizon,origin_period,target_period,expected_target_period,horizon_consistent,forecast,actual
0,M1000,AAM1,1,1992-08,1992-09,1992-09,True,4556.58,4580.6
1,M1000,AAM1,2,1992-08,1992-10,1992-10,True,4571.48,4563.4
2,M1000,AAM1,3,1992-08,1992-11,1992-11,True,4577.8,4551.8
3,M1000,AAM1,4,1992-08,1992-12,1992-12,True,4586.75,4577.4
4,M1000,AAM1,5,1992-08,1993-01,1993-01,True,4594.23,4592.4


In [4]:
# Pick one example to demonstrate the matrix builder: the first MACRO series
# id and the smallest non-null horizon present in the aligned frame.
sample_series_id = str(macro_series_ids[0])
sample_horizon = int(
    aligned_long['horizon']
    .dropna()
    .astype(int)
    .min()
)

# Build the matrix for that (series, horizon) pair, requesting rows with an
# actual via require_actual=True.
series_h_matrix = build_m3_series_horizon_matrix(
    aligned_df=aligned_long,
    series_id=sample_series_id,
    horizon=sample_horizon,
    require_actual=True,
)

print('sample_series_id:', sample_series_id)
print('sample_horizon:', sample_horizon)
print('series_h_matrix shape:', series_h_matrix.shape)
display(series_h_matrix.head())

# How to use this matrix in ensemble experiments:
# - target:  series_h_matrix['actual']
# - experts: the per-method columns, i.e. every column other than
#            origin_period, target_period and actual


sample_series_id: M1000
sample_horizon: 1
series_h_matrix shape: (1, 27)


Unnamed: 0,origin_period,actual,target_period,AAM1,AAM2,ARARMA,Auto-ANN,AutoBox1,AutoBox2,AutoBox3,...,HOLT,NAIVE2,PP-Autocast,RBF,ROBUST-Trend,SINGLE,SMARTFCS,THETA,THETAsm,WINTER
0,1992-08,4580.6,1992-09,4556.58,4556.98,4564.44,4538.85,4566.15,4563.62,4564.34,...,4559.17,4554.6,4552.5,4535.56,4565.69,4554.6,4554.6,4560.43,4548.91,4559.17
