# Module 1.09: Diagnostics — The Big Picture 

> **Goal:** Compute time series diagnostics and understand what patterns exist in our portfolio.

**5Q Lens:** Q4 (Data & Drivers) — Measure structure & chaos across the portfolio


---

## 1. Setup

In [1]:
# --- Imports ---
import sys
import os
from pathlib import Path
import warnings
import pandas as pd
from tsforge.eda.ts_features_extension import permutation_entropy,MI_top_k_lags,ADI
from tsfeatures import tsfeatures,lumpiness,stl_features,statistics, series_length
from tsforge.plots import plot_bar, plot_distribution

# --- Settings ---

# Silence every warning category so library chatter does not clutter cell output.
warnings.filterwarnings('ignore')

# Project root: starting at the working directory and walking upward, take the
# first directory that contains one of the marker entries; fall back to the
# working directory itself when no marker is found.
markers = ('.git', 'pyproject.toml', '.project-root')
p = Path.cwd().resolve()
PROJECT_ROOT = next(
    (
        d
        for d in [p, *p.parents]
        if any((d / m).exists() for m in markers)
    ),
    p,
)

# Run from the project root and put it on sys.path (once) so that modules
# located under it can be imported.
os.chdir(PROJECT_ROOT)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Data locations
DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = DATA_DIR / 'output'

---
## 2. Load Data

In [2]:
# Read the weekly panel written by module 1.08, then display it
weekly_df = pd.read_parquet(
    OUTPUT_DIR / '1.08_data_preparation_output.parquet',
)
weekly_df

Unnamed: 0,unique_id,ds,y,is_gap,item_id,store_id,dept_id,cat_id,state_id,wm_yr_wk,...,year,snap_CA,snap_TX,snap_WI,event_name_1,event_name_2,event_name_3,event_type_1,event_type_2,event_type_3
0,FOODS_1_001_CA_1,2011-01-23,3.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,0,0,0,,,,,,
1,FOODS_1_001_CA_1,2011-01-30,9.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,1,1,1,,,,,,
2,FOODS_1_001_CA_1,2011-02-06,7.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11102,...,2011,1,1,1,SuperBowl,,,Sporting,,
3,FOODS_1_001_CA_1,2011-02-13,8.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11103,...,2011,0,1,1,ValentinesDay,,,Cultural,,
4,FOODS_1_001_CA_1,2011-02-20,14.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11104,...,2011,0,0,0,PresidentsDay,,,National,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6848882,HOUSEHOLD_2_516_WI_3,2016-05-22,0.0,1,HOUSEHOLD_2_516,WI_3,HOUSEHOLD_2,HOUSEHOLD,WI,11617,...,2016,0,0,0,,,,,,
6848883,HOUSEHOLD_2_516_WI_3,2016-05-29,0.0,1,HOUSEHOLD_2_516,WI_3,HOUSEHOLD_2,HOUSEHOLD,WI,11618,...,2016,1,1,1,MemorialDay,NBAFinalsStart,,National,Sporting,
6848884,HOUSEHOLD_2_516_WI_3,2016-06-05,0.0,0,HOUSEHOLD_2_516,WI_3,HOUSEHOLD_2,HOUSEHOLD,WI,11619,...,2016,1,1,1,Ramadan starts,,,Religious,,
6848885,HOUSEHOLD_2_516_WI_3,2016-06-12,3.0,0,HOUSEHOLD_2_516,WI_3,HOUSEHOLD_2,HOUSEHOLD,WI,11620,...,2016,0,1,1,,,,,,


In [3]:
# Sanity check: eyeball the first three rows of the loaded panel
weekly_df.head(n=3)

Unnamed: 0,unique_id,ds,y,is_gap,item_id,store_id,dept_id,cat_id,state_id,wm_yr_wk,...,year,snap_CA,snap_TX,snap_WI,event_name_1,event_name_2,event_name_3,event_type_1,event_type_2,event_type_3
0,FOODS_1_001_CA_1,2011-01-23,3.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,0,0,0,,,,,,
1,FOODS_1_001_CA_1,2011-01-30,9.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11101,...,2011,1,1,1,,,,,,
2,FOODS_1_001_CA_1,2011-02-06,7.0,0,FOODS_1_001,CA_1,FOODS_1,FOODS,CA,11102,...,2011,1,1,1,SuperBowl,,,Sporting,,


---

## 3. Compute Diagnostics

### 3.1 Calculate diagnostics from `tsfeatures`

`tsfeatures` extracts dozens of time series characteristics automatically — this is our first systematic look at the portfolio.

In [4]:
# Compute one row of diagnostics per series (unique_id).
# Only the long-format panel columns are handed to tsfeatures: the calendar,
# SNAP and event columns are not used by the feature functions, so passing
# them would only add per-series copying overhead (tsfeatures groups the frame
# by unique_id and works on the `y` values — see the tsfeatures docs).
diagnostics = tsfeatures(
    ts=weekly_df[['unique_id', 'ds', 'y']],
    # Weekly data -> seasonal period of 52 observations per year
    freq=52,
    features=[
        statistics,           # summary statistics of the raw series
        lumpiness,            # variance of variances
        permutation_entropy,  # permutation entropy
        MI_top_k_lags,        # sum of MI over top 5 lags
        stl_features,         # STL decomposition features (trend, seasonal strength)
        ADI,                  # average demand interval
        series_length,        # number of observations
    ],
    # Keep scale=False so statistics are reported in the original units;
    # with scaling on, values are standard-scaled (intended for model training).
    scale=False,
)

### 3.3 Merge Hierarchy Metadata

Attach business dimensions so we can slice diagnostics by department, category, and store.

In [5]:
# Business dimensions attached to every series (assumed constant within a unique_id).
HIERARCHY_COLS = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# One row per series: keep the first occurrence of each unique_id.
hierarchy = (
    weekly_df[['unique_id'] + HIERARCHY_COLS]
    .drop_duplicates(subset=['unique_id'])
)

# Merge.
# - Dropping any hierarchy columns already present makes this cell safe to
#   re-run: without it a second run would create item_id_x / item_id_y style
#   duplicates and break later column lookups such as 'cat_id'.
# - validate='one_to_one' makes pandas raise MergeError if either side has a
#   repeated unique_id, instead of silently multiplying rows.
diagnostics = (
    diagnostics
    .drop(columns=HIERARCHY_COLS, errors='ignore')
    .merge(hierarchy, on='unique_id', how='left', validate='one_to_one')
)

### 3.4 Preview Key Metrics

In [6]:
# Headline diagnostics to inspect first
KEY_METRICS = [
    'trend',
    'seasonal_strength',
    'permutation_entropy',
    'adi',
    'lumpiness',
    'MI_top_k_lags',
]

# First ten series: identifiers followed by the headline diagnostics
diagnostics.head(10)[['unique_id', 'cat_id', 'dept_id', *KEY_METRICS]]

Unnamed: 0,unique_id,cat_id,dept_id,trend,seasonal_strength,permutation_entropy,adi,lumpiness,MI_top_k_lags
0,FOODS_1_001_CA_1,FOODS,FOODS_1,0.20445,0.376623,0.969347,1.105469,87.235596,0.270401
1,FOODS_1_001_CA_2,FOODS,FOODS_1,0.22328,0.439298,0.981118,1.105469,230.382385,0.153054
2,FOODS_1_001_CA_3,FOODS,FOODS_1,0.162804,0.384099,0.984305,1.118577,116.775986,0.150131
3,FOODS_1_001_CA_4,FOODS,FOODS_1,0.110839,0.479389,0.952965,1.276018,0.956493,0.28407
4,FOODS_1_001_TX_1,FOODS,FOODS_1,0.260977,0.376637,0.962183,1.200855,20.594612,0.168827
5,FOODS_1_001_TX_2,FOODS,FOODS_1,0.212788,0.397528,0.984763,1.156379,17.905836,0.210735
6,FOODS_1_001_TX_3,FOODS,FOODS_1,0.048173,0.417552,0.943629,1.181435,1.983176,0.243874
7,FOODS_1_001_WI_1,FOODS,FOODS_1,0.287522,0.423122,0.975867,1.119522,17.10169,0.251459
8,FOODS_1_001_WI_2,FOODS,FOODS_1,0.392052,0.434856,0.935963,1.288991,30.988007,0.268583
9,FOODS_1_001_WI_3,FOODS,FOODS_1,0.466792,0.466487,0.904182,1.524324,20.387428,0.246044


---

## 4. Save Output

In [7]:
# Persist the diagnostics table (without the row index) for downstream modules
diagnostics.to_parquet(
    OUTPUT_DIR / '1.09_diagnostics.parquet',
    index=False,
)