<div style="font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; border: 1px solid #ddd; border-radius: 10px; padding: 16px 20px; margin-bottom: 16px; background: #fafafa;">
  <h1 style="margin-top: 0; margin-bottom: 8px; font-size: 24px;">HAVI – 02_series_preparation</h1>
  <p style="margin: 0 0 12px 0; font-size: 14px;">
    Cel: wczytać zbiór <code>master_raw</code>, zagregować popyt do poziomu kraj + SKU (Level A), policzyć metryki jakości serii i zapisać rejestr serii <code>series_registry</code>.
  </p>
</div>


# HAVI – 02_series_preparation

## Cel:
- Wczytać data/master_raw.parquet (produkt etapu 01)
- Zdefiniować serię do prognozowania i poziom agregacji:
  Level A (bazowy): country + sku (agregacja po DC, sum(demand_raw))
- Policzyć metryki jakości serii: ciągłość czasu, luki, segmenty
- Zbudować dataset Level A oraz metryki pod późniejszą selekcję i modele

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Notebook-wide display limits: at most 30 rows, every column shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)

# Project layout, resolved relative to the notebook's working directory.
BASE_DIR = Path(".")
DATA_DIR = BASE_DIR.joinpath("data")
OUT_DIR = DATA_DIR.joinpath("prepared")
RAW_PATH = DATA_DIR.joinpath("master_raw.parquet")

# Create the output folder (and any missing parents) before anything is written to it.
OUT_DIR.mkdir(parents=True, exist_ok=True)

RAW_PATH


WindowsPath('data/master_raw.parquet')

## 1. Wczytanie danych

In [2]:
# Load the raw master table from RAW_PATH.
df_raw = pd.read_parquet(RAW_PATH)

# Normalise the identifier columns to stripped text.
# A bare `.astype(str)` turns missing values into the literal text "nan", which
# would make the NA smoke test on these columns pass vacuously. Missing entries
# are therefore masked back to NA after the conversion.
for c in ["country", "dc_id", "sku"]:
    as_text = df_raw[c].astype(str).str.strip()
    df_raw[c] = as_text.mask(df_raw[c].isna())

# Coerce rather than raise: values that cannot be parsed become NaT / NaN and
# are then visible to any NA check on these columns.
df_raw["week_start"] = pd.to_datetime(df_raw["week_start"], errors="coerce")
df_raw["demand_raw"] = pd.to_numeric(df_raw["demand_raw"], errors="coerce")

display(df_raw.head())
df_raw.info()


Unnamed: 0,country,dc_id,sku,product_name,year,week,week_start,demand_raw,source_file,source_sheet
0,Germany,100,00004-807-019,Pommes Frites I,2023,21,2023-05-22,1.0,Germany 4-807-019.xlsx,Export
1,Germany,100,00004-807-019,Pommes Frites I,2024,47,2024-11-18,1.0,Germany 4-807-019.xlsx,Export
2,Germany,100,00004-807-019,Pommes Frites I,2024,50,2024-12-09,1.0,Germany 4-807-019.xlsx,Export
3,Germany,100,00004-807-019,Pommes Frites I,2025,24,2025-06-09,1.0,Germany 4-807-019.xlsx,Export
4,Germany,200,00004-807-019,Pommes Frites I,2024,36,2024-09-02,1431.0,Germany 4-807-019.xlsx,Export


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18967 entries, 0 to 18966
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   country       18967 non-null  object        
 1   dc_id         18967 non-null  object        
 2   sku           18967 non-null  object        
 3   product_name  18967 non-null  object        
 4   year          18967 non-null  Int64         
 5   week          18967 non-null  Int64         
 6   week_start    18967 non-null  datetime64[ns]
 7   demand_raw    18967 non-null  float64       
 8   source_file   18967 non-null  object        
 9   source_sheet  18967 non-null  object        
dtypes: Int64(2), datetime64[ns](1), float64(1), object(6)
memory usage: 1.5+ MB


In [3]:
# Columns checked for missing values before any aggregation.
key_cols = ["country", "dc_id", "sku", "week_start", "demand_raw"]
na_counts = df_raw[key_cols].isna().sum()
display(na_counts)

# Each check is evaluated right before its own assert, so the first failing
# rule is the one that is reported.
assert not na_counts.any(), "Są braki w kluczowych polach — wróć do 01/QC."
assert not df_raw["demand_raw"].lt(0).any(), "Wykryto ujemne wolumeny — to powinno być naprawione w 01."
assert not df_raw["week_start"].dt.dayofweek.ne(0).any(), "week_start nie jest poniedziałkiem dla części rekordów."

print("Smoke testy OK.")


country       0
dc_id         0
sku           0
week_start    0
demand_raw    0
dtype: int64

Smoke testy OK.


## 2. Definicja serii i poziom agregacji

In [4]:
# Level A series identity: one series per (country, sku) pair.
LEVEL_A_KEY = ["country", "sku"]
# Column holding the weekly timestamp of each observation.
TIME_COL = "week_start"
# Raw demand column that is aggregated into the Level A series.
VALUE_COL = "demand_raw"


## 3. Budowa datasetu

In [5]:
def mode_or_first(s: pd.Series):
    """Return the most frequent non-null value of ``s`` as a string.

    Values are compared after conversion to ``str``. When several values share
    the highest frequency, the one that occurs first in ``s`` is returned, so
    the result does not depend on how ``value_counts`` happens to order ties.

    Parameters
    ----------
    s : pd.Series
        Values to summarise; nulls are ignored.

    Returns
    -------
    str or pd.NA
        The winning value, or ``pd.NA`` when ``s`` has no non-null values.
    """
    labels = s.dropna().astype(str)
    if labels.empty:
        return pd.NA

    counts = labels.value_counts()
    # Keep only the entries whose label reaches the top frequency, in their
    # original order, and take the first one.
    is_top = labels.map(counts) == counts.max()
    return labels[is_top].iloc[0]

# Level A grain: one row per series key and week.
level_a_group_cols = [*LEVEL_A_KEY, TIME_COL]

# Collapse distribution centres: total demand, the most common product name,
# and the number of distinct DCs contributing to each week.
df_level_a = df_raw.groupby(level_a_group_cols, as_index=False).agg(
    demand=pd.NamedAgg(column=VALUE_COL, aggfunc="sum"),
    product_name=pd.NamedAgg(column="product_name", aggfunc=mode_or_first),
    n_dc=pd.NamedAgg(column="dc_id", aggfunc="nunique"),
)
df_level_a = df_level_a.sort_values(level_a_group_cols).reset_index(drop=True)

display(df_level_a.head(10))
df_level_a.shape


Unnamed: 0,country,sku,week_start,demand,product_name,n_dc
0,Germany,00004-807-019,2022-01-03,7233.0,Pommes Frites I,1
1,Germany,00004-807-019,2022-01-10,5271.0,Pommes Frites I,1
2,Germany,00004-807-019,2022-01-17,5462.0,Pommes Frites I,1
3,Germany,00004-807-019,2022-01-24,6225.0,Pommes Frites I,1
4,Germany,00004-807-019,2022-01-31,5095.0,Pommes Frites I,2
5,Germany,00004-807-019,2022-02-07,4765.0,Pommes Frites I,2
6,Germany,00004-807-019,2022-02-14,6608.0,Pommes Frites I,1
7,Germany,00004-807-019,2022-02-21,5466.0,Pommes Frites I,1
8,Germany,00004-807-019,2022-02-28,4868.0,Pommes Frites I,1
9,Germany,00004-807-019,2022-03-07,36.0,Pommes Frites I,1


(5810, 6)

In [6]:
# Headline facts about the Level A dataset, collected in display order.
level_a_weeks = df_level_a[TIME_COL]
summary_items = [
    ("rows", len(df_level_a)),
    ("n_countries", df_level_a["country"].nunique()),
    ("n_sku", df_level_a["sku"].nunique()),
    ("n_series(country+sku)", len(df_level_a.drop_duplicates(subset=LEVEL_A_KEY))),
    ("min_date", level_a_weeks.min()),
    ("max_date", level_a_weeks.max()),
]
summary_a = pd.Series(dict(summary_items))
summary_a



rows                                    5810
n_countries                                6
n_sku                                     22
n_series(country+sku)                     22
min_date                 2018-12-31 00:00:00
max_date                 2025-11-03 00:00:00
dtype: object

## 4. Metryki ciągłości czasu

In [7]:
def continuity_metrics(dates: pd.Series) -> pd.Series:
    """Describe how continuous a weekly series is, given its observation dates.

    Every date is first snapped to the Monday of its week, so the metrics stay
    consistent even when the input is not Monday-aligned (a Monday-anchored
    calendar grid would otherwise come out one week short and yield a negative
    ``missing_weeks``). For Monday-aligned input the snapping is a no-op.

    Parameters
    ----------
    dates : pd.Series
        Observation dates; anything ``pd.to_datetime`` accepts. Nulls and
        duplicate weeks are ignored.

    Returns
    -------
    pd.Series
        Integer metrics:
        ``n_weeks_obs`` - number of distinct observed weeks;
        ``span_weeks`` - weeks from the first to the last observed week, inclusive;
        ``missing_weeks`` - ``span_weeks - n_weeks_obs``;
        ``longest_gap_weeks`` - longest run of consecutive unobserved weeks;
        ``n_segments`` - number of uninterrupted runs of observed weeks.
        All metrics are 0 when there are no valid dates.
    """
    stamps = pd.DatetimeIndex(pd.to_datetime(dates).dropna())
    if len(stamps) == 0:
        return pd.Series({
            "n_weeks_obs": 0,
            "span_weeks": 0,
            "missing_weeks": 0,
            "longest_gap_weeks": 0,
            "n_segments": 0,
        })

    # Snap each timestamp to midnight of its week's Monday, then deduplicate.
    mondays = stamps.normalize() - pd.to_timedelta(stamps.dayofweek, unit="D")
    week_starts = mondays.unique().sort_values()

    n_weeks_obs = len(week_starts)
    span_weeks = (week_starts[-1] - week_starts[0]).days // 7 + 1
    missing_weeks = span_weeks - n_weeks_obs

    # Distance, in weeks, from each observed week to the previous one.
    # The first week has no predecessor and counts as a regular 1-week step.
    steps = week_starts.to_series().diff().dt.days.div(7).fillna(1).astype(int)
    longest_gap = max(int(steps.max()) - 1, 0)

    # Every step longer than one week starts a new segment.
    n_segments = int((steps > 1).sum()) + 1

    return pd.Series({
        "n_weeks_obs": int(n_weeks_obs),
        "span_weeks": int(span_weeks),
        "missing_weeks": int(missing_weeks),
        "longest_gap_weeks": int(longest_gap),
        "n_segments": int(n_segments),
    })

# Continuity metrics per Level A series. Because `continuity_metrics` returns
# a Series, the result is in long form: one row per (series, metric) pair.
weeks_by_series = df_level_a.groupby(LEVEL_A_KEY, observed=True)[TIME_COL]
series_cont = weeks_by_series.apply(continuity_metrics).reset_index()

display(series_cont.head(10))
series_cont.describe(include="all")


Unnamed: 0,country,sku,level_2,week_start
0,Germany,00004-807-019,n_weeks_obs,188
1,Germany,00004-807-019,span_weeks,201
2,Germany,00004-807-019,missing_weeks,13
3,Germany,00004-807-019,longest_gap_weeks,8
4,Germany,00004-807-019,n_segments,4
5,Germany,00019-003-003,n_weeks_obs,202
6,Germany,00019-003-003,span_weeks,202
7,Germany,00019-003-003,missing_weeks,0
8,Germany,00019-003-003,longest_gap_weeks,0
9,Germany,00019-003-003,n_segments,1


Unnamed: 0,country,sku,level_2,week_start
count,110,110,110,110.0
unique,6,22,5,
top,Romania,00004-807-019,n_weeks_obs,
freq,25,5,22,
mean,,,,116.118182
std,,,,140.416075
min,,,,0.0
25%,,,,2.0
50%,,,,18.0
75%,,,,222.25


In [8]:
# Show the column names produced by the long-form result.
list(series_cont.columns)


['country', 'sku', 'level_2', 'week_start']

In [9]:
# Reshape the long (series, metric, value) table into one row per series with
# one column per metric.
# The guard keeps the cell re-runnable: once pivoted, the "level_2" column is
# gone and pivoting again would raise.
if "level_2" in series_cont.columns:
    series_cont = (
        series_cont
        .pivot(
            index=["country", "sku"],
            columns="level_2",
            values="week_start"
        )
        # Drop the leftover "level_2" label from the columns axis so it does
        # not show up as a header in later displays.
        .rename_axis(columns=None)
        .reset_index()
    )

display(series_cont.head())
series_cont.columns.tolist()


['country',
 'sku',
 'longest_gap_weeks',
 'missing_weeks',
 'n_segments',
 'n_weeks_obs',
 'span_weeks']

In [10]:
# Series with the most missing weeks first; ties broken by longest gap, then
# by number of segments.
worst_first_cols = ["missing_weeks", "longest_gap_weeks", "n_segments"]
series_cont.sort_values(by=worst_first_cols, ascending=False).head(5)


level_2,country,sku,longest_gap_weeks,missing_weeks,n_segments,n_weeks_obs,span_weeks
13,Romania,76518-000-000,27,124,60,233,357
12,Romania,07808-016-000,4,60,46,271,331
20,Sweden,00397-117-000,44,50,5,307,357
15,Spain,00119-066-000,16,46,6,163,209
6,Portugal,00012-438-000,39,40,3,118,158


## 5. Zera i intermittent demand

In [11]:
def intermittent_metrics(df: pd.DataFrame) -> pd.Series:
    """Compute zero-demand and intermittency metrics for one series.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single series; must contain ``week_start`` (used only for
        ordering) and ``demand``.

    Returns
    -------
    pd.Series
        ``zero_share`` - share of rows with demand exactly 0 (NaN for an empty frame);
        ``n_nonzero`` - number of rows with demand > 0;
        ``avg_gap_nonzero`` / ``ADI`` - mean distance, in rows, between
        consecutive positive-demand rows (``inf`` with fewer than two such rows);
        ``CV2`` - squared coefficient of variation (sample std, ddof=1) of the
        positive demands (``inf`` with fewer than two positive rows).

    Notes
    -----
    NOTE(review): distances are measured in row positions, not calendar weeks,
    so weeks that are absent from ``df`` are not counted as zero-demand
    periods. Confirm whether missing weeks should be filled with zeros before
    this is called.
    """
    y = df.sort_values("week_start")["demand"].to_numpy()
    is_positive = y > 0

    # Guard the empty case explicitly: same NaN result, without the
    # "mean of empty slice" RuntimeWarning.
    zero_share = float((y == 0).mean()) if y.size else float("nan")
    n_nonzero = int(is_positive.sum())

    # Both reported gap metrics are the same quantity; compute it once.
    nz_idx = np.flatnonzero(is_positive)
    if nz_idx.size >= 2:
        mean_gap = float(np.diff(nz_idx).mean())
    else:
        mean_gap = np.inf

    nz_vals = y[is_positive]
    if nz_vals.size >= 2 and nz_vals.mean() > 0:
        CV2 = float((nz_vals.std(ddof=1) / nz_vals.mean()) ** 2)
    else:
        CV2 = np.inf

    return pd.Series({
        "zero_share": zero_share,
        "n_nonzero": n_nonzero,
        "avg_gap_nonzero": mean_gap,
        "ADI": mean_gap,
        "CV2": CV2,
    })

# Intermittency metrics per Level A series.
# Only the two columns `intermittent_metrics` reads are selected, so `apply`
# does not operate on the grouping columns (which pandas deprecates and warns about).
interm = (
    df_level_a
    .groupby(LEVEL_A_KEY, observed=True)[["week_start", "demand"]]
    .apply(intermittent_metrics)
    .reset_index()
)

interm.head()



  .apply(intermittent_metrics)


Unnamed: 0,country,sku,zero_share,n_nonzero,avg_gap_nonzero,ADI,CV2
0,Germany,00004-807-019,0.0,188.0,1.0,1.0,0.122573
1,Germany,00019-003-003,0.0,202.0,1.0,1.0,0.03545
2,Poland,02589-489-000,0.0,357.0,1.0,1.0,0.037047
3,Poland,05243-022-000,0.0,338.0,1.0,1.0,0.100955
4,Poland,16333-000-000,0.0,205.0,1.0,1.0,0.279117


In [12]:
def demand_class(ADI, CV2, adi_cut=1.32, cv2_cut=0.49):
    """Classify a series by its demand interval and demand variability.

    Parameters
    ----------
    ADI : float
        Average interval between non-zero demands.
    CV2 : float
        Squared coefficient of variation of the non-zero demands.
    adi_cut : float, default 1.32
        ``ADI`` values above this are treated as sparse demand.
    cv2_cut : float, default 0.49
        ``CV2`` values above this are treated as highly variable demand.

    Returns
    -------
    str
        ``"insufficient"`` when either input is not finite (inf or NaN);
        otherwise ``"smooth"`` (not sparse, not variable), ``"intermittent"``
        (sparse, not variable), ``"lumpy"`` (sparse and variable) or
        ``"erratic"`` (not sparse, variable).
    """
    # NaN would otherwise fail every comparison below and be labelled
    # "erratic"; treat it like inf, i.e. not enough data to classify.
    if not (np.isfinite(ADI) and np.isfinite(CV2)):
        return "insufficient"

    is_sparse = ADI > adi_cut
    is_variable = CV2 > cv2_cut
    if is_sparse:
        return "lumpy" if is_variable else "intermittent"
    return "erratic" if is_variable else "smooth"

# Label every series from its ADI / CV2 pair.
interm["demand_type"] = [
    demand_class(adi, cv2)
    for adi, cv2 in zip(interm["ADI"], interm["CV2"])
]

interm["demand_type"].value_counts(dropna=False)


demand_type
smooth     19
erratic     3
Name: count, dtype: int64

## 6. Outliery

In [13]:
def mad_outlier_flags(x: pd.Series, z=5.0) -> pd.Series:
    """Flag values whose robust z-score exceeds ``z`` in absolute value.

    The score is ``0.6745 * (value - median) / MAD``, where MAD is the median
    absolute deviation from the median. When the MAD is zero or undefined,
    nothing is flagged.

    Parameters
    ----------
    x : pd.Series
        Values to screen; converted to float.
    z : float, default 5.0
        Threshold on the absolute robust z-score (strictly greater than).

    Returns
    -------
    pd.Series
        Boolean flags aligned to the index of ``x``.
    """
    values = x.astype(float)
    center = values.median()
    abs_dev = (values - center).abs()
    scale = abs_dev.median()

    # Without a usable scale there is no score to compare against.
    if np.isnan(scale) or scale == 0:
        return pd.Series(False, index=values.index)

    score_magnitude = 0.6745 * abs_dev / scale
    return score_magnitude > z

df_level_a = df_level_a.sort_values(LEVEL_A_KEY + [TIME_COL])

# Flag outliers within each series.
# `transform` returns a result aligned to df_level_a's own index, so there is
# no need to strip group-key index levels by position as with `apply`; whether
# `apply` adds those levels depends on the pandas version.
df_level_a["is_outlier"] = (
    df_level_a
    .groupby(LEVEL_A_KEY, observed=True)["demand"]
    .transform(mad_outlier_flags)
    .astype(bool)
)

df_level_a["is_outlier"].mean()


np.float64(0.010154905335628227)

## 7. Testowanie reguł

In [14]:
# Eligibility thresholds, named so the rule below can be read and tuned in one place.
MIN_WEEKS_OBS = 104          # minimum observed weeks (inclusive); 104 = 2 x 52
MAX_LONGEST_GAP_WEEKS = 8    # longest gap must be strictly below this
MAX_ZERO_SHARE = 0.8         # share of zero-demand rows must be strictly below this
MIN_NONZERO_WEEKS = 30       # minimum rows with positive demand (inclusive)

# One registry row per series: continuity metrics plus intermittency metrics.
# `validate` fails loudly if either side has duplicated (country, sku) keys,
# which would otherwise silently multiply rows.
series_registry = (
    series_cont
    .merge(interm, on=["country", "sku"], how="left", validate="one_to_one")
)

# Series with no match in `interm` get NaN metrics, fail the comparisons and
# are marked as not eligible.
series_registry["eligible"] = (
    (series_registry["n_weeks_obs"] >= MIN_WEEKS_OBS) &
    (series_registry["longest_gap_weeks"] < MAX_LONGEST_GAP_WEEKS) &
    (series_registry["zero_share"] < MAX_ZERO_SHARE) &
    (series_registry["n_nonzero"] >= MIN_NONZERO_WEEKS)
)

series_registry["eligible"].value_counts()


eligible
True     15
False     7
Name: count, dtype: int64

In [15]:
# Inspect the series rejected by the eligibility rule: shortest first, then
# largest gap, then highest zero share.
ineligible_cols = [
    "country", "sku", "n_weeks_obs", "longest_gap_weeks",
    "zero_share", "n_nonzero", "demand_type",
]
is_ineligible = ~series_registry["eligible"]
(
    series_registry
    .loc[is_ineligible, ineligible_cols]
    .sort_values(
        by=["n_weeks_obs", "longest_gap_weeks", "zero_share"],
        ascending=[True, False, False],
    )
    .head(15)
)



Unnamed: 0,country,sku,n_weeks_obs,longest_gap_weeks,zero_share,n_nonzero,demand_type
6,Portugal,00012-438-000,118,39,0.016949,116.0,smooth
15,Spain,00119-066-000,163,16,0.01227,161.0,smooth
0,Germany,00004-807-019,188,8,0.0,188.0,smooth
4,Poland,16333-000-000,205,8,0.0,205.0,smooth
13,Romania,76518-000-000,233,27,0.004292,232.0,erratic
20,Sweden,00397-117-000,307,44,0.0,307.0,smooth
3,Poland,05243-022-000,338,15,0.0,338.0,smooth


In [16]:
# Work on a copy so the derived demand columns are not added to df_level_a.
df_variants = df_level_a.copy()

def winsorize(s, lower=0.01, upper=0.99):
    """Clip ``s`` to its own ``lower`` and ``upper`` quantiles.

    Parameters
    ----------
    s : pd.Series
        Values to clip.
    lower : float, default 0.01
        Quantile used as the lower bound.
    upper : float, default 0.99
        Quantile used as the upper bound.

    Returns
    -------
    pd.Series
        ``s`` with values outside the two quantiles replaced by the bounds;
        same index as ``s``.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower <= upper <= 1``.
    """
    if not 0.0 <= lower <= upper <= 1.0:
        raise ValueError("quantiles must satisfy 0 <= lower <= upper <= 1")
    lo, hi = s.quantile([lower, upper]).to_numpy()
    return s.clip(lo, hi)

# Variant 1: demand clipped within each series by `winsorize`.
demand_by_series = df_variants.groupby(["country", "sku"], observed=True)["demand"]
df_variants["demand_clipped"] = demand_by_series.transform(winsorize)

# Variant 2: log(1 + demand) of the unclipped demand.
df_variants["demand_log1p"] = df_variants["demand"].pipe(np.log1p)


In [17]:
# Output locations, all inside OUT_DIR.
REG_PATH = OUT_DIR.joinpath("series_registry.csv")
LEVELA_PATH = OUT_DIR.joinpath("series_level_a.parquet")
VAR_PATH = OUT_DIR.joinpath("series_level_a_variants.parquet")

# Persist the series registry, the Level A dataset and its demand variants;
# the index is dropped in every file.
series_registry.to_csv(REG_PATH, index=False)
df_level_a.to_parquet(LEVELA_PATH, index=False)
df_variants.to_parquet(VAR_PATH, index=False)

REG_PATH, LEVELA_PATH, VAR_PATH


(WindowsPath('data/prepared/series_registry.csv'),
 WindowsPath('data/prepared/series_level_a.parquet'),
 WindowsPath('data/prepared/series_level_a_variants.parquet'))