In [1]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats


In [2]:
def _read_master_csv(csv_path):
    """Load one merged master CSV.

    Reads the file at `csv_path`, parses its 'date' column into datetimes and
    then lets pandas pick its nullable extension dtypes for every column.
    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(csv_path, low_memory=False)
    frame['date'] = pd.to_datetime(frame['date'])
    return frame.convert_dtypes()


# Both master files live under <current working directory>/data/merged_data.
merged_data_path = os.path.join(os.getcwd(), 'data', 'merged_data')
master_train_path = os.path.join(merged_data_path, 'master_train.csv')
master_forecast_path = os.path.join(merged_data_path, 'master_forecast.csv')

df_master_train = _read_master_csv(master_train_path)
df_master_forecast = _read_master_csv(master_forecast_path)

In [3]:
# Missing-data audit of the training frame exactly as it was loaded:
# per column, the number of null values and their share of all rows.
# NOTE(review): no nulls have been filled or dropped before this point in the
# notebook, so the table reflects the raw file.
missing_data = df_master_train.isna().sum()
missing_pct = missing_data / len(df_master_train) * 100

# Keep only the columns that actually have nulls, worst offenders first.
missing_df = (
    missing_data.to_frame('Missing Count')
    .assign(**{'Missing Percentage': missing_pct})
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print(missing_df)

# Write the training frame back over the file it was read from.
# NOTE(review): the frame still contains the nulls reported above; the only
# changes relative to the loaded file are the parsed 'date' column and the
# converted dtypes. Confirm that overwriting the input file is intended.
df_master_train.to_csv(master_train_path, index=False)


                  Missing Count  Missing Percentage
promo_id                 297837           86.618545
discountPct              297837           86.618545
holidayEventName         271609           78.990778
FSC_index                    18            0.005235


In [None]:
# Build the working frame from a copy so df_master_train itself is not modified.
df = (
    df_master_train.copy()
    # promo_id is removed when present; a missing column is not an error.
    .drop(columns=["promo_id"], errors="ignore")
    # Rows without an FSC_index value are discarded.
    .dropna(subset=["FSC_index"])
    .assign(
        # Null discountPct becomes 0 (presumably "no discount" - confirm).
        discountPct=lambda frame: frame["discountPct"].fillna(0),
        # Null holidayEventName becomes an explicit "no event" label.
        holidayEventName=lambda frame: frame["holidayEventName"].fillna("__NO_EVENT__"),
    )
)

cat_cols = ["category", "mainProductGroup", "holidayEventName", "doWName"]

# Show the level frequencies of every categorical column that exists in df.
for c in cat_cols:
    if c not in df.columns:
        continue
    print(f"\n{c} value counts:", df[c].value_counts(), sep="\n")



category value counts:
category
Wijn en PSV             108487
Vlees, Kip en Vis       106535
Was Reiniging Papier     64454
Zoetwaren                64355
Name: count, dtype: Int64

mainProductGroup value counts:
mainProductGroup
Rood stevig                   22793
Rood soepel                   21659
others - Zoetwaren            21643
Vis                           21565
Wasmiddelen                   21560
Afwas                         21450
others - Vlees, Kip en Vis    21447
Vleesconserven                21446
Lucht                         21444
chocolade                     21418
others - Wijn en PSV          21380
Wit vol                       21342
Wit fris&fruitig              21313
suikerwerk                    21294
Visconserven                  21128
Vlees                         20949
Name: count, dtype: Int64

holidayEventName value counts:
holidayEventName
__NO_EVENT__             271608
Sinterklaas_lag_5           973
Nieuwjaar_lead_1            963
Eerste Paasdag_lag_1 

In [None]:
# Desired reference (baseline) level for each categorical column. The baseline
# is moved to the front of the column's category list below.
baseline_map = {
    "category": "Wijn en PSV",
    # NOTE(review): the value counts printed for mainProductGroup contain no
    # level named "Other" (only "others - <category>" variants), so this entry
    # currently takes the warning/fallback path in _relevel_baseline_first.
    # Replace it with a level that actually occurs in the data.
    "mainProductGroup": "Other",
    "holidayEventName": "__NO_EVENT__",
    "doWName": "Maandag"
}


def _relevel_baseline_first(values, baseline):
    """Return `values` as an ordered Categorical with `baseline` as first category.

    Parameters
    ----------
    values : pandas.Series
        Column to convert. Its values are compared as strings.
    baseline : str
        Level that should come first in the category order.

    Returns
    -------
    pandas.Categorical
        Ordered categorical whose categories are the observed non-missing
        levels in order of first appearance, with `baseline` moved to the
        front. Missing values stay missing.

    Warns
    -----
    UserWarning
        If `baseline` is not an observed level. The categories are then left
        in order of first appearance and `baseline` is NOT added, because an
        unobserved first category would become an empty reference level.
    """
    as_text = values.astype("string")
    # Drop NA before listing levels: `pd.NA != baseline` evaluates to pd.NA,
    # whose truth value is ambiguous and raises inside the filter below.
    observed = as_text.dropna().unique().tolist()

    if baseline in observed:
        ordered_levels = [baseline] + [lvl for lvl in observed if lvl != baseline]
    else:
        warnings.warn(
            f"Baseline {baseline!r} is not an observed level of column "
            f"{values.name!r}; keeping the observed levels in order of first "
            f"appearance. Choose a baseline that occurs in the data.",
            stacklevel=2,
        )
        ordered_levels = observed

    # Build the categorical from the same stringified values the levels were
    # derived from, so every value matches one of the categories.
    return pd.Categorical(as_text, categories=ordered_levels, ordered=True)


for col, baseline in baseline_map.items():
    # Columns named in the map but absent from df are skipped.
    if col in df.columns:
        df[col] = _relevel_baseline_first(df[col], baseline)
