# HW3: Demand Forecasting

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.metrics import mean_absolute_error
warnings.filterwarnings('ignore')

SEED = 2025
np.random.seed(SEED)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``.
    The denominator is floored at 1e-9, so a pair where both values are zero
    contributes 0 instead of triggering a division by zero.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    # Coerce to float arrays so plain Python sequences are accepted and
    # unsigned-integer inputs cannot wrap around in the subtraction below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = np.clip((np.abs(y_true) + np.abs(y_pred)) / 2.0, 1e-9, None)
    return np.mean(numerator / denominator) * 100

In [None]:
# Read the three input files; only train.csv feeds all_data.
all_data, test_ids_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Parse the period start as a datetime and give the unnamed first column a
# proper name.
all_data['period_start_dt'] = pd.to_datetime(all_data['period_start_dt'], format='%Y-%m-%d')
all_data = all_data.rename(columns={'Unnamed: 0': 'id'})

# These two columns are discarded outright.
all_data = all_data.drop(columns=['PROMO2_FLAG', 'NUM_CONSULTANT'])

# A missing flag is treated as "flag not set".
for flag_col in ('PROMO1_FLAG', 'AUTORIZATION_FLAG'):
    all_data[flag_col] = all_data[flag_col].fillna(0)

# Missing prices get the column-wide median. The medians are taken before the
# store filter below, so rows of store 309 still contribute to them.
for price_col in ('PRICE_REGULAR', 'PRICE_AFTER_DISC'):
    all_data[price_col] = all_data[price_col].fillna(all_data[price_col].median())

# Exclude store 309 from everything that follows.
keep_rows = all_data['store_location_rk'] != 309
all_data = all_data.loc[keep_rows].copy()

Data: (35329, 9)


In [None]:
# Calendar features derived from the period start date. The dict keeps the
# insertion order, which fixes the column order of the resulting frame.
stamps = all_data['period_start_dt']
calendar_parts = {
    'month': stamps.dt.month,
    'dayofweek': stamps.dt.dayofweek,
    'dayofyear': stamps.dt.dayofyear,
    'weekofyear': stamps.dt.isocalendar().week.astype(int),  # ISO week number
    'quarter': stamps.dt.quarter,
    'is_weekend': (stamps.dt.dayofweek >= 5).astype(int),  # 5 = Sat, 6 = Sun
    'is_month_start': stamps.dt.is_month_start.astype(int),
    'is_month_end': stamps.dt.is_month_end.astype(int),
}
for part_name, part_values in calendar_parts.items():
    all_data[part_name] = part_values

# Absolute discount, floored at 0 so a discounted price above the regular
# price does not yield a negative discount.
regular_price = all_data['PRICE_REGULAR']
all_data['discount'] = regular_price.sub(all_data['PRICE_AFTER_DISC']).clip(lower=0)
# Relative discount; a regular price of 0 is mapped to NaN first so the
# division cannot blow up, and the resulting gaps become 0.
all_data['discount_pct'] = all_data['discount'].div(regular_price.replace(0, np.nan)).fillna(0)

# log(1 + price) versions of both price columns.
for log_name, raw_name in (
    ('log_price_regular', 'PRICE_REGULAR'),
    ('log_price_after_disc', 'PRICE_AFTER_DISC'),
):
    all_data[log_name] = np.log1p(all_data[raw_name])

In [13]:
# Every (product, store) pair is treated as its own demand series; rows are
# ordered chronologically inside each pair so shifts look backwards in time.
series_keys = ['product_rk', 'store_location_rk']
all_data = all_data.sort_values(series_keys + ['period_start_dt']).reset_index(drop=True)


def _prior_rolling(history, size):
    """Rolling window of `size` rows over the values strictly before each row."""
    return history.shift(1).rolling(window=size, min_periods=1)


def _prior_ewm(history, smoothing):
    """Exponentially weighted view over the values strictly before each row."""
    return history.shift(1).ewm(alpha=smoothing, min_periods=1)


# Plain lags. The shift counts rows within the pair, not calendar days.
for lag in (7, 14, 21, 28):
    all_data[f'demand_lag_{lag}'] = all_data.groupby(series_keys)['demand'].shift(lag)

# Rolling mean / std of past demand; shift(1) keeps the current row's own
# demand out of its feature.
for window in (7, 14, 28):
    all_data[f'demand_roll_mean_{window}'] = all_data.groupby(series_keys)['demand'].transform(
        lambda history: _prior_rolling(history, window).mean()
    )
    all_data[f'demand_roll_std_{window}'] = all_data.groupby(series_keys)['demand'].transform(
        lambda history: _prior_rolling(history, window).std()
    )

# Exponentially weighted mean of past demand for two smoothing factors.
for alpha in (0.5, 0.7):
    all_data[f'demand_ewm_{alpha}'] = all_data.groupby(series_keys)['demand'].transform(
        lambda history: _prior_ewm(history, alpha).mean()
    )

In [None]:
# Aggregate statistics are computed only from rows with a known demand and
# then left-joined back onto every row.
train_only = all_data.loc[all_data['demand'].notna()].copy()

# Per-product mean / std / median.
product_stats = (
    train_only.groupby('product_rk')['demand']
    .agg(product_mean='mean', product_std='std', product_median='median')
    .reset_index()
)
all_data = all_data.merge(product_stats, on='product_rk', how='left')

# Per-store mean / std.
store_stats = (
    train_only.groupby('store_location_rk')['demand']
    .agg(store_mean='mean', store_std='std')
    .reset_index()
)
all_data = all_data.merge(store_stats, on='store_location_rk', how='left')

# Per-product mean demand by calendar month.
product_month_stats = (
    train_only.groupby(['product_rk', 'month'])['demand']
    .mean()
    .rename('product_month_mean')
    .reset_index()
)
all_data = all_data.merge(product_month_stats, on=['product_rk', 'month'], how='left')

# Per-product mean demand by day of week.
product_dow_stats = (
    train_only.groupby(['product_rk', 'dayofweek'])['demand']
    .mean()
    .rename('product_dow_mean')
    .reset_index()
)
all_data = all_data.merge(product_dow_stats, on=['product_rk', 'dayofweek'], how='left')

In [None]:
# Interaction terms: each flag multiplied by the relative discount.
discount_share = all_data['discount_pct']
all_data['promo_x_discount'] = all_data['PROMO1_FLAG'].mul(discount_share)
all_data['auth_x_discount'] = all_data['AUTORIZATION_FLAG'].mul(discount_share)

## Advanced Feature Engineering


In [None]:
pair_keys = ['product_rk', 'store_location_rk']


def _history_before(values):
    """Expanding window over the values strictly before each row."""
    return values.shift(1).expanding(min_periods=1)


# Longer lags of demand within each (product, store) pair; the shift counts
# rows, not calendar days.
for lag in (35, 42, 49, 56):
    all_data[f'demand_lag_{lag}'] = all_data.groupby(pair_keys)['demand'].shift(lag)

# Running mean / std over all earlier rows of the pair; shift(1) keeps the
# current row's demand out of its own feature.
all_data['demand_expanding_mean'] = all_data.groupby(pair_keys)['demand'].transform(
    lambda values: _history_before(values).mean()
)
all_data['demand_expanding_std'] = all_data.groupby(pair_keys)['demand'].transform(
    lambda values: _history_before(values).std()
)

# Variation ratio of the rolling windows: std / (mean + 1). The +1 keeps the
# ratio finite when the rolling mean is 0.
for window in (7, 14, 28):
    rolling_std = all_data[f'demand_roll_std_{window}']
    rolling_mean = all_data[f'demand_roll_mean_{window}']
    all_data[f'demand_cv_{window}'] = rolling_std.div(rolling_mean.add(1))


In [None]:
# Discounted price relative to the regular price. The +1 in the denominator
# keeps the ratio finite when PRICE_REGULAR is 0.
all_data['price_ratio'] = all_data['PRICE_AFTER_DISC'] / (all_data['PRICE_REGULAR'] + 1)
# Relative change of the regular price versus the previous row of the same
# (product, store) pair; depends on the current row order of all_data.
all_data['price_change_rate'] = all_data.groupby(['product_rk', 'store_location_rk'])['PRICE_REGULAR'].pct_change()

# Mean demand per (product, store) pair, broadcast to every row of the pair.
# NOTE(review): this mean is taken over all non-null demand rows of the pair,
# so each labelled row's feature includes its own target value — consider
# whether that leakage is acceptable for validation.
product_store_mean = all_data.groupby(['product_rk', 'store_location_rk'])['demand'].transform('mean')
all_data['product_store_demand_mean'] = product_store_mean

# Year-week key encoded as YYYYWW. 'weekofyear' holds the ISO week number, so
# it must be paired with the ISO year: late-December / early-January dates can
# belong to an ISO week of the neighbouring year, and pairing them with the
# calendar year would make them collide with a different week.
iso_year = all_data['period_start_dt'].dt.isocalendar().year.astype(int)
all_data['week_year'] = all_data['weekofyear'] + iso_year * 100

# November and December flag.
all_data['is_holiday_season'] = ((all_data['month'] == 11) | (all_data['month'] == 12)).astype(int)

# First two / last few ISO weeks of the year.
all_data['is_year_start'] = (all_data['weekofyear'] <= 2).astype(int)
all_data['is_year_end'] = (all_data['weekofyear'] >= 50).astype(int)


In [None]:
# Rows with a known demand are used for fitting; rows without one are the
# rows to predict.
has_target = all_data['demand'].notna()
train_df = all_data[has_target].copy()
test_df = all_data[all_data['demand'].isna()].copy()

# Time-based hold-out: dates at or after the 85% quantile of the training
# dates form the validation set.
train_df = train_df.sort_values('period_start_dt')
split_date = train_df['period_start_dt'].quantile(0.85)

X_train = train_df[train_df['period_start_dt'].lt(split_date)].copy()
X_val = train_df[train_df['period_start_dt'].ge(split_date)].copy()

# Everything except identifiers, the date and the target is a model input.
non_feature_cols = {'id', 'period_start_dt', 'demand', 'Unnamed: 0'}
feature_cols = [name for name in all_data.columns if name not in non_feature_cols]

# Replace NaN and +/-inf with 0 in every feature of every split.
for frame in (X_train, X_val, test_df):
    for col in feature_cols:
        frame[col] = frame[col].fillna(0).replace([np.inf, -np.inf], 0)

In [None]:
# Wrap the train and validation splits in XGBoost's DMatrix container.
dtrain, dval = (
    xgb.DMatrix(part[feature_cols], label=part['demand'])
    for part in (X_train, X_val)
)

# Squared-error objective, monitored with MAE on the evaluation sets.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='mae',
    max_depth=7,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=3,
    gamma=0.05,
    reg_alpha=0.05,
    reg_lambda=1.5,
    tree_method='hist',
    seed=SEED,
    verbosity=0,
)

# NOTE(review): the validation set is listed last on purpose if early stopping
# keys off the final entry of `evals` — confirm against the XGBoost docs.
evals = [(dtrain, 'train'), (dval, 'val')]

# Up to 3000 rounds, early stopping with a patience of 150 rounds, progress
# logged every 200 rounds.
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=3000,
    evals=evals,
    early_stopping_rounds=150,
    verbose_eval=200,
)

In [None]:
# LightGBM datasets; the validation set references the training set.
lgb_train = lgb.Dataset(data=X_train[feature_cols], label=X_train['demand'])
lgb_val = lgb.Dataset(data=X_val[feature_cols], label=X_val['demand'], reference=lgb_train)

# Gradient-boosted trees with the 'regression' objective, monitored with MAE.
lgb_params = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    num_leaves=63,
    learning_rate=0.03,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=5,
    min_child_samples=20,
    lambda_l1=0.05,
    lambda_l2=1.5,
    seed=SEED,
    verbosity=-1,
)

# Up to 3000 rounds, early stopping with a patience of 150 rounds, progress
# logged every 200 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=150),
    lgb.log_evaluation(period=200),
]
lgb_model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=3000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'val'],
    callbacks=training_callbacks,
)

# CatBoost regressor trained on MAE, with the iteration-based overfitting
# detector (patience of 150 iterations) and progress logged every 200
# iterations.
cb_settings = {
    'iterations': 3000,
    'learning_rate': 0.03,
    'depth': 7,
    'l2_leaf_reg': 1.5,
    'random_strength': 0.5,
    'bagging_temperature': 0.2,
    'od_type': 'Iter',
    'od_wait': 150,
    'random_seed': SEED,
    'verbose': 200,
    'loss_function': 'MAE',
}
cb_model = CatBoostRegressor(**cb_settings)

# Fit on the training split, evaluate on the validation split, and keep the
# best model as requested by use_best_model.
cb_model.fit(
    X=X_train[feature_cols],
    y=X_train['demand'],
    eval_set=(X_val[feature_cols], X_val['demand']),
    use_best_model=True,
)


In [None]:
def _non_negative(raw_pred):
    """Clip predictions at 0 from below."""
    return np.clip(raw_pred, 0, None)


val_features = X_val[feature_cols]
val_target = X_val['demand'].values

# Validation predictions of each model at its best iteration, clipped at 0.
xgb_val_pred = _non_negative(
    xgb_model.predict(dval, iteration_range=(0, xgb_model.best_iteration + 1))
)
lgb_val_pred = _non_negative(
    lgb_model.predict(val_features, num_iteration=lgb_model.best_iteration)
)
cb_val_pred = _non_negative(cb_model.predict(val_features))

# Validation SMAPE per model.
xgb_smape, lgb_smape, cb_smape = (
    smape(val_target, model_pred)
    for model_pred in (xgb_val_pred, lgb_val_pred, cb_val_pred)
)

# Blend weights proportional to 1 / SMAPE, normalised to sum to 1, so a model
# with a lower validation error gets a larger weight.
weights = 1.0 / np.array([xgb_smape, lgb_smape, cb_smape])
weights = weights / weights.sum()

# Weighted blend of the three models, again clipped at 0.
share_xgb, share_lgb, share_cb = weights
ensemble_val_pred = _non_negative(
    share_xgb * xgb_val_pred + share_lgb * lgb_val_pred + share_cb * cb_val_pred
)

ensemble_smape = smape(val_target, ensemble_val_pred)

In [None]:
def _floor_at_zero(raw_pred):
    """Clip predictions at 0 from below."""
    return np.clip(raw_pred, 0, None)


test_features = test_df[feature_cols]

# Test predictions of each model at its best iteration, clipped at 0.
dtest = xgb.DMatrix(test_features)
xgb_test_pred = _floor_at_zero(
    xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))
)
lgb_test_pred = _floor_at_zero(
    lgb_model.predict(test_features, num_iteration=lgb_model.best_iteration)
)
cb_test_pred = _floor_at_zero(cb_model.predict(test_features))

# Blend with the weights derived on the validation split, then clip again.
blend_xgb, blend_lgb, blend_cb = weights
ensemble_test_pred = _floor_at_zero(
    blend_xgb * xgb_test_pred + blend_lgb * lgb_test_pred + blend_cb * cb_test_pred
)

# Create submission: one row per id of the sample submission, carrying the
# ensemble prediction where one exists for an id listed in test.csv.
pred_df = pd.DataFrame({'id': test_df['id'].astype(int), 'predicted': ensemble_test_pred})

test_ids_set = set(test_ids_df['id'].values)
submission = sample_submission[['id']].merge(pred_df[pred_df['id'].isin(test_ids_set)], on='id', how='left')

# Fill missing values with product means. The lookup is vectorised
# (id -> product_rk -> mean training demand of that product) instead of
# rescanning test_df once per missing row.
overall_mean = train_df['demand'].mean()
product_means = train_df.groupby('product_rk')['demand'].mean().to_dict()
missing = submission['predicted'].isna()
if missing.any():
    id_to_product = pd.Series(
        test_df['product_rk'].values,
        index=test_df['id'].astype(int).values,
    )
    # Keep the first row per id so the mapping index is unique.
    id_to_product = id_to_product[~id_to_product.index.duplicated(keep='first')]
    submission.loc[missing, 'predicted'] = (
        submission.loc[missing, 'id'].map(id_to_product).map(product_means)
    )

# Anything still missing (unknown id or unknown product) gets the overall
# mean training demand.
submission['predicted'] = submission['predicted'].fillna(overall_mean)

# Save submission
submission.to_csv('submission_ensemble.csv', index=False)
print("✅ Submission saved to 'submission_ensemble.csv'")
print(f"📊 Submission shape: {submission.shape}")
print(f"📊 Sample predictions:\n{submission.head(10)}")
