In [23]:

import os
import sys
import gc
import random
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

try:
    import lightgbm as lgb
except ImportError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'lightgbm', '--quiet'])
    import lightgbm as lgb

pd.options.display.max_columns = 120
SEED = 2025


def seed_everything(seed: int = SEED) -> None:
    random.seed(seed)
    np.random.seed(seed)


seed_everything()
DATA_DIR = Path('.')


In [24]:

train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
store = pd.read_csv(DATA_DIR / 'STORE_LOCATION.csv', sep=';')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

print('Train shape:', train.shape)
print('Test shape:', test.shape)
print('Store shape:', store.shape)
print('Sample submission shape:', sample_submission.shape)


Train shape: (35344, 11)
Test shape: (1404, 5)
Store shape: (1208, 29)
Sample submission shape: (1200, 2)


In [25]:

from typing import List


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = [c.lower() for c in out.columns]
    if 'unnamed: 0' in out.columns:
        out = out.rename(columns={'unnamed: 0': 'id'})
    return out


def prepare_store_metadata(raw_store: pd.DataFrame) -> pd.DataFrame:
    store_df = normalize_columns(raw_store)
    time_cols = [c for c in store_df.columns if c.endswith('_dttm')]
    for col in time_cols:
        store_df[col] = pd.to_datetime(store_df[col], format='%d%b%Y:%H:%M:%S', errors='coerce')
        store_df[f'{col}_year'] = store_df[col].dt.year
        store_df[f'{col}_month'] = store_df[col].dt.month
        store_df[f'{col}_isnull'] = store_df[col].isna().astype(np.int8)
    store_df = store_df.drop(columns=time_cols)

    hash_cols = [c for c in store_df.columns if 'hashing' in c]
    for col in hash_cols:
        store_df[col] = pd.factorize(store_df[col], sort=True)[0].astype('int32')
    return store_df


def add_group_lags(df: pd.DataFrame, group_cols: List[str], target_col: str, lags: List[int]) -> pd.DataFrame:
    for lag in lags:
        df[f'{target_col}_lag_{lag}'] = df.groupby(group_cols)[target_col].shift(lag)
    return df


def add_group_rolls(df: pd.DataFrame, group_cols: List[str], target_col: str, windows: List[int]) -> pd.DataFrame:
    for window in windows:
        df[f'{target_col}_roll_mean_{window}'] = (
            df.groupby(group_cols, group_keys=False)[target_col]
            .apply(lambda s: s.shift(1).rolling(window).mean())
        )
    return df


In [26]:

train_df = normalize_columns(train)
test_df = normalize_columns(test)
store_meta = prepare_store_metadata(store)

train_df['dataset'] = 'train'
test_df['dataset'] = 'test'
test_df['demand'] = np.nan

full = pd.concat([train_df, test_df], ignore_index=True, sort=False)
full['period_start_dt'] = pd.to_datetime(full['period_start_dt'], dayfirst=True, errors='coerce')
full = full.merge(store_meta, how='left', on='store_location_rk')

promo_cols = [c for c in ['promo1_flag', 'promo2_flag', 'autorization_flag'] if c in full.columns]
for col in promo_cols:
    full[col] = full[col].fillna(0)

full['num_consultant'] = full['num_consultant'].fillna(0)

full['price_regular'] = full.groupby('product_rk')['price_regular'].transform(lambda s: s.fillna(s.median()))
full['price_after_disc'] = full['price_after_disc'].fillna(full['price_regular'])
full['discount_amount'] = full['price_regular'] - full['price_after_disc']
full['discount_pct'] = (full['discount_amount'] / full['price_regular'].replace(0, np.nan)).fillna(0)
full['price_ratio'] = (full['price_after_disc'] / full['price_regular'].replace(0, np.nan)).fillna(1)


iso_calendar = full['period_start_dt'].dt.isocalendar()
full['weekofyear'] = iso_calendar.week.astype('Int16')
full['weekofyear'] = full['weekofyear'].fillna(-1).astype(int)
full['weekofyear'] = full['weekofyear'].replace(-1, np.nan)
full['month'] = full['period_start_dt'].dt.month
full['quarter'] = full['period_start_dt'].dt.quarter
full['dayofweek'] = full['period_start_dt'].dt.weekday
full['weekofmonth'] = (full['period_start_dt'].dt.day//7)
full['is_month_start'] = full['period_start_dt'].dt.is_month_start.fillna(False).astype(int)
full['is_month_end'] = full['period_start_dt'].dt.is_month_end.fillna(False).astype(int)
full['dayofyear'] = full['period_start_dt'].dt.dayofyear

train_mask = full['dataset'] == 'train'
store_mean = full.loc[train_mask].groupby('store_location_rk')['demand'].mean()
product_mean = full.loc[train_mask].groupby('product_rk')['demand'].mean()
combo_mean = full.loc[train_mask].groupby(['product_rk', 'store_location_rk'])['demand'].mean()
full['store_mean_demand'] = full['store_location_rk'].map(store_mean)
full['product_mean_demand'] = full['product_rk'].map(product_mean)
full['product_store_mean_demand'] = list(zip(full['product_rk'], full['store_location_rk']))
full['product_store_mean_demand'] = full['product_store_mean_demand'].map(combo_mean)

full = full.sort_values(['product_rk', 'store_location_rk', 'period_start_dt']).reset_index(drop=True)
full = add_group_lags(full, ['product_rk', 'store_location_rk'], 'demand', [1, 2, 3, 4, 7, 14, 21, 28])
full = add_group_rolls(full, ['product_rk', 'store_location_rk'], 'demand', [2, 4, 6, 8, 12, 26])
full['demand_ewm_0_3'] = (
    full.groupby(['product_rk', 'store_location_rk'], group_keys=False)['demand']
    .apply(lambda s: s.shift(1).ewm(alpha=0.3, adjust=False).mean())
)
full['demand_ewm_0_8'] = (
    full.groupby(['product_rk', 'store_location_rk'], group_keys=False)['demand']
    .apply(lambda s: s.shift(1).ewm(alpha=0.8, adjust=False).mean())
)
full['days_since_first_record'] = (
    full.groupby(['product_rk', 'store_location_rk'])['period_start_dt']
    .transform(lambda s: (s - s.min()).dt.days)
)

float_cols = full.select_dtypes(include=['float32', 'float64']).columns
full[float_cols] = full[float_cols].astype('float32')
print('Feature matrix shape:', full.shape)


  full['period_start_dt'] = pd.to_datetime(full['period_start_dt'], dayfirst=True, errors='coerce')


Feature matrix shape: (36748, 75)


In [27]:

full.sort_values('period_start_dt').head()


Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,promo1_flag,promo2_flag,price_regular,price_after_disc,num_consultant,autorization_flag,dataset,store_location_lvl_rk4,store_location_lvl_rk3,store_location_lvl_rk2,store_location_lvl_rk1,store_location_adk_hashing,store_location_attrib1_hashing,store_location_attrib2_hashing,store_location_attrib3_hashing,store_location_attrib4_hashing,store_location_attrib5_hashing,store_location_attrib6_hashing,store_location_attrib7_hashing,store_location_attrib8_hashing,store_location_attrib9_hashing,store_location_attrib10_hashing,store_location_attrib11_hashing,store_location_attrib12_hashing,store_location_attrib13_hashing,store_location_attrib14_hashing,store_location_attrib15_hashing,store_location_attrib16_hashing,store_location_attrib17_hashing,store_location_attrib18_hashing,store_location_attrib19_hashing,store_location_attrib20_hashing,store_location_attrib21_hashing,store_open_dttm_year,store_open_dttm_month,store_open_dttm_isnull,store_closure_dttm_year,store_closure_dttm_month,store_closure_dttm_isnull,discount_amount,discount_pct,price_ratio,weekofyear,month,quarter,dayofweek,weekofmonth,is_month_start,is_month_end,dayofyear,store_mean_demand,product_mean_demand,product_store_mean_demand,demand_lag_1,demand_lag_2,demand_lag_3,demand_lag_4,demand_lag_7,demand_lag_14,demand_lag_21,demand_lag_28,demand_roll_mean_2,demand_roll_mean_4,demand_roll_mean_6,demand_roll_mean_8,demand_roll_mean_12,demand_roll_mean_26,demand_ewm_0_3,demand_ewm_0_8,days_since_first_record
0,0,40369,309,2016-12-19,29.0,0.0,0.0,500.0,500.0,0.0,0.0,train,203,203,10,1,0,1,0,16,13,6,1,12,80,1,3,2,2,3,2,2,0,0,0,0,0,0,2018,3,0,,,1,0.0,0.0,1.0,51.0,12.0,4.0,0.0,2.0,0,0,354.0,55.666668,17.451267,68.666664,,,,,,,,,,,,,,,,,0.0
8455,12076,40370,562,2016-12-19,34.0,0.0,0.0,1000.0,1000.0,0.0,0.0,train,36,36,10,1,0,1,0,19,1,6,1,12,9,0,1,1,1,0,2,0,0,0,0,0,0,0,2018,3,0,,,1,0.0,0.0,1.0,51.0,12.0,4.0,0.0,2.0,0,0,354.0,12.556425,23.344059,22.978071,,,,,,,,,,,,,,,,,0.0
28371,17655,46272,862,2016-12-19,17.0,0.0,0.0,223.0,223.0,0.0,0.0,train,54,54,10,1,0,1,0,8,1,6,1,12,55,0,1,1,1,0,2,2,0,0,0,0,0,0,2018,3,0,,,1,0.0,0.0,1.0,51.0,12.0,4.0,0.0,2.0,0,0,354.0,14.818611,6.912034,10.750141,,,,,,,,,,,,,,,,,0.0
8290,11149,40370,557,2016-12-19,54.0,0.0,0.0,1000.0,1000.0,0.0,0.0,train,54,54,10,1,0,1,2,16,1,6,1,12,55,0,1,1,1,0,2,2,0,0,0,0,0,0,2018,3,0,,,1,0.0,0.0,1.0,51.0,12.0,4.0,0.0,2.0,0,0,354.0,16.261204,23.344059,32.703659,,,,,,,,,,,,,,,,,0.0
28536,18584,46272,866,2016-12-19,26.0,0.0,0.0,223.0,223.0,0.0,0.0,train,219,219,10,1,0,1,0,8,55,6,1,12,55,0,1,1,1,0,2,2,0,0,0,0,0,0,2018,3,0,,,1,0.0,0.0,1.0,51.0,12.0,4.0,0.0,2.0,0,0,354.0,10.671259,6.912034,7.851871,,,,,,,,,,,,,,,,,0.0


In [28]:

train_processed = full[(full['dataset'] == 'train') & (full['demand'].notna())].copy()
test_processed = full[full['dataset'] == 'test'].copy()

feature_drop = {'dataset', 'demand', 'period_start_dt'}
if 'id' in feature_drop:
    feature_drop.remove('id')
feature_cols = [c for c in train_processed.columns if c not in feature_drop.union({'id'})]

for df in (train_processed, test_processed):
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)

for col in feature_cols:
    median_value = train_processed[col].median()
    train_processed[col] = train_processed[col].fillna(median_value)
    test_processed[col] = test_processed[col].fillna(median_value)

train_processed[feature_cols] = train_processed[feature_cols].fillna(0)
test_processed[feature_cols] = test_processed[feature_cols].fillna(0)

train_processed = train_processed.sort_values('period_start_dt')
max_date = train_processed['period_start_dt'].max()
val_start = max_date - pd.Timedelta(days=28)
train_mask = train_processed['period_start_dt'] < val_start
valid_mask = ~train_mask

train_data = train_processed.loc[train_mask]
valid_data = train_processed.loc[valid_mask]

print('Train span:', train_data['period_start_dt'].min(), '->', train_data['period_start_dt'].max())
print('Valid span:', valid_data['period_start_dt'].min(), '->', valid_data['period_start_dt'].max())
print('Feature count:', len(feature_cols))


Train span: 2016-12-19 00:00:00 -> 2019-11-25 00:00:00
Valid span: 2019-12-02 00:00:00 -> 2019-12-30 00:00:00
Feature count: 71


In [29]:

lgb_train = lgb.Dataset(train_data[feature_cols], label=train_data['demand'])
lgb_valid = lgb.Dataset(valid_data[feature_cols], label=valid_data['demand'])

params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'learning_rate': 0.03,
    'num_leaves': 256,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 40,
    'lambda_l1': 0.01,
    'lambda_l2': 1.0,
    'seed': SEED,
    'n_jobs': -1,
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid'],
    num_boost_round=5000,
    callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)],
)

valid_pred = model.predict(valid_data[feature_cols], num_iteration=model.best_iteration)
val_mae = mean_absolute_error(valid_data['demand'], valid_pred)
print(f'Validation MAE: {val_mae:.4f}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6000
[LightGBM] [Info] Number of data points in the train set: 34144, number of used features: 48
[LightGBM] [Info] Start training from score 6.000000
Training until validation scores don't improve for 200 rounds
[200]	train's l1: 4.18518	valid's l1: 13.8031
Early stopping, best iteration is:
[1]	train's l1: 9.99981	valid's l1: 6.08988


ValueError: Input contains NaN.

In [None]:

fi = pd.DataFrame({
    'feature': feature_cols,
    'importance_gain': model.feature_importance(importance_type='gain')
}).sort_values('importance_gain', ascending=False)
fi.head(25)


In [None]:

test_pred = model.predict(test_processed[feature_cols], num_iteration=model.best_iteration)
submission = pd.DataFrame({
    'id': test_processed['id'].astype(int),
    'predicted': np.clip(test_pred, 0, None)
})
submission.to_csv('bebebe.csv', index=False)
print('Saved submission.csv with shape:', submission.shape)
submission.head()
