In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
import tqdm
import optuna

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('data/raw/train.csv', parse_dates=['Date'])
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

In [4]:
train['Date'] = pd.to_datetime(train['Date'].apply(lambda x: x[:-2] + '01'), format='%Y-%m-%d')
train['Month'] = train['Date'].dt.to_period('M').astype(str)

In [5]:
monthly_agg = (
    train
    .groupby(['Company_ID', 'Product_ID', 'Month'], as_index=False)
    .agg({'Target': 'sum'})
    .rename(columns={'Target': 'sum_sales'})
    .sort_values(by=['Company_ID', 'Product_ID', 'Month'])
)

In [6]:
d = {'Company_ID': [], 'Product_ID': [], 'Month': []}
ar = monthly_agg['Company_ID'].sort_values().unique()
br = monthly_agg['Product_ID'].sort_values().unique()
cr = monthly_agg['Month'].sort_values().unique()
for company_id in ar:
    for product_id in br:
        for month in cr:
            d['Company_ID'].append(company_id)
            d['Product_ID'].append(product_id)
            d['Month'].append(month)
d = pd.DataFrame(d)
monthly_agg = d.merge(monthly_agg, 'left', on=['Company_ID', 'Product_ID', 'Month']).fillna(0)

In [7]:
monthly_agg_saved = monthly_agg.copy()
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales
0,0,0,2019-01,0.0
1,0,0,2019-02,0.0
2,0,0,2019-03,0.0
3,0,0,2019-04,0.0
4,0,0,2019-05,0.0
...,...,...,...,...
3459115,3,14668,2023-08,0.0
3459116,3,14668,2023-09,0.0
3459117,3,14668,2023-10,0.0
3459118,3,14668,2023-11,0.0


In [8]:
monthly_agg['Month_date'] = pd.to_datetime(monthly_agg['Month'], format='%Y-%m')

n_lags = 15

subset = []
gb = monthly_agg.groupby(['Company_ID','Product_ID'])
for i in range(n_lags + 1):
    monthly_agg[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)
    subset.append(f'prev_sum_sales_{i}')
    if i > 0:
        if i + 1 <= 9:
            feats_to_mean = [f'prev_sum_sales_{j}' for j in range(i + 1)]
            monthly_agg[f'rolling_sum_sales_{i + 1}'] = monthly_agg[feats_to_mean].mean(skipna=False, axis=1)
            subset.append(f'rolling_sum_sales_{i + 1}')
        monthly_agg[f'delta_sum_sales_{i}'] = monthly_agg['prev_sum_sales_0'] - monthly_agg[f'prev_sum_sales_{i}']
        subset.append(f'delta_sum_sales_{i}')

monthly_agg['target'] = gb['sum_sales'].shift(-1)

In [9]:
monthly_agg.dropna(subset=['target'], inplace=True)

In [10]:
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales,Month_date,prev_sum_sales_0,prev_sum_sales_1,rolling_sum_sales_2,delta_sum_sales_1,prev_sum_sales_2,...,delta_sum_sales_11,prev_sum_sales_12,delta_sum_sales_12,prev_sum_sales_13,delta_sum_sales_13,prev_sum_sales_14,delta_sum_sales_14,prev_sum_sales_15,delta_sum_sales_15,target
0,0,0,2019-01,0.0,2019-01-01,0.0,,,,,...,,,,,,,,,,0.0
1,0,0,2019-02,0.0,2019-02-01,0.0,0.0,0.0,0.0,,...,,,,,,,,,,0.0
2,0,0,2019-03,0.0,2019-03-01,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
3,0,0,2019-04,0.0,2019-04-01,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
4,0,0,2019-05,0.0,2019-05-01,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459114,3,14668,2023-07,0.0,2023-07-01,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3459115,3,14668,2023-08,0.0,2023-08-01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3459116,3,14668,2023-09,0.0,2023-09-01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3459117,3,14668,2023-10,0.0,2023-10-01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def predict_next_month(df, model):
    df = df.copy()
    df['Month_date'] = pd.to_datetime(df['Month'], format='%Y-%m')
    df['Month_pred'] = (df['Month_date'] + pd.offsets.MonthBegin(1)).dt.to_period('M').astype(str)
    df = df.sort_values(by='Month_date')

    gb = df.groupby(['Company_ID', 'Product_ID'])
    for i in range(n_lags + 1):
        df[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)
        if i > 0:
            if i + 1 <= 9:
                feats_to_mean = [f'prev_sum_sales_{j}' for j in range(i + 1)]
                df[f'rolling_sum_sales_{i + 1}'] = df[feats_to_mean].mean(skipna=False, axis=1)
            df[f'delta_sum_sales_{i}'] = df['prev_sum_sales_0'] - df[f'prev_sum_sales_{i}']

    df['Target'] = gb['sum_sales'].shift(-1)

    to_predict = df['Target'].isna()
    df.loc[to_predict, 'Target'] = model.predict(df.loc[to_predict, feature_cols + cat_cols]).round()
    df.loc[to_predict, 'Target'] = df.loc[to_predict, 'Target'].apply(lambda x: max(x, 0))
    return df.loc[to_predict, ['Company_ID', 'Product_ID', 'Month_pred', 'Target']]

In [12]:
# Сортируем по Month_date (по возрастанию времени)
monthly_agg.sort_values(by=['Month_date'], inplace=True)

# Пример: берём валидацию на последние 3 месяца, которые у нас есть в данных
all_months = sorted(monthly_agg['Month_date'].unique())
N_valid_months = 3
valid_threshold = all_months[-N_valid_months]  # первая из последних 3
train_data = monthly_agg[monthly_agg['Month_date'] < valid_threshold].sample(frac=1, random_state=42)
valid_data = monthly_agg[monthly_agg['Month_date'] >= valid_threshold]

scale_factor = (monthly_agg_saved[monthly_agg_saved['Month'] == '2023-12']
                .groupby(['Company_ID', 'Product_ID'])['sum_sales']
                .agg(lambda x: int(x.sum() > 0)).sum() * 3) / len(sample_submission)

In [13]:
randids = np.random.choice(monthly_agg['Product_ID'].unique(), 3000)
tune_data = monthly_agg[monthly_agg['Product_ID'].isin(randids)]
tune_train = tune_data[tune_data['Month_date'] < valid_threshold].sample(frac=1, random_state=42)
tune_valid = tune_data[tune_data['Month_date'] >= valid_threshold]

feature_cols = subset
cat_cols = ['Company_ID', 'Product_ID']

X_train_tune = tune_train[tune_train['target'] != 0][feature_cols + cat_cols]
y_train_tune = tune_train[tune_train['target'] != 0]['target']

X_valid_tune = tune_valid[tune_valid['target'] != 0][feature_cols + cat_cols]
y_valid_tune = tune_valid[tune_valid['target'] != 0]['target']

cat_features_indices = [X_train_tune.columns.get_loc(c) for c in cat_cols]

mag = monthly_agg_saved[monthly_agg_saved['Product_ID'].isin(randids)].copy()
valid_saved = mag[mag['sum_sales'] != 0][pd.to_datetime(mag['Month'], format='%Y-%m') >= valid_threshold].copy()
mag_saved = mag[mag['sum_sales'] != 0][pd.to_datetime(mag['Month'], format='%Y-%m') < valid_threshold].copy()

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 1800),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'depth': trial.suggest_int('depth', 4, 14),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10),
        'cat_features': trial.suggest_categorical('cat_features', [cat_features_indices]),
        'loss_function': trial.suggest_categorical('loss_function', ['MAE']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['MAE']),
        'random_seed': trial.suggest_categorical('random_seed', [42]),
        'verbose': 0
    }

    model = CatBoostRegressor(**params)

    model.fit(
        X_train_tune,
        y_train_tune,
        eval_set=(X_valid_tune, y_valid_tune),
        early_stopping_rounds=100
    )

    mag = mag_saved.copy()
    valid_data = valid_saved.copy()

    preds = []
    for i in range(N_valid_months):
        p = predict_next_month(mag, model)
        p.columns = ['Company_ID', 'Product_ID', 'Month', 'sum_sales']
        preds.append(p)
        mag = pd.concat([mag, p], axis=0)

    preds = pd.concat(preds, axis=0)
    merged = valid_data.merge(preds, on=['Company_ID', 'Product_ID', 'Month'], suffixes=('_actual', '_pred'))
    raw_mae = mean_absolute_error(merged['sum_sales_actual'], merged['sum_sales_pred'])

    return raw_mae * scale_factor

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print("Лучшие найденные параметры:")
print(study.best_params)

[I 2025-02-03 20:36:55,224] A new study created in memory with name: no-name-3051029e-d416-400b-837b-70141c26376e
[I 2025-02-03 20:37:28,720] Trial 0 finished with value: 12.096878883021214 and parameters: {'iterations': 1603, 'learning_rate': 0.035405191410459357, 'depth': 10, 'subsample': 0.7435675778029962, 'colsample_bylevel': 0.7622794644536715, 'l2_leaf_reg': 7.650134745917095, 'cat_features': [39, 40], 'loss_function': 'MAE', 'eval_metric': 'MAE', 'random_seed': 42}. Best is trial 0 with value: 12.096878883021214.
[I 2025-02-03 20:44:30,769] Trial 1 finished with value: 12.22308022737863 and parameters: {'iterations': 1689, 'learning_rate': 0.00721573182779872, 'depth': 14, 'subsample': 0.6556242916554862, 'colsample_bylevel': 0.616115499191513, 'l2_leaf_reg': 0.8980111971416116, 'cat_features': [39, 40], 'loss_function': 'MAE', 'eval_metric': 'MAE', 'random_seed': 42}. Best is trial 0 with value: 12.096878883021214.
[I 2025-02-03 20:45:11,052] Trial 2 finished with value: 13.65

Лучшие найденные параметры:
{'iterations': 1668, 'learning_rate': 0.028153280496855355, 'depth': 12, 'subsample': 0.8835510488789271, 'colsample_bylevel': 0.9520646709353786, 'l2_leaf_reg': 4.069350024709821, 'cat_features': [39, 40], 'loss_function': 'MAE', 'eval_metric': 'MAE', 'random_seed': 42}


In [14]:
cb_params = study.best_params
cb_params = {
    'iterations': 1668,
    'learning_rate': 0.028153280496855355,
    'depth': 12,
    'subsample': 0.8835510488789271,
    'colsample_bylevel': 0.9520646709353786,
    'l2_leaf_reg': 4.069350024709821,
    'cat_features': [39, 40],
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42
}
cb_params

{'iterations': 1668,
 'learning_rate': 0.028153280496855355,
 'depth': 12,
 'subsample': 0.8835510488789271,
 'colsample_bylevel': 0.9520646709353786,
 'l2_leaf_reg': 4.069350024709821,
 'cat_features': [39, 40],
 'loss_function': 'MAE',
 'eval_metric': 'MAE',
 'random_seed': 42}

In [15]:
feature_cols = subset
cat_cols = ['Company_ID', 'Product_ID']

X_train = train_data[train_data['target'] != 0][feature_cols + cat_cols]
y_train = train_data[train_data['target'] != 0]['target']

X_valid = valid_data[valid_data['target'] != 0][feature_cols + cat_cols]
y_valid = valid_data[valid_data['target'] != 0]['target']

# Укажем, какие столбцы считать категориальными для CatBoost
# (CatBoost умеет работать с ними напрямую, выучивая таргет-кодирование)
cat_features_indices = [X_train.columns.get_loc(c) for c in cat_cols]  # индексы категориальных фич

In [16]:
X_train

Unnamed: 0,prev_sum_sales_0,prev_sum_sales_1,rolling_sum_sales_2,delta_sum_sales_1,prev_sum_sales_2,rolling_sum_sales_3,delta_sum_sales_2,prev_sum_sales_3,rolling_sum_sales_4,delta_sum_sales_3,...,prev_sum_sales_12,delta_sum_sales_12,prev_sum_sales_13,delta_sum_sales_13,prev_sum_sales_14,delta_sum_sales_14,prev_sum_sales_15,delta_sum_sales_15,Company_ID,Product_ID
29490,0.0,2.0,1.0,-2.0,0.0,0.666667,0.0,0.0,0.50,0.0,...,2.0,-2.0,6.0,-6.0,3.0,-3.0,0.0,0.0,0,499
1229635,1.0,1.0,1.0,0.0,0.0,0.666667,1.0,2.0,1.00,-1.0,...,0.0,1.0,2.0,-1.0,0.0,1.0,2.0,-1.0,1,6182
1560952,3.0,3.0,3.0,0.0,2.0,2.666667,1.0,2.0,2.50,1.0,...,0.0,3.0,0.0,3.0,0.0,3.0,2.0,1.0,1,11802
1724800,1.0,1.0,1.0,0.0,0.0,0.666667,1.0,0.0,0.50,1.0,...,0.0,1.0,2.0,-1.0,1.0,0.0,0.0,1.0,1,14588
3174609,1.0,0.0,0.5,1.0,0.0,0.333333,1.0,1.0,0.50,0.0,...,,,,,,,,,3,9840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2970545,3.0,2.0,2.5,1.0,1.0,2.000000,2.0,0.0,1.50,3.0,...,,,,,,,,,3,6375
2099764,7.0,6.0,6.5,1.0,25.0,12.666667,-18.0,18.0,14.00,-11.0,...,,,,,,,,,2,6274
1385092,1.0,0.0,0.5,1.0,0.0,0.333333,1.0,0.0,0.25,1.0,...,0.0,1.0,0.0,1.0,125.0,-124.0,2.0,-1.0,1,8828
2595552,7.0,3.0,5.0,4.0,0.0,3.333333,7.0,7.0,4.25,0.0,...,3.0,4.0,,,,,,,3,21


In [17]:
model = CatBoostRegressor(**cb_params, verbose=100)

model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=100
)

0:	learn: 43.5551200	test: 48.2706738	best: 48.2706738 (0)	total: 581ms	remaining: 16m 8s
100:	learn: 23.8281971	test: 28.1906329	best: 28.1906329 (100)	total: 35.1s	remaining: 9m 5s
200:	learn: 21.9938100	test: 25.7978374	best: 25.7978374 (200)	total: 1m 6s	remaining: 8m 8s
300:	learn: 21.5213751	test: 25.2003152	best: 25.2003152 (300)	total: 1m 39s	remaining: 7m 30s
400:	learn: 21.3687896	test: 25.0557608	best: 25.0557608 (400)	total: 2m 13s	remaining: 7m 2s
500:	learn: 21.1773561	test: 24.8895871	best: 24.8895871 (500)	total: 2m 46s	remaining: 6m 26s
600:	learn: 20.5455543	test: 24.2485588	best: 24.2452404 (597)	total: 3m 14s	remaining: 5m 45s
700:	learn: 20.1381180	test: 24.0674396	best: 24.0667661 (698)	total: 3m 41s	remaining: 5m 4s
800:	learn: 19.5450196	test: 23.8708566	best: 23.8708566 (800)	total: 4m 9s	remaining: 4m 30s
900:	learn: 18.9912225	test: 23.7901672	best: 23.7774966 (890)	total: 4m 38s	remaining: 3m 57s
1000:	learn: 18.2206122	test: 23.6669993	best: 23.6669993 (100

<catboost.core.CatBoostRegressor at 0x7ad859547910>

In [18]:
y_pred_valid = model.predict(X_valid).round()
mae_valid = mean_absolute_error(y_valid, y_pred_valid)
print("Raw MAE (valid):    ", mae_valid)
print("Scaled MAE (valid): ", mae_valid * scale_factor)

Raw MAE (valid):     23.513392763940793
Scaled MAE (valid):  9.58453272708063


In [19]:
mag = monthly_agg_saved[monthly_agg_saved['sum_sales'] != 0][pd.to_datetime(monthly_agg_saved['Month'], format='%Y-%m') < valid_threshold].copy()
preds = []
for i in range(N_valid_months):
    p = predict_next_month(mag, model)
    p.columns = ['Company_ID', 'Product_ID', 'Month', 'sum_sales']
    preds.append(p)
    mag = pd.concat([mag, p], axis=0)

preds = pd.concat(preds, axis=0)
valid_data = monthly_agg_saved[monthly_agg_saved['sum_sales'] != 0][pd.to_datetime(monthly_agg_saved['Month'], format='%Y-%m') >= valid_threshold].copy()
merged = valid_data.merge(preds, on=['Company_ID', 'Product_ID', 'Month'], suffixes=('_actual', '_pred'))
raw_mae = mean_absolute_error(merged['sum_sales_actual'], merged['sum_sales_pred'])
print("Рекурсивный прогноз - raw MAE (valid):    ", raw_mae)
print("Рекурсивный прогноз - scaled MAE (valid): ", raw_mae * scale_factor)

Рекурсивный прогноз - raw MAE (valid):     27.221712381440657
Рекурсивный прогноз - scaled MAE (valid):  11.096118532380032


In [20]:
full_train = monthly_agg.dropna(subset=['target']).sample(frac=1, random_state=42)
X_full = full_train[full_train['target'] != 0][feature_cols + cat_cols]
y_full = full_train[full_train['target'] != 0]['target']

final_model = CatBoostRegressor(**cb_params, verbose=100)

final_model.fit(
    X_full,
    y_full
)

0:	learn: 43.8930536	total: 435ms	remaining: 12m 5s
100:	learn: 24.0225650	total: 36.7s	remaining: 9m 29s
200:	learn: 22.1550106	total: 1m 9s	remaining: 8m 28s
300:	learn: 21.7281256	total: 1m 43s	remaining: 7m 51s
400:	learn: 21.5870249	total: 2m 14s	remaining: 7m 4s
500:	learn: 21.4904733	total: 2m 45s	remaining: 6m 24s
600:	learn: 20.2740573	total: 3m 16s	remaining: 5m 48s
700:	learn: 19.5676879	total: 3m 46s	remaining: 5m 12s
800:	learn: 18.8215251	total: 4m 16s	remaining: 4m 37s
900:	learn: 18.3008141	total: 4m 45s	remaining: 4m 3s
1000:	learn: 17.5675474	total: 5m 14s	remaining: 3m 29s
1100:	learn: 17.0164253	total: 5m 43s	remaining: 2m 57s
1200:	learn: 16.5967550	total: 6m 12s	remaining: 2m 24s
1300:	learn: 16.2058216	total: 6m 41s	remaining: 1m 53s
1400:	learn: 15.7450479	total: 7m 10s	remaining: 1m 22s
1500:	learn: 15.3417928	total: 7m 38s	remaining: 51s
1600:	learn: 15.1850093	total: 8m 3s	remaining: 20.2s
1667:	learn: 15.0546227	total: 8m 19s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7ad85a80ded0>

In [21]:
sample_submission['Company_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[0]).astype(int)
sample_submission['Product_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[1]).astype(int)
sample_submission['Month_pred'] = sample_submission['Id'].apply(lambda x: x.split('_')[2]).astype(str)
sample_submission

Unnamed: 0,Id,Target,Company_ID,Product_ID,Month_pred
0,0_1_2024-01,0,0,1,2024-01
1,0_2_2024-01,0,0,2,2024-01
2,0_3_2024-01,0,0,3,2024-01
3,0_5_2024-01,0,0,5,2024-01
4,0_6_2024-01,0,0,6,2024-01
...,...,...,...,...,...
113851,3_14664_2024-03,0,3,14664,2024-03
113852,3_14665_2024-03,0,3,14665,2024-03
113853,3_14666_2024-03,0,3,14666,2024-03
113854,3_14667_2024-03,0,3,14667,2024-03


In [22]:
preds = []
mag = monthly_agg_saved.copy()
for i in range(3):
    p = predict_next_month(mag, final_model)
    preds.append(p.copy())
    p.columns = ['Company_ID', 'Product_ID', 'Month', 'sum_sales']
    mag = pd.concat([mag, p], axis=0)
preds = pd.concat(preds, axis=0)
sample_submission = sample_submission.drop(columns=['Target'], axis=1).merge(preds, 'left', on=['Company_ID', 'Product_ID', 'Month_pred'])

In [23]:
to_mult = monthly_agg_saved[monthly_agg_saved['Month'] == '2023-12'].groupby(['Company_ID', 'Product_ID'])['sum_sales'].agg(lambda x: int(x.sum() > 0)).reset_index()
for i in tqdm.tqdm(to_mult.index):
    sample_submission.loc[(sample_submission['Company_ID'] == to_mult.loc[i, 'Company_ID']) &
                          (sample_submission['Product_ID'] == to_mult.loc[i, 'Product_ID']), 'Target'] *= to_mult.loc[i, 'sum_sales']

100%|██████████| 57652/57652 [00:33<00:00, 1698.47it/s]


In [24]:
sample_submission[['Id', 'Target']]

Unnamed: 0,Id,Target
0,0_1_2024-01,6.0
1,0_2_2024-01,0.0
2,0_3_2024-01,0.0
3,0_5_2024-01,2.0
4,0_6_2024-01,3.0
...,...,...
113851,3_14664_2024-03,0.0
113852,3_14665_2024-03,199.0
113853,3_14666_2024-03,0.0
113854,3_14667_2024-03,2.0


In [25]:
sample_submission[['Id', 'Target']].to_csv('data/submissions/bruh.csv', index=False)