In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor
import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('data/raw/train.csv', parse_dates=['Date'])
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

In [4]:
train['Date'] = pd.to_datetime(train['Date'].apply(lambda x: x[:-2] + '01'), format='%Y-%m-%d')
train['Month'] = train['Date'].dt.to_period('M').astype(str)

In [5]:
monthly_agg = (
    train
    .groupby(['Company_ID', 'Product_ID', 'Month'], as_index=False)
    .agg({'Target': 'sum'})
    .rename(columns={'Target': 'sum_sales'})
    .sort_values(by=['Company_ID', 'Product_ID', 'Month'])
)

In [6]:
d = {'Company_ID': [], 'Product_ID': [], 'Month': []}
ar = monthly_agg['Company_ID'].sort_values().unique()
br = monthly_agg['Product_ID'].sort_values().unique()
cr = monthly_agg['Month'].sort_values().unique()
for company_id in ar:
    for product_id in br:
        for month in cr:
            d['Company_ID'].append(company_id)
            d['Product_ID'].append(product_id)
            d['Month'].append(month)
d = pd.DataFrame(d)
monthly_agg = d.merge(monthly_agg, 'left', on=['Company_ID', 'Product_ID', 'Month']).fillna(0)

In [7]:
monthly_agg_saved = monthly_agg.copy()
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales
0,0,0,2019-01,0.0
1,0,0,2019-02,0.0
2,0,0,2019-03,0.0
3,0,0,2019-04,0.0
4,0,0,2019-05,0.0
...,...,...,...,...
3459115,3,14668,2023-08,0.0
3459116,3,14668,2023-09,0.0
3459117,3,14668,2023-10,0.0
3459118,3,14668,2023-11,0.0


In [8]:
monthly_agg['Month_date'] = pd.to_datetime(monthly_agg['Month'], format='%Y-%m')

n_lags = 15

subset = []
gb = monthly_agg.groupby(['Company_ID','Product_ID'])
for i in range(n_lags + 1):
    monthly_agg[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)
    subset.append(f'prev_sum_sales_{i}')
    if i > 0:
        feats_to_mean = [f'prev_sum_sales_{j}' for j in range(i + 1)]
        monthly_agg[f'rolling_sum_sales_{i + 1}'] = monthly_agg[feats_to_mean].mean(skipna=False, axis=1)
        subset.append(f'rolling_sum_sales_{i + 1}')

monthly_agg['target'] = gb['sum_sales'].shift(-1)

In [9]:
monthly_agg.dropna(subset=['target'], inplace=True)

In [10]:
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales,Month_date,prev_sum_sales_0,prev_sum_sales_1,rolling_sum_sales_2,prev_sum_sales_2,rolling_sum_sales_3,...,rolling_sum_sales_12,prev_sum_sales_12,rolling_sum_sales_13,prev_sum_sales_13,rolling_sum_sales_14,prev_sum_sales_14,rolling_sum_sales_15,prev_sum_sales_15,rolling_sum_sales_16,target
0,0,0,2019-01,0.0,2019-01-01,0.0,,,,,...,,,,,,,,,,0.0
1,0,0,2019-02,0.0,2019-02-01,0.0,0.0,0.0,,,...,,,,,,,,,,0.0
2,0,0,2019-03,0.0,2019-03-01,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
3,0,0,2019-04,0.0,2019-04-01,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
4,0,0,2019-05,0.0,2019-05-01,0.0,0.0,0.0,0.0,0.000000,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459114,3,14668,2023-07,0.0,2023-07-01,0.0,0.0,0.0,5.0,1.666667,...,0.416667,0.0,0.384615,0.0,0.357143,0.0,0.333333,0.0,0.3125,0.0
3459115,3,14668,2023-08,0.0,2023-08-01,0.0,0.0,0.0,0.0,0.000000,...,0.416667,0.0,0.384615,0.0,0.357143,0.0,0.333333,0.0,0.3125,0.0
3459116,3,14668,2023-09,0.0,2023-09-01,0.0,0.0,0.0,0.0,0.000000,...,0.416667,0.0,0.384615,0.0,0.357143,0.0,0.333333,0.0,0.3125,0.0
3459117,3,14668,2023-10,0.0,2023-10-01,0.0,0.0,0.0,0.0,0.000000,...,0.416667,0.0,0.384615,0.0,0.357143,0.0,0.333333,0.0,0.3125,0.0


In [11]:
# Сортируем по Month_date (по возрастанию времени)
monthly_agg.sort_values(by=['Month_date'], inplace=True)

# Пример: берём валидацию на последние 3 месяца, которые у нас есть в данных
all_months = sorted(monthly_agg['Month_date'].unique())
N_valid_months = 3
valid_threshold = all_months[-N_valid_months]  # первая из последних 3
train_data = monthly_agg[monthly_agg['Month_date'] < valid_threshold]
valid_data = monthly_agg[monthly_agg['Month_date'] >= valid_threshold]

In [12]:
feature_cols = subset
cat_cols = ['Company_ID', 'Product_ID']

X_train = train_data[train_data['target'] != 0][feature_cols + cat_cols]
y_train = train_data[train_data['target'] != 0]['target']

X_valid = valid_data[valid_data['target'] != 0][feature_cols + cat_cols]
y_valid = valid_data[valid_data['target'] != 0]['target']

# Укажем, какие столбцы считать категориальными для CatBoost
# (CatBoost умеет работать с ними напрямую, выучивая таргет-кодирование)
cat_features_indices = [X_train.columns.get_loc(c) for c in cat_cols]  # индексы категориальных фич

In [13]:
X_train

Unnamed: 0,prev_sum_sales_0,prev_sum_sales_1,rolling_sum_sales_2,prev_sum_sales_2,rolling_sum_sales_3,prev_sum_sales_3,rolling_sum_sales_4,prev_sum_sales_4,rolling_sum_sales_5,prev_sum_sales_5,...,prev_sum_sales_12,rolling_sum_sales_13,prev_sum_sales_13,rolling_sum_sales_14,prev_sum_sales_14,rolling_sum_sales_15,prev_sum_sales_15,rolling_sum_sales_16,Company_ID,Product_ID
168600,1.0,,,,,,,,,,...,,,,,,,,,0,2856
2148720,1.0,,,,,,,,,,...,,,,,,,,,2,7101
2148840,1.0,,,,,,,,,,...,,,,,,,,,2,7103
2148900,2.0,,,,,,,,,,...,,,,,,,,,2,7104
168540,0.0,,,,,,,,,,...,,,,,,,,,0,2855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105235,17.0,3.0,10.0,10.0,10.000000,4.0,8.50,24.0,11.6,0.0,...,0.0,11.076923,2.0,10.428571,0.0,9.733333,0.0,9.1250,0,1785
838735,13.0,35.0,24.0,15.0,21.000000,4.0,16.75,20.0,17.4,5.0,...,4.0,10.076923,33.0,11.714286,2.0,11.066667,0.0,10.3750,0,14226
1627315,0.0,1.0,0.5,2.0,1.000000,0.0,0.75,0.0,0.6,2.0,...,0.0,1.538462,2.0,1.571429,12.0,2.266667,0.0,2.1250,1,12927
909355,100.0,65.0,82.5,2815.0,993.333333,915.0,973.75,350.0,849.0,795.0,...,100.0,715.000000,235.0,680.714286,5650.0,1012.000000,1290.0,1029.3750,1,751


In [14]:
model = CatBoostRegressor(
    iterations=1200,
    learning_rate=0.025,
    depth=6,
    random_seed=42,
    cat_features=cat_features_indices,
    eval_metric='MAE',
    verbose=100
)

model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=100
)


0:	learn: 66.4185858	test: 70.8516567	best: 70.8516567 (0)	total: 322ms	remaining: 6m 25s
100:	learn: 26.6596096	test: 29.4855193	best: 29.4855193 (100)	total: 8.9s	remaining: 1m 36s
200:	learn: 23.6764751	test: 26.1237290	best: 26.1237290 (200)	total: 20s	remaining: 1m 39s
300:	learn: 23.0530966	test: 25.7401158	best: 25.7245345 (291)	total: 28.1s	remaining: 1m 23s
400:	learn: 22.6834373	test: 25.5774586	best: 25.5745359 (399)	total: 36.3s	remaining: 1m 12s
500:	learn: 22.3816186	test: 25.4380946	best: 25.4380946 (500)	total: 45.8s	remaining: 1m 3s
600:	learn: 22.1240123	test: 25.3404477	best: 25.3306723 (595)	total: 55.7s	remaining: 55.5s
700:	learn: 21.9233216	test: 25.2825711	best: 25.2780687 (698)	total: 1m 3s	remaining: 45.6s
800:	learn: 21.7294103	test: 25.2111370	best: 25.2039603 (787)	total: 1m 12s	remaining: 36.1s
900:	learn: 21.5765538	test: 25.1754882	best: 25.1749913 (897)	total: 1m 21s	remaining: 26.9s
1000:	learn: 21.4325785	test: 25.1318214	best: 25.1318214 (1000)	total

<catboost.core.CatBoostRegressor at 0x760f0c5fd790>

In [15]:
from sklearn.metrics import mean_absolute_error
y_pred_valid = model.predict(X_valid)
mae_valid = mean_absolute_error(y_valid, y_pred_valid)
print("MAE на валидации:", mae_valid)

MAE на валидации: 25.042196022707554


In [17]:
full_train = monthly_agg.dropna(subset=['target'])
X_full = full_train[full_train['target'] != 0][feature_cols + cat_cols]
y_full = full_train[full_train['target'] != 0]['target']

final_model = CatBoostRegressor(
    iterations=1200,
    learning_rate=0.025,
    depth=6,
    random_seed=42,
    cat_features=cat_features_indices,
    eval_metric='MAE',
    verbose=100
)

final_model.fit(
    X_full,
    y_full
)

0:	learn: 66.8729060	total: 66.3ms	remaining: 1m 19s
100:	learn: 26.8191450	total: 9.12s	remaining: 1m 39s
200:	learn: 23.7746674	total: 18.1s	remaining: 1m 30s
300:	learn: 23.0402441	total: 25.6s	remaining: 1m 16s
400:	learn: 22.6848189	total: 34.7s	remaining: 1m 9s
500:	learn: 22.4006718	total: 44.2s	remaining: 1m 1s
600:	learn: 22.1717536	total: 55s	remaining: 54.8s
700:	learn: 21.9952102	total: 1m 4s	remaining: 46.2s
800:	learn: 21.8176105	total: 1m 15s	remaining: 37.7s
900:	learn: 21.6507828	total: 1m 26s	remaining: 28.8s
1000:	learn: 21.5046625	total: 1m 39s	remaining: 19.7s
1100:	learn: 21.3819727	total: 1m 49s	remaining: 9.82s
1199:	learn: 21.2596023	total: 1m 58s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x760f0c5c4e50>

In [18]:
def predict_next_month(df):
    df = df.copy()
    df['Month_date'] = pd.to_datetime(df['Month'], format='%Y-%m')
    df['Month_pred'] = (df['Month_date'] + pd.offsets.MonthBegin(1)).dt.to_period('M').astype(str)
    df = df.sort_values(by='Month_date')

    gb = df.groupby(['Company_ID','Product_ID'])
    for i in range(n_lags + 1):
        df[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)
        if i > 0:
            feats_to_mean = [f'prev_sum_sales_{j}' for j in range(i + 1)]
            df[f'rolling_sum_sales_{i + 1}'] = df[feats_to_mean].mean(skipna=False, axis=1)

    df['Target'] = gb['sum_sales'].shift(-1)

    to_predict = df['Target'].isna()
    df.loc[to_predict, 'Target'] = model.predict(df.loc[to_predict, feature_cols + cat_cols])
    return df.loc[to_predict, ['Company_ID', 'Product_ID', 'Month_pred', 'Target']]

In [19]:
sample_submission['Company_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[0]).astype(int)
sample_submission['Product_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[1]).astype(int)
sample_submission['Month_pred'] = sample_submission['Id'].apply(lambda x: x.split('_')[2]).astype(str)
sample_submission

Unnamed: 0,Id,Target,Company_ID,Product_ID,Month_pred
0,0_1_2024-01,0,0,1,2024-01
1,0_2_2024-01,0,0,2,2024-01
2,0_3_2024-01,0,0,3,2024-01
3,0_5_2024-01,0,0,5,2024-01
4,0_6_2024-01,0,0,6,2024-01
...,...,...,...,...,...
113851,3_14664_2024-03,0,3,14664,2024-03
113852,3_14665_2024-03,0,3,14665,2024-03
113853,3_14666_2024-03,0,3,14666,2024-03
113854,3_14667_2024-03,0,3,14667,2024-03


In [20]:
preds = []
mag = monthly_agg_saved.copy()
for i in range(3):
    p = predict_next_month(mag)
    preds.append(p.copy())

    p.columns = ['Company_ID', 'Product_ID', 'Month', 'sum_sales']
    mag = pd.concat([mag, p], axis=0)
preds = pd.concat(preds, axis=0)
sample_submission = sample_submission.drop(columns=['Target'], axis=1).merge(preds, 'left', on=['Company_ID', 'Product_ID', 'Month_pred'])

In [21]:
to_mult = monthly_agg_saved[monthly_agg_saved['Month'] == '2023-12'].groupby(['Company_ID', 'Product_ID'])['sum_sales'].agg(lambda x: int(x.sum() > 0)).reset_index()
for i in tqdm.tqdm(to_mult.index):
    sample_submission.loc[(sample_submission['Company_ID'] == to_mult.loc[i, 'Company_ID']) &
                          (sample_submission['Product_ID'] == to_mult.loc[i, 'Product_ID']), 'Target'] *= to_mult.loc[i, 'sum_sales']

100%|██████████| 57652/57652 [00:34<00:00, 1692.20it/s]


In [22]:
sample_submission[['Id', 'Target']]

Unnamed: 0,Id,Target
0,0_1_2024-01,8.035541
1,0_2_2024-01,0.000000
2,0_3_2024-01,0.000000
3,0_5_2024-01,1.823280
4,0_6_2024-01,4.035948
...,...,...
113851,3_14664_2024-03,0.000000
113852,3_14665_2024-03,185.074151
113853,3_14666_2024-03,0.000000
113854,3_14667_2024-03,0.105386


In [23]:
sample_submission[['Id', 'Target']].to_csv('data/submissions/bruh.csv', index=False)