In [106]:
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor
import tqdm

In [107]:
import warnings
warnings.filterwarnings('ignore')

In [108]:
train = pd.read_csv('data/raw/train.csv', parse_dates=['Date'])
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

In [109]:
train['Date'] = pd.to_datetime(train['Date'].apply(lambda x: x[:-2] + '01'), format='%Y-%m-%d')
train['Month'] = train['Date'].dt.to_period('M').astype(str)

In [110]:
monthly_agg = (
    train
    .groupby(['Company_ID', 'Product_ID', 'Month'], as_index=False)
    .agg({'Target': 'sum'})
    .rename(columns={'Target': 'sum_sales'})
    .sort_values(by=['Company_ID', 'Product_ID', 'Month'])
)
monthly_agg_saved = monthly_agg.copy()
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales
0,0,1,2019-05,2
1,0,1,2019-06,13
2,0,1,2019-07,1
3,0,1,2019-09,30
4,0,1,2019-11,1
...,...,...,...,...
862331,3,14668,2021-03,1
862332,3,14668,2021-04,1
862333,3,14668,2021-05,1
862334,3,14668,2021-06,1


In [111]:
monthly_agg['Month_date'] = pd.to_datetime(monthly_agg['Month'], format='%Y-%m')

n_lags = 3

subset = []
gb = monthly_agg.groupby(['Company_ID','Product_ID'])
for i in range(n_lags + 1):
    monthly_agg[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)
    subset.append(f'prev_sum_sales_{i}')

monthly_agg['target'] = gb['sum_sales'].shift(-1)

In [112]:
# Убираем строки, где lag или target недоступны
monthly_agg.dropna(subset=['target'], inplace=True)

In [113]:
monthly_agg

Unnamed: 0,Company_ID,Product_ID,Month,sum_sales,Month_date,prev_sum_sales_0,prev_sum_sales_1,prev_sum_sales_2,prev_sum_sales_3,target
0,0,1,2019-05,2,2019-05-01,2,,,,13.0
1,0,1,2019-06,13,2019-06-01,13,2.0,,,1.0
2,0,1,2019-07,1,2019-07-01,1,13.0,2.0,,30.0
3,0,1,2019-09,30,2019-09-01,30,1.0,13.0,2.0,1.0
4,0,1,2019-11,1,2019-11-01,1,30.0,1.0,13.0,4.0
...,...,...,...,...,...,...,...,...,...,...
862330,3,14668,2021-01,1,2021-01-01,1,1.0,1.0,14.0,1.0
862331,3,14668,2021-03,1,2021-03-01,1,1.0,1.0,1.0,1.0
862332,3,14668,2021-04,1,2021-04-01,1,1.0,1.0,1.0,1.0
862333,3,14668,2021-05,1,2021-05-01,1,1.0,1.0,1.0,1.0


In [114]:
# Сортируем по Month_date (по возрастанию времени)
monthly_agg.sort_values(by=['Month_date'], inplace=True)

# Пример: берём валидацию на последние 3 месяца, которые у нас есть в данных
all_months = sorted(monthly_agg['Month_date'].unique())
N_valid_months = 3
valid_threshold = all_months[-N_valid_months]  # первая из последних 3
train_data = monthly_agg[monthly_agg['Month_date'] < valid_threshold]
valid_data = monthly_agg[monthly_agg['Month_date'] >= valid_threshold]

In [115]:
feature_cols = subset
cat_cols = ['Company_ID', 'Product_ID']

X_train = train_data[feature_cols + cat_cols]
y_train = train_data['target']

X_valid = valid_data[feature_cols + cat_cols]
y_valid = valid_data['target']

# Укажем, какие столбцы считать категориальными для CatBoost
# (CatBoost умеет работать с ними напрямую, выучивая таргет-кодирование)
cat_features_indices = [X_train.columns.get_loc(c) for c in cat_cols]  # индексы категориальных фич


In [116]:
X_train

Unnamed: 0,prev_sum_sales_0,prev_sum_sales_1,prev_sum_sales_2,prev_sum_sales_3,Company_ID,Product_ID
407392,11,,,,2,4778
320744,1,,,,1,12248
503616,3,,,,2,11979
656968,3,,,,3,5344
183447,4,,,,0,13884
...,...,...,...,...,...,...
377466,200,20.0,400.0,20.0,2,2500
500037,4,1.0,5.0,3.0,2,11735
141922,12,13.0,20.0,18.0,0,10807
811333,43,44.0,49.0,47.0,3,12262


In [117]:
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    random_seed=42,
    cat_features=cat_features_indices,
    eval_metric='MAE',
    verbose=100
)

model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=100
)


0:	learn: 66.7292668	test: 76.2200291	best: 76.2200291 (0)	total: 64ms	remaining: 1m 3s
100:	learn: 25.6072956	test: 32.9161888	best: 32.9161888 (100)	total: 7.57s	remaining: 1m 7s
200:	learn: 23.5996838	test: 30.8658325	best: 30.8658325 (200)	total: 14.3s	remaining: 56.7s
300:	learn: 23.2363765	test: 30.3804563	best: 30.3804563 (300)	total: 23.2s	remaining: 53.9s
400:	learn: 22.9780556	test: 30.1042413	best: 30.1042413 (400)	total: 30.8s	remaining: 45.9s
500:	learn: 22.7827339	test: 29.9306337	best: 29.9306337 (500)	total: 37.8s	remaining: 37.6s
600:	learn: 22.5809005	test: 29.7885368	best: 29.7885368 (600)	total: 45.5s	remaining: 30.2s
700:	learn: 22.3886782	test: 29.6150869	best: 29.6150869 (700)	total: 53.4s	remaining: 22.8s
800:	learn: 22.2314787	test: 29.5090691	best: 29.5086052 (798)	total: 1m 2s	remaining: 15.4s
900:	learn: 22.0894504	test: 29.4220126	best: 29.4202676 (897)	total: 1m 10s	remaining: 7.75s
999:	learn: 21.9623086	test: 29.3396533	best: 29.3396533 (999)	total: 1m 1

<catboost.core.CatBoostRegressor at 0x7b742bbee150>

In [118]:
from sklearn.metrics import mean_absolute_error
y_pred_valid = model.predict(X_valid)
mae_valid = mean_absolute_error(y_valid, y_pred_valid)
print("MAE на валидации:", mae_valid)

MAE на валидации: 29.339654276747442


In [119]:
full_train = monthly_agg.dropna(subset=['target'])
X_full = full_train[feature_cols + cat_cols]
y_full = full_train['target']

final_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    random_seed=42,
    cat_features=cat_features_indices,
    eval_metric='MAE',
    verbose=100
)

final_model.fit(
    X_full,
    y_full
)

0:	learn: 67.6297272	total: 76.5ms	remaining: 1m 16s
100:	learn: 25.9901297	total: 8.93s	remaining: 1m 19s
200:	learn: 23.9328398	total: 16.6s	remaining: 1m 6s
300:	learn: 23.5737622	total: 24.4s	remaining: 56.7s
400:	learn: 23.3087658	total: 37.1s	remaining: 55.4s
500:	learn: 23.0673041	total: 45.5s	remaining: 45.4s
600:	learn: 22.8583145	total: 52.9s	remaining: 35.1s
700:	learn: 22.6363784	total: 1m 3s	remaining: 27s
800:	learn: 22.4666846	total: 1m 12s	remaining: 18.1s
900:	learn: 22.3178171	total: 1m 23s	remaining: 9.17s
999:	learn: 22.2040280	total: 1m 34s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7b74247b65d0>

In [120]:
def predict_next_month(company_id, product_id, df):
    df = df.copy()
    df['Month_date'] = pd.to_datetime(df['Month'], format='%Y-%m')
    df['Month_pred'] = (df['Month_date'] + pd.offsets.MonthBegin(1)).dt.to_period('M').astype(str)
    df = df[(df['Company_ID'] == company_id) & (df['Product_ID'] == product_id)].sort_values(by='Month_date')

    gb = df.groupby(['Company_ID','Product_ID'])
    for i in range(n_lags + 1):
        df[f'prev_sum_sales_{i}'] = gb['sum_sales'].shift(i)

    df['Target'] = gb['sum_sales'].shift(-1)

    df['Target'].iloc[-1] = model.predict(df[feature_cols + cat_cols].iloc[-1:])
    return df[['Company_ID', 'Product_ID', 'Month_pred', 'Target']].iloc[-1:]

In [121]:
predict_next_month(0, 1, monthly_agg_saved)

Unnamed: 0,Company_ID,Product_ID,Month_pred,Target
50,0,1,2024-01,9.227565


In [122]:
sample_submission['Company_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[0]).astype(int)
sample_submission['Product_ID'] = sample_submission['Id'].apply(lambda x: x.split('_')[1]).astype(int)

sub = pd.DataFrame(columns=['Id', 'Target'])
for i in range(3):
    for company_id in sample_submission['Company_ID'].unique():
        for product_id in tqdm.tqdm(sample_submission['Product_ID'].unique()):
            p = predict_next_month(company_id, product_id, monthly_agg_saved)
            nrow = pd.DataFrame({
                'Company_ID': [p['Company_ID'].iloc[0]], 
                'Product_ID': [p['Product_ID'].iloc[0]], 
                'Month': [p['Month_pred'].iloc[0]], 
                'sum_sales': [p['Target'].iloc[0]]
            })
            monthly_agg_saved = pd.concat([monthly_agg_saved, nrow])
            nrow = pd.DataFrame({
                'Id': [f"{p['Company_ID'].iloc[0]}_{p['Product_ID'].iloc[0]}_{p['Month_pred'].iloc[0]}"], 
                'Target': [p['Target'].iloc[0]]
            })
            sub = pd.concat([sub, nrow])

  0%|          | 56/14413 [00:16<1:08:33,  3.49it/s]


KeyboardInterrupt: 

In [123]:
sub

Unnamed: 0,Id,Target
0,0_1_2024-01,9.227565
0,0_2_2023-12,1.79084
0,0_3_2023-08,2.101596
0,0_5_2024-01,2.503293
0,0_6_2024-01,3.965903
0,0_10_2020-12,0.990542
0,0_11_2023-02,7.746017
0,0_12_2023-11,1.935926
0,0_13_2022-01,2.076064
0,0_15_2024-01,319.219552


In [None]:
sub.to_csv('data/submissions/bruh.csv', index=False)