In [1]:
import pandas as pd
import numpy as np
from datetime import date
import holidays
from prophet import Prophet

import datetime
from datetime import timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor


from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, GridSearchCV

tscv = TimeSeriesSplit(n_splits=5)
state = np.random.RandomState(12345)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Вкидываем праздники, для их учёта моделькой
holidays_dict = holidays.RUS(years=(2022, 2023))
df_holidays = pd.DataFrame.from_dict(holidays_dict, orient='index') \
    .reset_index()
df_holidays = df_holidays.rename({'index':'ds', 0:'holiday'}, axis ='columns')
df_holidays['ds'] = pd.to_datetime(df_holidays.ds)
df_holidays = df_holidays.sort_values(by=['ds'])
df_holidays = df_holidays.reset_index(drop=True)

In [3]:
pr_df = pd.read_csv('D:\data_science\demand_forecast_for_products\sp_sales_task\pr_df.csv')
sales_df = pd.read_csv('D:\data_science\demand_forecast_for_products\sp_sales_task\sales_df_train.csv')
sales_submission = pd.read_csv('D:\data_science\demand_forecast_for_products\sp_sales_task\sales_submission.csv')
st_df = pd.read_csv('D:\data_science\demand_forecast_for_products\sp_sales_task\st_df.csv')

In [4]:
# Соединяем все таблицы
df_1 = pd.merge(sales_df, st_df, how='left', left_on='st_id', right_on='st_id')
df = pd.merge(df_1, pr_df, how='left', left_on='pr_sku_id', right_on='pr_sku_id')

In [5]:
df = df.rename(columns = {'pr_sales_in_rub' : 'y'})
df['ds'] = pd.to_datetime(df['date'])

In [6]:
df['fe_st_id'] = df['st_id'].map(df['st_id'].value_counts(normalize=False))
df['fe_pr_sku_id'] = df['pr_sku_id'].map(df['pr_sku_id'].value_counts(normalize=False))
df['fe_st_city_id'] = df['st_city_id'].map(df['st_city_id'].value_counts(normalize=False))
df['fe_st_division_code'] = df['st_division_code'].map(df['st_division_code'].value_counts(normalize=False))
df['fe_pr_group_id'] = df['pr_group_id'].map(df['pr_group_id'].value_counts(normalize=False))
df['fe_pr_cat_id'] = df['pr_cat_id'].map(df['pr_cat_id'].value_counts(normalize=False))
df['fe_pr_subcat_id'] = df['pr_subcat_id'].map(df['pr_subcat_id'].value_counts(normalize=False))

In [7]:
df = df.drop(['st_id', 'pr_sku_id', 'st_city_id',
              'st_division_code', 'pr_group_id',
                  'pr_cat_id', 'pr_subcat_id',
                  'pr_promo_sales_in_units',
                   'pr_promo_sales_in_rub', 'date'], axis=1)

In [8]:
date_lag = 15
predictions_period = df['ds'].max() - timedelta(date_lag)
train = df.loc[df['ds'] < predictions_period]
test = df.loc[df['ds'] >= predictions_period]

In [9]:
print(train['ds'].max())
print(train['ds'].min())
print(test['ds'].max())
print(test['ds'].min())

2023-07-02 00:00:00
2022-08-01 00:00:00
2023-07-18 00:00:00
2023-07-03 00:00:00


In [10]:
X_train = train.drop(['y', 'ds'], axis=1)
y_train = train['y']

In [11]:
model_cbr = CatBoostRegressor().fit(X_train, y_train, verbose=False)

In [12]:
a = set(test['fe_st_id'])
b = set(test['fe_pr_sku_id'])

In [13]:
%%time
sales_submission_test = pd.DataFrame(columns=['st_id', 'pr_sku_id', 'date'])
for i in a:
    for j in b:
        test_pred = test[(test['fe_st_id'] == i) & (test['fe_pr_sku_id'] == j)]
       
        df_1 = pd.DataFrame(columns=['st_id', 'pr_sku_id', 'date', 'y'])

        date_lag = 15
        predictions_period = test_pred['ds'].max() - timedelta(date_lag)
        train_1 = test_pred.loc[test_pred['ds'] < predictions_period]
        test_1 = test_pred.loc[test_pred['ds'] >= predictions_period]

        df_1['st_id'], df_1['pr_sku_id'], df_1['date'], df_1['y'] = \
            test_pred['fe_st_id'], test_pred['fe_pr_sku_id'], test_pred['ds'], test_pred['y']
       
        X_test = test_pred.drop(['y', 'ds'], axis=1)

        df_1['target'] = model_cbr.predict(X_test)

        sales_submission_test = pd.concat([sales_submission_test, df_1], ignore_index=False)



CPU times: total: 11.5 s
Wall time: 25.7 s


In [14]:
sales_submission_test.groupby('date').count()

Unnamed: 0_level_0,st_id,pr_sku_id,y,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-07-03,2451,2451,2451,2451
2023-07-04,2448,2448,2448,2448
2023-07-05,2529,2529,2529,2529
2023-07-06,2666,2666,2666,2666
2023-07-07,2983,2983,2983,2983
2023-07-08,2869,2869,2869,2869
2023-07-09,2625,2625,2625,2625
2023-07-10,2432,2432,2432,2432
2023-07-11,2513,2513,2513,2513
2023-07-12,2525,2525,2525,2525


In [15]:
# Вычисляем метрику
wape = 100 * (sales_submission_test['y'] - sales_submission_test['target']).abs().sum() / sales_submission_test['y'].sum()
wape

15.501574987487423

In [16]:
sales_submission_test.reset_index(drop= True , inplace= True )
sales_submission_test

Unnamed: 0,st_id,pr_sku_id,date,y,target
0,157542,2049,2023-07-06,3094.0,3117.066570
1,157542,2049,2023-07-08,394.0,421.750576
2,157542,2049,2023-07-05,3311.0,3413.763146
3,157542,2049,2023-07-05,826.0,734.985074
4,157542,2049,2023-07-13,1047.0,980.680921
...,...,...,...,...,...
41776,174687,2047,2023-07-06,231.0,193.250107
41777,174687,2047,2023-07-15,360.0,264.882488
41778,174687,2047,2023-07-03,31.0,-4.443264
41779,174687,2047,2023-07-17,62.0,40.513275


In [17]:
sales_submission = sales_submission_test.drop(['y'], axis=1)

In [18]:
sales_submission.to_csv('sales_submission.csv')