In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import time

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
test_parameters = {"n_estimators": 1500, "max_depth": 7, "learning_rate":0.1}

In [3]:
test_df = pd.read_csv('test.csv')
test_df.head(5)

Unnamed: 0,id,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure
0,666677,2022-02-14,Москва,1,1,1,4.79,облачно,87.3125,-1.9375,749.3125
1,666678,2022-02-15,Москва,1,1,1,4.79,переменная облачность,88.75,-1.25,752.6875
2,666679,2022-02-16,Москва,1,1,1,4.79,переменная облачность,90.375,-1.5625,746.3125
3,666680,2022-02-17,Москва,1,1,1,4.79,"облачно, небольшой дождь",98.0,1.75,732.6875
4,666681,2022-02-18,Москва,1,1,1,4.79,"облачно, небольшие осадки",95.5,1.375,733.0


In [4]:
train_df = pd.read_csv('train.csv')
train_df.tail(5)

Unnamed: 0,id,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales
666671,666672,2022-02-09,Воронеж,164,1,35,2.48,"облачно, небольшой снег",89.125,-4.3125,744.6875,11
666672,666673,2022-02-10,Воронеж,164,1,35,2.48,"облачно, небольшие осадки",92.5625,-0.9375,746.3125,17
666673,666674,2022-02-11,Воронеж,164,1,35,2.48,осадки,99.6875,-0.3125,745.3125,2
666674,666675,2022-02-12,Воронеж,164,1,35,2.48,"переменная облачность, небольшие осадки",91.875,-2.25,749.9375,7
666675,666676,2022-02-13,Воронеж,164,1,35,2.48,переменная облачность,85.9375,-4.875,755.3125,18


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666676 entries, 0 to 666675
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            666676 non-null  int64  
 1   date          666676 non-null  object 
 2   city_name     666676 non-null  object 
 3   store_id      666676 non-null  int64  
 4   category_id   666676 non-null  int64  
 5   product_id    666676 non-null  int64  
 6   price         666676 non-null  float64
 7   weather_desc  666676 non-null  object 
 8   humidity      666676 non-null  float64
 9   temperature   666676 non-null  float64
 10  pressure      666676 non-null  float64
 11  sales         666676 non-null  int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 61.0+ MB


In [6]:
train_df['city_name'].value_counts()

Москва             104262
Санкт-Петербург     89422
Самара              76643
Нижний Новгород     75722
Казань              72550
Ростов-на-Дону      67577
Воронеж             50942
Волгоград           46281
Краснодар           44611
Екатеринбург        38666
Name: city_name, dtype: int64

In [7]:
test_df['city_name'].value_counts()

Москва             3815
Санкт-Петербург    3346
Самара             2940
Нижний Новгород    2849
Казань             2632
Ростов-на-Дону     2471
Воронеж            1953
Волгоград          1687
Краснодар          1631
Екатеринбург       1512
Name: city_name, dtype: int64

In [8]:
train_df['date'] = pd.to_datetime(train_df['date'])

In [9]:
train_df['weekday'] = train_df['date'].dt.weekday

In [10]:
test_df['date'] = pd.to_datetime(test_df['date'])
test_df['weekday'] = test_df['date'].dt.weekday 

In [11]:
new_df = train_df.copy()
new_df

Unnamed: 0,id,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales,weekday
0,1,2021-07-29,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",61.9375,23.1875,741.0000,26,3
1,2,2021-07-30,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",70.2500,22.1875,740.3125,37,4
2,3,2021-07-31,Москва,1,1,1,4.79,переменная облачность,52.6250,21.8125,741.6250,25,5
3,4,2021-08-01,Москва,1,1,1,4.79,"облачно, небольшой дождь",87.4375,20.0625,743.3125,26,6
4,5,2021-08-02,Москва,1,1,1,4.79,переменная облачность,66.1875,23.4375,739.6250,22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
666671,666672,2022-02-09,Воронеж,164,1,35,2.48,"облачно, небольшой снег",89.1250,-4.3125,744.6875,11,2
666672,666673,2022-02-10,Воронеж,164,1,35,2.48,"облачно, небольшие осадки",92.5625,-0.9375,746.3125,17,3
666673,666674,2022-02-11,Воронеж,164,1,35,2.48,осадки,99.6875,-0.3125,745.3125,2,4
666674,666675,2022-02-12,Воронеж,164,1,35,2.48,"переменная облачность, небольшие осадки",91.8750,-2.2500,749.9375,7,5


In [12]:
new_df = new_df.append(test_df, ignore_index = True)

In [13]:
new_df['weather_desc'].value_counts()

переменная облачность                      267674
облачно, небольшой снег                     90042
облачно, небольшой дождь                    81610
облачно                                     73643
ясно                                        46393
облачно, небольшие осадки                   27128
облачно, без существенных осадков           22800
переменная облачность, небольшой дождь      22678
снег                                        22054
дождь                                       10574
переменная облачность, небольшой снег        8692
дождь, гроза                                 6553
метель                                       3998
осадки                                       3700
переменная облачность, небольшие осадки      2859
переменная облачность, дождь                 1114
Name: weather_desc, dtype: int64

In [14]:
def weather_transform(weather_desc):
    if ((weather_desc.find("снег") != -1) | (weather_desc.find("дождь") != -1) | (weather_desc.find("осадки") != -1)):
        return 1
    else:
        return 0

In [15]:
new_df['residue'] = new_df['weather_desc'].apply(weather_transform)

In [16]:
new_df['weekend'] = new_df['weekday'].isin([5, 6]).astype(int)

In [17]:
group = new_df.groupby(['store_id', 'product_id'])
for i in range(7, 22):
    new_df[f'lag_day_{i}'] = group['sales'].shift(i)

In [18]:
new_df = new_df.drop(['lag_day_18', 'lag_day_10', 'lag_day_12', 'lag_day_13', 'lag_day_15'], axis = 1)
new_df = new_df.drop(['lag_day_8', 'lag_day_16', 'lag_day_17', 'lag_day_19', 'lag_day_20'], axis = 1)

In [19]:
new_df

Unnamed: 0,id,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales,weekday,residue,weekend,lag_day_7,lag_day_9,lag_day_11,lag_day_14,lag_day_21
0,1,2021-07-29,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",61.9375,23.1875,741.0000,26.0,3,1,0,,,,,
1,2,2021-07-30,Москва,1,1,1,4.79,"переменная облачность, небольшой дождь",70.2500,22.1875,740.3125,37.0,4,1,0,,,,,
2,3,2021-07-31,Москва,1,1,1,4.79,переменная облачность,52.6250,21.8125,741.6250,25.0,5,0,1,,,,,
3,4,2021-08-01,Москва,1,1,1,4.79,"облачно, небольшой дождь",87.4375,20.0625,743.3125,26.0,6,1,1,,,,,
4,5,2021-08-02,Москва,1,1,1,4.79,переменная облачность,66.1875,23.4375,739.6250,22.0,0,0,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691507,691508,2022-02-16,Воронеж,164,1,35,2.48,переменная облачность,91.8125,-2.5625,751.0000,,2,0,0,11.0,14.0,8.0,16.0,5.0
691508,691509,2022-02-17,Воронеж,164,1,35,2.48,"облачно, небольшой дождь",93.5000,2.0625,740.0000,,3,1,0,17.0,13.0,19.0,19.0,19.0
691509,691510,2022-02-18,Воронеж,164,1,35,2.48,облачно,96.5625,2.0625,736.0000,,4,0,0,2.0,11.0,14.0,11.0,12.0
691510,691511,2022-02-19,Воронеж,164,1,35,2.48,переменная облачность,89.9375,2.7500,743.0000,,5,0,1,7.0,17.0,13.0,8.0,17.0


In [20]:
categories = ['city_name', 'store_id', 'category_id', 'product_id', 'weekday', 'weather_desc', 'residue', 'weekend']

for feature in categories:
    new_df[feature] = pd.Categorical(new_df[feature])

In [21]:
new_df_dummies = pd.get_dummies(new_df, columns=categories)

In [22]:
new_df.iloc[666675:666678]

Unnamed: 0,id,date,city_name,store_id,category_id,product_id,price,weather_desc,humidity,temperature,pressure,sales,weekday,residue,weekend,lag_day_7,lag_day_9,lag_day_11,lag_day_14,lag_day_21
666675,666676,2022-02-13,Воронеж,164,1,35,2.48,переменная облачность,85.9375,-4.875,755.3125,18.0,6,0,1,19.0,11.0,16.0,7.0,9.0
666676,666677,2022-02-14,Москва,1,1,1,4.79,облачно,87.3125,-1.9375,749.3125,,0,0,0,22.0,57.0,27.0,20.0,36.0
666677,666678,2022-02-15,Москва,1,1,1,4.79,переменная облачность,88.75,-1.25,752.6875,,1,0,0,27.0,49.0,41.0,30.0,31.0


In [23]:
train = new_df_dummies.iloc[0:666676]
X = train.drop(['date', 'sales', 'id'], axis = 1)
y = train['sales']

In [24]:
train

Unnamed: 0,id,date,price,humidity,temperature,pressure,sales,lag_day_7,lag_day_9,lag_day_11,...,"weather_desc_переменная облачность, дождь","weather_desc_переменная облачность, небольшие осадки","weather_desc_переменная облачность, небольшой дождь","weather_desc_переменная облачность, небольшой снег",weather_desc_снег,weather_desc_ясно,residue_0,residue_1,weekend_0,weekend_1
0,1,2021-07-29,4.79,61.9375,23.1875,741.0000,26.0,,,,...,0,0,1,0,0,0,0,1,1,0
1,2,2021-07-30,4.79,70.2500,22.1875,740.3125,37.0,,,,...,0,0,1,0,0,0,0,1,1,0
2,3,2021-07-31,4.79,52.6250,21.8125,741.6250,25.0,,,,...,0,0,0,0,0,0,1,0,0,1
3,4,2021-08-01,4.79,87.4375,20.0625,743.3125,26.0,,,,...,0,0,0,0,0,0,0,1,0,1
4,5,2021-08-02,4.79,66.1875,23.4375,739.6250,22.0,,,,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666671,666672,2022-02-09,2.48,89.1250,-4.3125,744.6875,11.0,16.0,18.0,17.0,...,0,0,0,0,0,0,0,1,1,0
666672,666673,2022-02-10,2.48,92.5625,-0.9375,746.3125,17.0,19.0,11.0,7.0,...,0,0,0,0,0,0,0,1,1,0
666673,666674,2022-02-11,2.48,99.6875,-0.3125,745.3125,2.0,11.0,16.0,18.0,...,0,0,0,0,0,0,0,1,1,0
666674,666675,2022-02-12,2.48,91.8750,-2.2500,749.9375,7.0,8.0,19.0,11.0,...,0,1,0,0,0,0,0,1,0,1


In [25]:
dataset = train_test_split(X, y, test_size=0.25, random_state=0) 

In [26]:
X_train, X_test, y_train, y_test = dataset

In [27]:
cbr_boost = CatBoostRegressor(**test_parameters, verbose=0)

In [28]:
X_test_final = new_df_dummies.iloc[666676:]
X_test_final_id = X_test_final.copy()
X_test_final = X_test_final.drop(['date', 'sales', 'id'], axis = 1)

In [29]:
X_test_final_id

Unnamed: 0,id,date,price,humidity,temperature,pressure,sales,lag_day_7,lag_day_9,lag_day_11,...,"weather_desc_переменная облачность, дождь","weather_desc_переменная облачность, небольшие осадки","weather_desc_переменная облачность, небольшой дождь","weather_desc_переменная облачность, небольшой снег",weather_desc_снег,weather_desc_ясно,residue_0,residue_1,weekend_0,weekend_1
666676,666677,2022-02-14,4.79,87.3125,-1.9375,749.3125,,22.0,57.0,27.0,...,0,0,0,0,0,0,1,0,1,0
666677,666678,2022-02-15,4.79,88.7500,-1.2500,752.6875,,27.0,49.0,41.0,...,0,0,0,0,0,0,1,0,1,0
666678,666679,2022-02-16,4.79,90.3750,-1.5625,746.3125,,26.0,22.0,57.0,...,0,0,0,0,0,0,1,0,1,0
666679,666680,2022-02-17,4.79,98.0000,1.7500,732.6875,,24.0,27.0,49.0,...,0,0,0,0,0,0,0,1,1,0
666680,666681,2022-02-18,4.79,95.5000,1.3750,733.0000,,42.0,26.0,22.0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
691507,691508,2022-02-16,2.48,91.8125,-2.5625,751.0000,,11.0,14.0,8.0,...,0,0,0,0,0,0,1,0,1,0
691508,691509,2022-02-17,2.48,93.5000,2.0625,740.0000,,17.0,13.0,19.0,...,0,0,0,0,0,0,0,1,1,0
691509,691510,2022-02-18,2.48,96.5625,2.0625,736.0000,,2.0,11.0,14.0,...,0,0,0,0,0,0,1,0,1,0
691510,691511,2022-02-19,2.48,89.9375,2.7500,743.0000,,7.0,17.0,13.0,...,0,0,0,0,0,0,1,0,0,1


In [30]:
X_test_final_id['date'].value_counts()

2022-02-15    3548
2022-02-18    3548
2022-02-14    3548
2022-02-17    3548
2022-02-20    3548
2022-02-16    3548
2022-02-19    3548
Name: date, dtype: int64

In [31]:
time_start = time.time()
cbr_boost.fit(X_train, y_train)
time_finish = time.time()

pred_train = cbr_boost.predict(X_train)
pred_test = cbr_boost.predict(X_test)
pred_test_final = cbr_boost.predict(X_test_final)
mae_train = mean_absolute_error(pred_train, y_train)
mae_test = mean_absolute_error(pred_test, y_test)

print(mae_train, mae_test)

3.3386660842468934 3.4799842160983983


In [32]:
pred_test_final

array([25.26122028, 28.09035956, 23.99996777, ..., 11.48978397,
       13.90137817, 14.78830107])

In [33]:
X_test_final_id = X_test_final_id.reset_index()

In [34]:
X_test_final_id

Unnamed: 0,index,id,date,price,humidity,temperature,pressure,sales,lag_day_7,lag_day_9,...,"weather_desc_переменная облачность, дождь","weather_desc_переменная облачность, небольшие осадки","weather_desc_переменная облачность, небольшой дождь","weather_desc_переменная облачность, небольшой снег",weather_desc_снег,weather_desc_ясно,residue_0,residue_1,weekend_0,weekend_1
0,666676,666677,2022-02-14,4.79,87.3125,-1.9375,749.3125,,22.0,57.0,...,0,0,0,0,0,0,1,0,1,0
1,666677,666678,2022-02-15,4.79,88.7500,-1.2500,752.6875,,27.0,49.0,...,0,0,0,0,0,0,1,0,1,0
2,666678,666679,2022-02-16,4.79,90.3750,-1.5625,746.3125,,26.0,22.0,...,0,0,0,0,0,0,1,0,1,0
3,666679,666680,2022-02-17,4.79,98.0000,1.7500,732.6875,,24.0,27.0,...,0,0,0,0,0,0,0,1,1,0
4,666680,666681,2022-02-18,4.79,95.5000,1.3750,733.0000,,42.0,26.0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24831,691507,691508,2022-02-16,2.48,91.8125,-2.5625,751.0000,,11.0,14.0,...,0,0,0,0,0,0,1,0,1,0
24832,691508,691509,2022-02-17,2.48,93.5000,2.0625,740.0000,,17.0,13.0,...,0,0,0,0,0,0,0,1,1,0
24833,691509,691510,2022-02-18,2.48,96.5625,2.0625,736.0000,,2.0,11.0,...,0,0,0,0,0,0,1,0,1,0
24834,691510,691511,2022-02-19,2.48,89.9375,2.7500,743.0000,,7.0,17.0,...,0,0,0,0,0,0,1,0,0,1


In [35]:
res = pd.DataFrame(pred_test_final, columns = ['prediction'])
res

Unnamed: 0,prediction
0,25.261220
1,28.090360
2,23.999968
3,27.037354
4,29.715886
...,...
24831,9.564948
24832,16.190873
24833,11.489784
24834,13.901378


In [36]:
res_final = pd.concat([X_test_final_id['id'], res['prediction']], join = 'outer', axis = 1, ignore_index = 1)

In [37]:
res_final.to_csv('prediction.csv', index = False)

In [38]:
res_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24836 entries, 0 to 24835
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       24836 non-null  int64  
 1   1       24836 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 388.2 KB


In [39]:
res_final

Unnamed: 0,0,1
0,666677,25.261220
1,666678,28.090360
2,666679,23.999968
3,666680,27.037354
4,666681,29.715886
...,...,...
24831,691508,9.564948
24832,691509,16.190873
24833,691510,11.489784
24834,691511,13.901378
