In [1]:
# Widen the classic-notebook container so wide tables are not clipped.
# `display`/`HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%pylab inline
plt.style.use("bmh")

Populating the interactive namespace from numpy and matplotlib


In [3]:
import pathlib
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error

In [4]:
import numpy as np
# импортируем классы KFold, TimeSeriesSplit и GroupKFold,
# реализующие стратегии перекрестной проверки, и
# класс GridSearchCV для поиска гиперпараметров
from sklearn.model_selection import (
    KFold,
    TimeSeriesSplit, 
    GroupKFold,
    GridSearchCV
)

from category_encoders import TargetEncoder, LeaveOneOutEncoder , OrdinalEncoder
import warnings
import xgboost as xgb
import lightgbm as lgb
import joblib

from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
# увеличиваем количество отображаемых столбцов
pd.set_option('display.max_columns', 50)

In [5]:
import sklearn
print (sklearn.__version__)

1.0.1


In [6]:
# Directory that contains the input CSV (the notebook's working directory).
DATA_DIR = pathlib.Path(".")
# File name of the training deals table.
DATA_FILE = "sc2021_train_deals.csv"
# Key columns identifying one time series; deal volumes are aggregated per combination.
AGG_COLS = ["material_code", "company_code", "country", "region", "manager_code"]
# Random seed passed as `random_state` to the CatBoost and LightGBM base models.
RS = 82736

# Загрузка данных

In [7]:
# Load the raw deals; `month` and `date` are parsed into datetime columns.
data = pd.read_csv(DATA_DIR / DATA_FILE, parse_dates=["month", "date"])

In [8]:
data.head()

Unnamed: 0,material_code,company_code,country,region,manager_code,month,material_lvl1_name,material_lvl2_name,material_lvl3_name,contract_type,date,volume
0,134,0,Литва,Литва,12261,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-01,43.0
1,197,0,Китай,Китай,16350,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,95.0
2,794,2162,Казахстан,Атырауская обл.,10942,2018-01-01,Базовые полимеры,ПП,ПП,Контракт,2018-01-02,57.0
3,134,0,Литва,Литва,12261,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,21.0
4,133,0,Китай,Китай,17745,2018-01-01,Базовые полимеры,ПЭ,ПЭНП,Спот,2018-01-02,150.0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92306 entries, 0 to 92305
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   material_code       92306 non-null  int64         
 1   company_code        92306 non-null  int64         
 2   country             92306 non-null  object        
 3   region              92306 non-null  object        
 4   manager_code        92306 non-null  int64         
 5   month               92306 non-null  datetime64[ns]
 6   material_lvl1_name  92306 non-null  object        
 7   material_lvl2_name  92306 non-null  object        
 8   material_lvl3_name  92306 non-null  object        
 9   contract_type       92306 non-null  object        
 10  date                92306 non-null  datetime64[ns]
 11  volume              92306 non-null  float64       
dtypes: datetime64[ns](2), float64(1), int64(3), object(6)
memory usage: 8.5+ MB


Временной диапазон тренировочного множества:

In [10]:
data.month.min(), data.month.max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2020-07-01 00:00:00'))

# Временные ряды

In [11]:
# One row per AGG_COLS combination, one column per month, total volume as value;
# months without deals for a combination are filled with 0.
group_ts = (
    data
    .groupby([*AGG_COLS, "month"])["volume"]
    .sum()
    .unstack(fill_value=0)
)

In [12]:
group_ts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,month,2018-01-01,2018-02-01,2018-03-01,2018-04-01,2018-05-01,2018-06-01,2018-07-01,2018-08-01,2018-09-01,2018-10-01,2018-11-01,2018-12-01,2019-01-01,2019-02-01,2019-03-01,2019-04-01,2019-05-01,2019-06-01,2019-07-01,2019-08-01,2019-09-01,2019-10-01,2019-11-01,2019-12-01,2020-01-01,2020-02-01,2020-03-01,2020-04-01,2020-05-01,2020-06-01,2020-07-01
material_code,company_code,country,region,manager_code,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
124,7278,Россия,Респ. Татарстан,17460,340.0,340.0,260.0,240.0,220.0,220.0,220.0,220.0,220.0,280.0,280.0,280.0,200.0,200.0,200.0,185.0,103.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,0,Белоруссия,Минская обл.,10942,0.0,0.0,0.0,200.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,98.0,82.0,62.0,145.0,124.0,181.0,208.0,207.0,17.0,72.0,250.0,394.0,288.0,210.0,249.0
133,0,Белоруссия,Могилевская обл.,10942,0.0,0.0,0.0,0.0,140.0,0.0,0.0,0.0,100.0,220.0,20.0,0.0,0.0,80.0,142.0,103.0,145.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,166.0,62.0,0.0,0.0
133,0,Белоруссия,г. Минск,10942,0.0,20.0,0.0,0.0,40.0,160.0,180.0,99.0,60.0,400.0,120.0,20.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,83.0,82.0,42.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,6.0
133,0,Казахстан,г. Нур-Султан,13301,0.0,0.0,30.0,30.0,0.0,0.0,40.0,20.0,40.0,30.0,0.0,40.0,40.0,50.0,0.0,40.0,0.0,40.0,40.0,40.0,0.0,45.0,50.0,45.0,0.0,50.0,40.0,0.0,0.0,50.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,9943,Россия,Смоленская обл.,17460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,63.0,125.0,84.0,84.0,83.0
998,0,Россия,Ленинградская обл.,18079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,5.0,5.0,0.0,8.0,5.0,10.0,5.0,5.0,10.0,10.0,0.0,6.0,5.0,5.0,5.0,0.0,3.0,3.0,9.0
998,3380,Россия,Ленинградская обл.,14956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,94.0,127.0,121.0,121.0,129.0,117.0,115.0,102.0,29.0,73.0,74.0,122.0,100.0,15.0,30.0,50.0
998,5410,Россия,г. Санкт-Петербург,14956,60.0,60.0,100.0,60.0,60.0,80.0,80.0,100.0,80.0,80.0,80.0,100.0,120.0,119.0,160.0,120.0,140.0,100.0,120.0,120.0,80.0,120.0,140.0,100.0,100.0,180.0,180.0,100.0,140.0,40.0,0.0


# CatBoostRegressor

Признаки:

- оригинальные категориальные признаки,
- месяц, для которого предсказываем,
- среднее за последние 12 месяцев (`last_year_avg`) и медианное абсолютное отклонение за последние 2 месяца (`mdad2`),
- последние 6 месяцев до месяца, для которого предсказываем.

Для тренировки будем использовать период с `2019-01-01` по `2019-06-01`, для валидации — с `2019-07-01` по `2019-12-01`, для тестирования — с `2020-01-01` по `2020-07-01`.

In [13]:
def get_features(
    df: pd.DataFrame,
    month: pd.Timestamp,
    N=6,
    MNGR_GRP_MDAD=0,
    MNGR_GRP_MDAD2=0,
) -> pd.DataFrame:
    """Calculate features for predicting `month`.

    Only columns strictly before `month` are used, so the target month never
    leaks into the features.

    Parameters
    ----------
    df : wide table, one row per series and one column per month
        (month-start timestamps, ascending).
    month : first day of the month being predicted.
    N : number of lag months exposed as `vol_tm{N}` ... `vol_tm1`.
    MNGR_GRP_MDAD, MNGR_GRP_MDAD2 : window lengths (in months) of optional
        per-manager features; 0 disables the feature. Each enabled window adds
        a `MNGR_GRP_MDAD{window}` column holding the median absolute deviation
        of the manager's mean volume. Requires an index level named
        `manager_code`. Passing the same window twice adds the column once.

    Returns
    -------
    DataFrame indexed like `df` with columns `month`, the N lags,
    `last_year_avg` (mean of the last 12 available months) and `mdad2`
    (median absolute deviation of the last 2 months), plus the optional
    manager columns.

    Raises
    ------
    ValueError
        If fewer than `N` history columns precede `month`, or if a window
        length is smaller than 1.
    """

    def tail_mad(frame: pd.DataFrame, window: int) -> pd.Series:
        """Row-wise median absolute deviation over the last `window` columns."""
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        tail = frame.iloc[:, -window:].to_numpy(dtype=float)
        center = np.nanmedian(tail, axis=1, keepdims=True)
        return pd.Series(np.nanmedian(np.abs(tail - center), axis=1), index=frame.index)

    start_period = month - pd.offsets.MonthBegin(N)
    end_period = month - pd.offsets.MonthBegin(1)

    # Everything up to the month before the target is usable history.
    history = df.loc[:, :end_period]

    lag_names = [f"vol_tm{i}" for i in range(N, 0, -1)]
    lags = history.loc[:, start_period:end_period]
    if lags.shape[1] != N:
        raise ValueError(
            f"need {N} monthly columns between {start_period.date()} and "
            f"{end_period.date()}, found {lags.shape[1]}"
        )

    features = pd.DataFrame(index=history.index)
    features["month"] = month.month
    # Lag columns are matched to their names by position (oldest first).
    features[lag_names] = lags

    # Only the most recent window is needed, so aggregate the trailing columns
    # directly instead of rolling over the whole history along axis=1.
    features["last_year_avg"] = history.iloc[:, -12:].mean(axis=1)
    features["mdad2"] = tail_mad(history, 2)

    # Optional manager-level features; dict.fromkeys de-duplicates while
    # keeping the order in which the windows were requested.
    manager_windows = list(
        dict.fromkeys(w for w in (MNGR_GRP_MDAD, MNGR_GRP_MDAD2) if w != 0)
    )
    if manager_windows:
        # Replace every series by the mean of all series of the same manager.
        manager_mean = history.groupby(level="manager_code").transform("mean")
        for window in manager_windows:
            features["MNGR_GRP_MDAD" + str(window)] = tail_mad(manager_mean, window)

    return features

In [14]:
# Month-start target months of each split: 6 train, 6 validation, 7 test.
tr_range = pd.date_range(start="2019-01-01", periods=6, freq="MS")
val_range = pd.date_range(start="2019-07-01", periods=6, freq="MS")
ts_range = pd.date_range(start="2020-01-01", periods=7, freq="MS")

In [15]:
# Assemble one feature table per split: for every target month compute the
# features, attach that month's volume as `target`, and stack the months.
split_ranges = {"tr": tr_range, "val": val_range, "ts": ts_range}
full_features = {
    split: pd.concat(
        [
            get_features(group_ts, target_month)
            .assign(target=group_ts[target_month])
            .reset_index()
            for target_month in months
        ],
        ignore_index=True,
    )
    for split, months in split_ranges.items()
}

In [16]:
full_features["tr"].head()

Unnamed: 0,material_code,company_code,country,region,manager_code,month,vol_tm6,vol_tm5,vol_tm4,vol_tm3,vol_tm2,vol_tm1,last_year_avg,mdad2,target
0,124,7278,Россия,Респ. Татарстан,17460,1,220.0,220.0,220.0,280.0,280.0,280.0,260.0,0.0,200.0
1,133,0,Белоруссия,Минская обл.,10942,1,0.0,0.0,0.0,0.0,0.0,0.0,21.666667,0.0,0.0
2,133,0,Белоруссия,Могилевская обл.,10942,1,0.0,0.0,100.0,220.0,20.0,0.0,40.0,10.0,0.0
3,133,0,Белоруссия,г. Минск,10942,1,180.0,99.0,60.0,400.0,120.0,20.0,91.583333,50.0,40.0
4,133,0,Казахстан,г. Нур-Султан,13301,1,40.0,20.0,40.0,30.0,0.0,40.0,19.166667,20.0,40.0


In [17]:
# Turn every text (object) column of each split into a pandas categorical.
for split in ("tr", "val", "ts"):
    frame = full_features[split]
    text_cols = [name for name in frame.columns if frame[name].dtype == 'object']
    for name in text_cols:
        frame[name] = frame[name].astype('category')

In [18]:
full_features["tr"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5646 entries, 0 to 5645
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   material_code  5646 non-null   int64   
 1   company_code   5646 non-null   int64   
 2   country        5646 non-null   category
 3   region         5646 non-null   category
 4   manager_code   5646 non-null   int64   
 5   month          5646 non-null   int64   
 6   vol_tm6        5646 non-null   float64 
 7   vol_tm5        5646 non-null   float64 
 8   vol_tm4        5646 non-null   float64 
 9   vol_tm3        5646 non-null   float64 
 10  vol_tm2        5646 non-null   float64 
 11  vol_tm1        5646 non-null   float64 
 12  last_year_avg  5646 non-null   float64 
 13  mdad2          5646 non-null   float64 
 14  target         5646 non-null   float64 
dtypes: category(2), float64(9), int64(4)
memory usage: 590.7 KB


## Тренировка

In [19]:
# Columns passed to CatBoost as `cat_features`.
CAT_COLS = ["material_code", "company_code", "country", "region", "manager_code", "month"]
#CAT_COLS3 = ["material_code", "company_code",  "manager_code", "month"]
# Text-valued key columns; not referenced by the code visible in this notebook.
CAT_COLS2 = ["country", "region"]
# Alternative column names for LightGBM; only appears in a commented-out argument.
CAT_COLS_LGB = ["name_material_code", "name_company_code",  "name_manager_code", "name_month"]
# Name of the label column in the feature tables.
TARGET = "target"

In [20]:
# создаем список  переменных
FTS_COLS = full_features["tr"].columns.tolist()
FTS_COLS.remove('target')
print(FTS_COLS)

['material_code', 'company_code', 'country', 'region', 'manager_code', 'month', 'vol_tm6', 'vol_tm5', 'vol_tm4', 'vol_tm3', 'vol_tm2', 'vol_tm1', 'last_year_avg', 'mdad2']


In [21]:
# Hyperparameters of the stacking meta-learner (GradientBoostingRegressor below).
FINAL_n_estimators = 30 #25 
FINAL_subsample = 0.5
FINAL_min_samples_leaf = 11 #25 
FINAL_max_features = 1

# Logged result of an earlier search (score, iterations, learning rate, depth):
# [1.5403588928614802, 'ITER=>', 50, 'LEARNING_RATE->', 0.08, 'DEPTH=', 4]

# CatBoost base model; trailing comments record previously tried values.
ITERATION_ctbst = 50              # 52  # 50 # T12(Cv12_i50i008d4.zip)
LEARNING_RATE_ctbst = 0.08 #0.079  #     # 0.008 # T12(Cv12_i50i008d4.zip)
DEPTH_ctbst =  4         # 3              # T12(Cv12_i50i008d4.zip)
model = CatBoostRegressor(iterations=ITERATION_ctbst,  
                          learning_rate=LEARNING_RATE_ctbst, 
                          depth=DEPTH_ctbst, 
                          cat_features=CAT_COLS,
                          random_state=RS,
                          verbose=0)


# LightGBM base model.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0; as configured, row bagging looks disabled - confirm intent.
ITERATION_lgb = 34
LEARNING_RATE_lgb = 0.0813 
DEPTH_lgb = 3
modelLGBM = lgb.LGBMRegressor(learning_rate=LEARNING_RATE_lgb,                          
                               max_depth=DEPTH_lgb,
                               n_estimators=ITERATION_lgb,
                               subsample=0.8,
                               colsample_bytree=1., 
                              #categorical_feature=CAT_COLS_LGB,
                               random_state=RS)



# Meta-learner that combines the base-model predictions.
# NOTE(review): uses seed 42 while the base models use RS - confirm this is intended.
final_estimator = GradientBoostingRegressor(
         n_estimators=FINAL_n_estimators, #25, 
        subsample=FINAL_subsample, # 0.5, 
        min_samples_leaf=FINAL_min_samples_leaf, # 25, 
        max_features=FINAL_max_features, # 1,
             random_state=42)

# The string literal below is a disabled alternative meta-learner kept for reference.
'''    
final_estimator=RandomForestRegressor(n_estimators=FINAL_n_estimators,
                      random_state=42)
'''

# Base models of the stack, keyed by the names used in the fitted estimator's repr.
estimators =[('ctbst', model ),
             ('lgb', modelLGBM ),
             #('xgb', xgb_model)
            ]

# passthrough=False: the meta-learner is trained on base-model predictions only.
reg = StackingRegressor(estimators=estimators,
                        final_estimator=final_estimator,
                        passthrough=False)

In [22]:
# Fit on the training split; the target is modelled on the log1p scale.
reg.fit(
    full_features["tr"][FTS_COLS],
    np.log1p(full_features["tr"][TARGET]),
)

StackingRegressor(estimators=[('ctbst',
                               <catboost.core.CatBoostRegressor object at 0x000002360F3BE490>),
                              ('lgb',
                               LGBMRegressor(learning_rate=0.0813, max_depth=3,
                                             n_estimators=34,
                                             random_state=82736,
                                             subsample=0.8))],
                  final_estimator=GradientBoostingRegressor(max_features=1,
                                                            min_samples_leaf=11,
                                                            n_estimators=30,
                                                            random_state=42,
                                                            subsample=0.5))

In [24]:
# Predict every split, undo the log1p transform with expm1, and floor the
# resulting volumes at zero.
tr_preds, val_preds, ts_preds = [
    pd.Series(np.expm1(reg.predict(full_features[split][FTS_COLS]))).clip(lower=0)
    for split in ("tr", "val", "ts")
]

In [25]:
# Report RMSLE (square root of the mean squared log error) for each split.
for label, split, preds in (
    ("Ошибка на тренировочном множестве:", "tr", tr_preds),
    ("Ошибка на валидационном множестве:", "val", val_preds),
    ("Ошибка на тестовом множестве:", "ts", ts_preds),
):
    rmsle = np.sqrt(mean_squared_log_error(full_features[split][TARGET], preds))
    print(label, f'{rmsle:.4f}')

Ошибка на тренировочном множестве: 1.4638
Ошибка на валидационном множестве: 1.5456
Ошибка на тестовом множестве: 1.7388


In [26]:
full_features["tr"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5646 entries, 0 to 5645
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   material_code  5646 non-null   int64   
 1   company_code   5646 non-null   int64   
 2   country        5646 non-null   category
 3   region         5646 non-null   category
 4   manager_code   5646 non-null   int64   
 5   month          5646 non-null   int64   
 6   vol_tm6        5646 non-null   float64 
 7   vol_tm5        5646 non-null   float64 
 8   vol_tm4        5646 non-null   float64 
 9   vol_tm3        5646 non-null   float64 
 10  vol_tm2        5646 non-null   float64 
 11  vol_tm1        5646 non-null   float64 
 12  last_year_avg  5646 non-null   float64 
 13  mdad2          5646 non-null   float64 
 14  target         5646 non-null   float64 
dtypes: category(2), float64(9), int64(4)
memory usage: 590.7 KB


Ошибка на тренировочном множестве: 1.4884
Ошибка на валидационном множестве: 1.5461
Ошибка на тестовом множестве: 1.7368

# ###########################################################################

In [27]:
def get_features_Cv(
    df: pd.DataFrame,
    month: pd.Timestamp,
    N=6,
    MDAD_1=2,
    MNGR_GRP_MDAD=0,
    MNGR_GRP_MDAD2=0,
) -> pd.DataFrame:
    """Calculate features for predicting `month`, with a tunable MAD window.

    Only columns strictly before `month` are used, so the target month never
    leaks into the features.

    Parameters
    ----------
    df : wide table, one row per series and one column per month
        (month-start timestamps, ascending).
    month : first day of the month being predicted.
    N : number of lag months exposed as `vol_tm{N}` ... `vol_tm1`.
    MDAD_1 : window length (in months) of the per-series median absolute
        deviation; the column is named `mdad{MDAD_1}`.
    MNGR_GRP_MDAD, MNGR_GRP_MDAD2 : window lengths of optional per-manager
        features; 0 disables the feature. Each enabled window adds a
        `MNGR_GRP_MDAD{window}` column holding the median absolute deviation
        of the manager's mean volume. Requires an index level named
        `manager_code`. Passing the same window twice adds the column once.

    Returns
    -------
    DataFrame indexed like `df` with columns `month`, the N lags,
    `last_year_avg` (mean of the last 12 available months) and
    `mdad{MDAD_1}`, plus the optional manager columns.

    Raises
    ------
    ValueError
        If fewer than `N` history columns precede `month`, or if a window
        length is smaller than 1.
    """

    def tail_mad(frame: pd.DataFrame, window: int) -> pd.Series:
        """Row-wise median absolute deviation over the last `window` columns."""
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        tail = frame.iloc[:, -window:].to_numpy(dtype=float)
        center = np.nanmedian(tail, axis=1, keepdims=True)
        return pd.Series(np.nanmedian(np.abs(tail - center), axis=1), index=frame.index)

    start_period = month - pd.offsets.MonthBegin(N)
    end_period = month - pd.offsets.MonthBegin(1)

    # Everything up to the month before the target is usable history.
    history = df.loc[:, :end_period]

    lag_names = [f"vol_tm{i}" for i in range(N, 0, -1)]
    lags = history.loc[:, start_period:end_period]
    if lags.shape[1] != N:
        raise ValueError(
            f"need {N} monthly columns between {start_period.date()} and "
            f"{end_period.date()}, found {lags.shape[1]}"
        )

    features = pd.DataFrame(index=history.index)
    features["month"] = month.month
    # Lag columns are matched to their names by position (oldest first).
    features[lag_names] = lags

    # Only the most recent window is needed, so aggregate the trailing columns
    # directly instead of rolling over the whole history along axis=1.
    features["last_year_avg"] = history.iloc[:, -12:].mean(axis=1)
    features["mdad" + str(MDAD_1)] = tail_mad(history, MDAD_1)

    # Optional manager-level features; dict.fromkeys de-duplicates while
    # keeping the order in which the windows were requested.
    manager_windows = list(
        dict.fromkeys(w for w in (MNGR_GRP_MDAD, MNGR_GRP_MDAD2) if w != 0)
    )
    if manager_windows:
        # Replace every series by the mean of all series of the same manager.
        manager_mean = history.groupby(level="manager_code").transform("mean")
        for window in manager_windows:
            features["MNGR_GRP_MDAD" + str(window)] = tail_mad(manager_mean, window)

    return features

In [28]:
# Build a single training table covering every month from 2019-01 up to the
# last month available in group_ts (used to refit the model on all data).
dataset_range = pd.date_range("2019-01-01", group_ts.columns[-1], freq="MS")
dataset_features2 = []
for target_month in dataset_range:
    features2 = get_features_Cv(group_ts, target_month, MDAD_1=2, MNGR_GRP_MDAD=0, MNGR_GRP_MDAD2=0)
    features2["target"] = group_ts[target_month]
    dataset_features2.append(features2.reset_index())
full_features2 = pd.concat(dataset_features2, ignore_index=True)

# The MAD column name depends on MDAD_1, while FTS_COLS was derived earlier;
# fail early if the two have drifted apart.
missing_cols = [col for col in FTS_COLS if col not in full_features2.columns]
assert not missing_cols, f"FTS_COLS not present in full_features2: {missing_cols}"
print(FTS_COLS)

['material_code', 'company_code', 'country', 'region', 'manager_code', 'month', 'vol_tm6', 'vol_tm5', 'vol_tm4', 'vol_tm3', 'vol_tm2', 'vol_tm1', 'last_year_avg', 'mdad2']


In [29]:
full_features2

Unnamed: 0,material_code,company_code,country,region,manager_code,month,vol_tm6,vol_tm5,vol_tm4,vol_tm3,vol_tm2,vol_tm1,last_year_avg,mdad2,target
0,124,7278,Россия,Респ. Татарстан,17460,1,220.0,220.0,220.0,280.0,280.0,280.0,260.000000,0.0,200.0
1,133,0,Белоруссия,Минская обл.,10942,1,0.0,0.0,0.0,0.0,0.0,0.0,21.666667,0.0,0.0
2,133,0,Белоруссия,Могилевская обл.,10942,1,0.0,0.0,100.0,220.0,20.0,0.0,40.000000,10.0,0.0
3,133,0,Белоруссия,г. Минск,10942,1,180.0,99.0,60.0,400.0,120.0,20.0,91.583333,50.0,40.0
4,133,0,Казахстан,г. Нур-Султан,13301,1,40.0,20.0,40.0,30.0,0.0,40.0,19.166667,20.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17874,986,9943,Россия,Смоленская обл.,17460,7,0.0,21.0,63.0,125.0,84.0,84.0,31.416667,0.0,83.0
17875,998,0,Россия,Ленинградская обл.,18079,7,5.0,5.0,5.0,0.0,3.0,3.0,4.750000,0.0,9.0
17876,998,3380,Россия,Ленинградская обл.,14956,7,73.0,74.0,122.0,100.0,15.0,30.0,85.583333,7.5,50.0
17877,998,5410,Россия,г. Санкт-Петербург,14956,7,100.0,180.0,180.0,100.0,140.0,40.0,118.333333,50.0,0.0


In [30]:
# Turn every text (object) column into a pandas categorical, then show the schema.
text_cols = [name for name in full_features2.columns if full_features2[name].dtype == 'object']
for name in text_cols:
    full_features2[name] = full_features2[name].astype('category')
full_features2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17879 entries, 0 to 17878
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   material_code  17879 non-null  int64   
 1   company_code   17879 non-null  int64   
 2   country        17879 non-null  category
 3   region         17879 non-null  category
 4   manager_code   17879 non-null  int64   
 5   month          17879 non-null  int64   
 6   vol_tm6        17879 non-null  float64 
 7   vol_tm5        17879 non-null  float64 
 8   vol_tm4        17879 non-null  float64 
 9   vol_tm3        17879 non-null  float64 
 10  vol_tm2        17879 non-null  float64 
 11  vol_tm1        17879 non-null  float64 
 12  last_year_avg  17879 non-null  float64 
 13  mdad2          17879 non-null  float64 
 14  target         17879 non-null  float64 
dtypes: category(2), float64(9), int64(4)
memory usage: 1.8 MB


In [31]:
# Re-create the stacking regressor (unfitted) for the final fit on all months.
# Hyperparameters of the stacking meta-learner (GradientBoostingRegressor below).
FINAL_n_estimators = 30 #25 
FINAL_subsample = 0.5
FINAL_min_samples_leaf = 11 #25 
FINAL_max_features = 1

# Logged result of an earlier search (score, iterations, learning rate, depth):
# [1.5403588928614802, 'ITER=>', 50, 'LEARNING_RATE->', 0.08, 'DEPTH=', 4]

# CatBoost base model; trailing comments record previously tried values.
ITERATION_ctbst = 50 # 52  #50              #  50 # T12(Cv12_i50i008d4.zip)
LEARNING_RATE_ctbst = 0.08 # 0.079  #0.008 #     # 0.008 # T12(Cv12_i50i008d4.zip)
DEPTH_ctbst = 4         # 3              #4         #  T12(Cv12_i50i008d4.zip)
model = CatBoostRegressor(iterations=ITERATION_ctbst,  
                          learning_rate=LEARNING_RATE_ctbst, 
                          depth=DEPTH_ctbst, 
                          cat_features=CAT_COLS,
                          random_state=RS,
                          verbose=0)


# LightGBM base model.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0; as configured, row bagging looks disabled - confirm intent.
ITERATION_lgb = 34
LEARNING_RATE_lgb = 0.0813 
DEPTH_lgb = 3
modelLGBM = lgb.LGBMRegressor(learning_rate=LEARNING_RATE_lgb,                          
                               max_depth=DEPTH_lgb,
                               n_estimators=ITERATION_lgb,
                               subsample=0.8,
                               colsample_bytree=1.,
                               random_state=RS)



# Meta-learner that combines the base-model predictions.
final_estimator = GradientBoostingRegressor(
         n_estimators=FINAL_n_estimators, #25, 
        subsample=FINAL_subsample, # 0.5, 
        min_samples_leaf=FINAL_min_samples_leaf, # 25, 
        max_features=FINAL_max_features, # 1,
             random_state=42)

# The string literal below is a disabled alternative meta-learner kept for reference.
'''    
final_estimator=RandomForestRegressor(n_estimators=FINAL_n_estimators,
                      random_state=42)
'''

# Base models of the stack, keyed by the names used in the fitted estimator's repr.
estimators =[('ctbst', model ),
             ('lgb', modelLGBM ),
             #('xgb', xgb_model)
            ]

# passthrough=False: the meta-learner is trained on base-model predictions only.
reg = StackingRegressor(estimators=estimators,
                        final_estimator=final_estimator,
                        passthrough=False)

In [32]:
# Final fit on all available months; the target is modelled on the log1p scale.
reg.fit(
    full_features2[FTS_COLS],
    np.log1p(full_features2[TARGET]),
)

StackingRegressor(estimators=[('ctbst',
                               <catboost.core.CatBoostRegressor object at 0x000002360F4490A0>),
                              ('lgb',
                               LGBMRegressor(learning_rate=0.0813, max_depth=3,
                                             n_estimators=34,
                                             random_state=82736,
                                             subsample=0.8))],
                  final_estimator=GradientBoostingRegressor(max_features=1,
                                                            min_samples_leaf=11,
                                                            n_estimators=30,
                                                            random_state=42,
                                                            subsample=0.5))

In [33]:
import dill

# Serialize the fitted stacking regressor with dill. The file keeps the
# "STACK_1.cbm" name that predict.py and the packaging step expect, although the
# content is a dill pickle rather than a native CatBoost model.
# The context manager guarantees the file is flushed and closed before packaging.
with open("STACK_1.cbm", "wb") as model_file:
    dill.dump(reg, file=model_file)

In [34]:
%%writefile predict.py

import pathlib
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import lightgbm as lgb

import dill
#from category_encoders import TargetEncoder #OrdinalEncoder
#from sklearn.preprocessing import LabelEncoder

MODEL_FILE = pathlib.Path(__file__).parent.joinpath("STACK_1.cbm")
#ENCODER_1_FILE = pathlib.Path(__file__).parent.joinpath("ET_encoder_1.pkl")
#ENCODER_2_FILE = pathlib.Path(__file__).parent.joinpath("ET_encoder_2.pkl")

AGG_COLS = ["material_code", "company_code", "country", "region", "manager_code"]
CAT_COLS = ["material_code", "company_code", "country", "region", "manager_code", "month"]
FTS_COLS = ['material_code', 'company_code', 'country', 'region', 'manager_code', 'month', 
            'vol_tm6', 'vol_tm5', 'vol_tm4', 'vol_tm3', 'vol_tm2', 'vol_tm1', 'last_year_avg', 'mdad2']

TARGET = "target"

def get_features(df: pd.DataFrame, month: pd.Timestamp) -> pd.DataFrame:
    """Calculate the model features for predicting `month`.

    Only columns strictly before `month` are used. `df` is the wide table with
    one row per series and one month-start column per month (ascending).

    Returns a frame whose index levels are turned into ordinary columns,
    followed by `month`, `vol_tm6` ... `vol_tm1` (last six monthly volumes,
    oldest first), `last_year_avg` (mean of the last 12 available months) and
    `mdad2` (median absolute deviation of the last 2 months).

    Raises ValueError if fewer than six history columns precede `month`.
    """

    start_period = month - pd.offsets.MonthBegin(6)
    end_period = month - pd.offsets.MonthBegin(1)

    # Everything up to the month before the target is usable history.
    history = df.loc[:, :end_period]

    lag_names = [f"vol_tm{i}" for i in range(6, 0, -1)]
    lags = history.loc[:, start_period:end_period]
    if lags.shape[1] != len(lag_names):
        raise ValueError(
            f"need {len(lag_names)} monthly columns between {start_period.date()} "
            f"and {end_period.date()}, found {lags.shape[1]}"
        )

    features = pd.DataFrame(index=history.index)
    features["month"] = month.month
    # Lag columns are matched to their names by position (oldest first).
    features[lag_names] = lags

    # Only the most recent window is needed, so aggregate the trailing columns
    # directly instead of rolling over the whole history along axis=1.
    features["last_year_avg"] = history.iloc[:, -12:].mean(axis=1)

    last_two = history.iloc[:, -2:].to_numpy(dtype=float)
    center = np.nanmedian(last_two, axis=1, keepdims=True)
    features["mdad2"] = np.nanmedian(np.abs(last_two - center), axis=1)

    return features.reset_index()


def predict(df: pd.DataFrame, month: pd.Timestamp) -> pd.DataFrame:
    """Predict the total volume of every series for `month`.

    Parameters
    ----------
    df : deal-level table containing the AGG_COLS columns plus `month` and
        `volume`.
    month : first day of the month to predict.

    Returns
    -------
    DataFrame with the AGG_COLS columns and a non-negative `prediction` column,
    one row per AGG_COLS combination present in `df`.
    """
    # NOTE(security): dill.load can execute arbitrary code while unpickling;
    # only load the model artifact shipped next to this file.
    with open(MODEL_FILE, "rb") as model_file:
        model = dill.load(model_file)

    # Aggregate deals into one monthly series per AGG_COLS combination.
    group_ts = df.groupby(AGG_COLS + ["month"])["volume"].sum().unstack(fill_value=0)
    features = get_features(group_ts, month)

    # Text columns are converted to categoricals, as was done before training.
    for c in features.columns:
        if features[c].dtype == 'object':
            features[c] = features[c].astype('category')

    # The model was fitted on log1p(volume): invert with expm1 and floor at 0,
    # because volumes cannot be negative (and RMSLE is undefined for them).
    predictions = np.clip(np.expm1(model.predict(features[FTS_COLS])), 0, None)

    preds_df = features[AGG_COLS].copy()
    preds_df["prediction"] = predictions
    return preds_df

Overwriting predict.py


In [35]:
import predict
import importlib
importlib.reload(predict)

<module 'predict' from 'C:\\Users\\dimacv\\PROJECTS\\Соревнования\\Sibur2021\\predict.py'>

In [36]:
predict.get_features(group_ts.iloc[:, :-1], pd.Timestamp("2020-07-01"))

Unnamed: 0,material_code,company_code,country,region,manager_code,month,vol_tm6,vol_tm5,vol_tm4,vol_tm3,vol_tm2,vol_tm1,last_year_avg,mdad2
0,124,7278,Россия,Респ. Татарстан,17460,7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,133,0,Белоруссия,Минская обл.,10942,7,17.0,72.0,250.0,394.0,288.0,210.0,179.833333,39.0
2,133,0,Белоруссия,Могилевская обл.,10942,7,0.0,0.0,0.0,166.0,62.0,0.0,19.000000,31.0
3,133,0,Белоруссия,г. Минск,10942,7,0.0,0.0,0.0,21.0,0.0,0.0,22.416667,0.0
4,133,0,Казахстан,г. Нур-Султан,13301,7,0.0,50.0,40.0,0.0,0.0,50.0,30.000000,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,986,9943,Россия,Смоленская обл.,17460,7,0.0,21.0,63.0,125.0,84.0,84.0,31.416667,0.0
937,998,0,Россия,Ленинградская обл.,18079,7,5.0,5.0,5.0,0.0,3.0,3.0,4.750000,0.0
938,998,3380,Россия,Ленинградская обл.,14956,7,73.0,74.0,122.0,100.0,15.0,30.0,85.583333,7.5
939,998,5410,Россия,г. Санкт-Петербург,14956,7,100.0,180.0,180.0,100.0,140.0,40.0,118.333333,50.0


In [37]:
# Smoke-test the generated module: predict July 2020 from deals dated strictly
# before that month, then display the result.
ts_preds_tst = predict.predict(data[data.month<"2020-07-01"], pd.Timestamp("2020-07-01"))
ts_preds_tst 

Unnamed: 0,material_code,company_code,country,region,manager_code,prediction
0,124,7278,Россия,Респ. Татарстан,17460,1.119568
1,133,0,Белоруссия,Минская обл.,10942,133.063111
2,133,0,Белоруссия,Могилевская обл.,10942,6.955001
3,133,0,Белоруссия,г. Минск,10942,1.815469
4,133,0,Казахстан,г. Нур-Султан,13301,15.046590
...,...,...,...,...,...,...
936,986,9943,Россия,Смоленская обл.,17460,40.790878
937,998,0,Россия,Ленинградская обл.,18079,2.157092
938,998,3380,Россия,Ленинградская обл.,14956,23.421482
939,998,5410,Россия,г. Санкт-Петербург,14956,60.735707


In [38]:
# RMSLE of the saved model's predictions against the last month of group_ts.
# NOTE(review): `reg` was refit on every month up to group_ts.columns[-1] before
# it was saved, so this score is in-sample rather than a held-out test error.
# NOTE(review): rows are paired by position, which assumes ts_preds_tst and
# group_ts contain the same groups in the same order - TODO confirm.
print("Ошибка на тестовом множестве:",
      f'{np.sqrt(mean_squared_log_error(group_ts.reset_index().iloc[:,-1], ts_preds_tst["prediction"])):.4f}')

Ошибка на тестовом множестве: 1.5530


In [None]:
# Pack the saved model, requirements.txt and predict.py into Stack_31.zip.
!tar.exe -a -c -f Stack_31.zip STACK_1.cbm requirements.txt predict.py