In [1]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

load_dotenv()

YC_ACCESS_KEY_ID = os.getenv("YC_ACCESS_KEY_ID")
YC_SECRET_ACCESS_KEY = os.getenv("YC_SECRET_ACCESS_KEY")
YC_ENDPOINT_URL = os.getenv("YC_ENDPOINT_URL")
YC_BUCKET_NAME = os.getenv("YC_BUCKET_NAME")

In [2]:
session = boto3.session.Session()
s3_client = session.client(
    service_name='s3',
    endpoint_url=YC_ENDPOINT_URL,
    aws_access_key_id=YC_ACCESS_KEY_ID,
    aws_secret_access_key=YC_SECRET_ACCESS_KEY
)

In [3]:
import gzip

file_names = ["item_categories.csv.gzip", "items.csv.gzip", "sample_submission.csv.gzip", "shops.csv.gzip", "test.csv.gzip", "train.csv.gzip"]
data_location = "filtered_data/"

data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    data_storage[file_name[: len(file_name) - 9]] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')

data_storage["train"]['date'] = pd.to_datetime(data_storage["train"]['date'], format = '%Y-%m-%d')



In [4]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

from src.data_preprocessor import DataPreprocessor
from src.validation_schema import TimeSeriesRollingValidator


proc = DataPreprocessor(data_storage)
validator = TimeSeriesRollingValidator(data_storage['train'],  'date_block_num', train_window=24, test_window=1)


In [18]:
from itertools import product
import numpy as np

def create_testlike_train(df):
    matrix = []
    min_date = df['date'].min()
    for i in range(df['date_block_num'].min(), df['date_block_num'].max()+1):
        shops = df[df['date_block_num'] == i]['shop_id'].unique()
        items = df[df['date_block_num'] == i]['item_id'].unique()
        month_start = min_date + pd.tseries.offsets.DateOffset(months = i)
        matrix.append( np.array( list(product([i],[month_start],shops,items))))
    df_new = pd.DataFrame(np.vstack(matrix),columns = ['date_block_num','month_start','shop_id','item_id'])
    pivot = pd.pivot_table(df, 
                            values = ['item_cnt_day'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'sum').reset_index()
    

    pivot2 = pd.pivot_table(df[df['item_cnt_day']>0], 
                            values = ['item_cnt_day'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'count').reset_index()
    pivot2.rename(columns={'item_cnt_day': 'purch_cnt_month'}, inplace=True)
    
    df_new = df_new.merge(right = pivot, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    df_new = df_new.merge(right = pivot2, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    
    df_new.rename(columns={'item_cnt_day': 'item_cnt_month_uncl'}, inplace=True)
    df_new['item_cnt_month_uncl'] = df_new['item_cnt_month_uncl'].fillna(0)
    df_new['item_cnt_month'] = df_new['item_cnt_month_uncl'].clip(0,20)
    return df_new

In [20]:
data_storage['train'] = create_testlike_train(data_storage['train'])

In [17]:
data_storage['train']

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.00,1.0
1,2013-01-03,0,25,2552,899.00,1.0
2,2013-01-06,0,25,2554,1709.05,1.0
3,2013-01-15,0,25,2555,1099.00,1.0
4,2013-01-10,0,25,2564,349.00,1.0
...,...,...,...,...,...,...
2361346,2015-10-10,33,25,7409,299.00,1.0
2361347,2015-10-09,33,25,7460,299.00,1.0
2361348,2015-10-14,33,25,7459,349.00,1.0
2361349,2015-10-22,33,25,7440,299.00,1.0


In [21]:
data_storage['train']

Unnamed: 0,date_block_num,month_start,shop_id,item_id,item_cnt_month_uncl,purch_cnt_month,item_cnt_month
0,0,2013-01-01,59,22154,1.0,1.0,1.0
1,0,2013-01-01,59,2552,0.0,,0.0
2,0,2013-01-01,59,2554,0.0,,0.0
3,0,2013-01-01,59,2555,0.0,,0.0
4,0,2013-01-01,59,2564,0.0,,0.0
...,...,...,...,...,...,...,...
8520159,33,2015-10-01,21,7635,0.0,,0.0
8520160,33,2015-10-01,21,7638,0.0,,0.0
8520161,33,2015-10-01,21,7640,0.0,,0.0
8520162,33,2015-10-01,21,7632,0.0,,0.0


In [22]:
data_storage['train'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8520164 entries, 0 to 8520163
Data columns (total 7 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date_block_num       object        
 1   month_start          datetime64[ns]
 2   shop_id              object        
 3   item_id              object        
 4   item_cnt_month_uncl  float64       
 5   purch_cnt_month      float64       
 6   item_cnt_month       float64       
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 455.0+ MB


In [23]:
#splits = validator.split_data_rolling()
data_storage['train'] = proc.group_by_month(data_storage['train'])

KeyError: "Column(s) ['item_cnt_day', 'item_price'] do not exist"

In [24]:
data_storage['train'] = data_storage['train'][data_storage['train']['date_block_num'] > 17]

In [25]:
import pandas as pd

def fill_missing_combinations(df: pd.DataFrame) -> pd.DataFrame:
    unique_date_block_num = df['date_block_num'].unique()
    unique_shop_id = df['shop_id'].unique()
    unique_item_id = df['item_id'].unique()

    all_combinations = pd.MultiIndex.from_product(
        [unique_date_block_num, unique_shop_id, unique_item_id],
        names=['date_block_num', 'shop_id', 'item_id']
    ).to_frame(index=False)
    
    merged_df = pd.merge(
        all_combinations,
        df,
        on=['date_block_num', 'shop_id', 'item_id'],
        how='left'
    )

    merged_df['item_cnt_month_'] = merged_df['item_cnt_month_'].fillna(0)
    
    return merged_df

In [9]:
data_storage['train'] = fill_missing_combinations(data_storage['train'])

In [26]:
data_storage['train']

Unnamed: 0,date_block_num,month_start,shop_id,item_id,item_cnt_month_uncl,purch_cnt_month,item_cnt_month
4856775,18,2014-07-01,16,5486,1.0,1.0,1.0
4856776,18,2014-07-01,16,22092,0.0,,0.0
4856777,18,2014-07-01,16,2416,5.0,2.0,5.0
4856778,18,2014-07-01,16,2215,0.0,,0.0
4856779,18,2014-07-01,16,2163,1.0,1.0,1.0
...,...,...,...,...,...,...,...
8520159,33,2015-10-01,21,7635,0.0,,0.0
8520160,33,2015-10-01,21,7638,0.0,,0.0
8520161,33,2015-10-01,21,7640,0.0,,0.0
8520162,33,2015-10-01,21,7632,0.0,,0.0


In [27]:
data_storage['train'] = proc.add_revenue(data_storage['train'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['revenue'] = 0


In [28]:
data_storage['train'] = proc.add_month_and_days(data_storage['train'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['month'] = train[month_column] % 12 + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['days_in_month'] = train['month'].map(days_in_month_dict)


In [29]:
data_storage['train'] = proc.add_cat_features(data_storage['train'])

cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
data_storage['train'] = proc.encode_categorical_features(data_storage['train'], cat_features=cat_features)


In [30]:
df = data_storage['train']

In [32]:
df

Unnamed: 0,date_block_num,month_start,shop_id,item_id,item_cnt_month_uncl,purch_cnt_month,item_cnt_month,revenue,month,days_in_month,item_name,item_category_id,item_category_name,item_category_type,item_category_subtype,shop_name,shop_city,shop_type
0,18,2014-07-01,16,5486,1.0,1.0,1.0,0,7,31,PINK FLOYD The Division Bell Original Record...,58,Музыка - Винил,12,27,"Коломна ТЦ ""Рио""",9,6
1,18,2014-07-01,16,22092,0.0,,0.0,0,7,31,Элемент питания DURACELL TURBO LR6 2*BL,83,Элементы питания,18,59,"Коломна ТЦ ""Рио""",9,6
2,18,2014-07-01,16,2416,5.0,2.0,5.0,0,7,31,"Counter Strike. Global Offensive [PC, Jewel, р...",30,Игры PC - Стандартные издания,7,51,"Коломна ТЦ ""Рио""",9,6
3,18,2014-07-01,16,2215,0.0,,0.0,0,7,31,COLTRANE JOHN Very Best Of,55,Музыка - CD локального производства,12,2,"Коломна ТЦ ""Рио""",9,6
4,18,2014-07-01,16,2163,1.0,1.0,1.0,0,7,31,CLAPTON ERIC & B.B. KING Riding With The King,55,Музыка - CD локального производства,12,2,"Коломна ТЦ ""Рио""",9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3663384,33,2015-10-01,21,7635,0.0,,0.0,0,10,31,WARHAMMER MINIATURES: Dark Elf Doomfire Warloc...,64,Подарки - Настольные игры,13,40,"Москва МТРЦ ""Афи Молл""",12,1
3663385,33,2015-10-01,21,7638,0.0,,0.0,0,10,31,WARHAMMER MINIATURES: Savage Orcs арт. 89-19,64,Подарки - Настольные игры,13,40,"Москва МТРЦ ""Афи Молл""",12,1
3663386,33,2015-10-01,21,7640,0.0,,0.0,0,10,31,WARHAMMER MINIATURES: Stormcast Eternals Palad...,64,Подарки - Настольные игры,13,40,"Москва МТРЦ ""Афи Молл""",12,1
3663387,33,2015-10-01,21,7632,0.0,,0.0,0,10,31,WARHAMMER ACCESSORIES: Space Marine Paint Set ...,64,Подарки - Настольные игры,13,40,"Москва МТРЦ ""Афи Молл""",12,1


In [34]:
train = df[['date_block_num', 'shop_id', 'item_id', 'month', 'days_in_month', 'item_category_id', 'shop_city', 'shop_type', 'item_category_type', 'item_category_subtype', 'item_cnt_month']]

In [17]:
train

Unnamed: 0,date_block_num,shop_id,item_id,month,days_in_month,item_category_id,shop_city,shop_type,item_category_type,item_category_subtype,item_cnt_month_
0,18,2,32,7,31,40,0,6,10,4,1.0
1,18,2,482,7,31,73,0,6,14,0,1.0
2,18,2,491,7,31,73,0,6,14,0,1.0
3,18,2,786,7,31,49,0,6,11,37,1.0
4,18,2,791,7,31,73,0,6,14,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
8995387,33,36,12230,10,31,78,14,5,14,43,0.0
8995388,33,36,12733,10,31,76,14,5,14,30,0.0
8995389,33,36,13092,10,31,36,14,5,8,15,0.0
8995390,33,36,16797,10,31,78,14,5,14,43,0.0


In [35]:
cat_features = ['month',
                'shop_id',
                'shop_city',
                'shop_type',
                'item_category_id',
                'item_category_type',
                'item_category_subtype',
                'days_in_month'
               ]

In [50]:
train['date_block_num'] = train['date_block_num'].astype(int)
train['shop_id'] = train['shop_id'].astype(int)
train['item_id'] = train['item_id'].astype(int)
train['month'] = train['month'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['date_block_num'] = train['date_block_num'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['shop_id'] = train['shop_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['item_id'] = train['item_id'].astype(int)
A value is trying to be set on a copy of a slic

In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3663389 entries, 0 to 3663388
Data columns (total 11 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   date_block_num         int64  
 1   shop_id                int64  
 2   item_id                int64  
 3   month                  int64  
 4   days_in_month          int64  
 5   item_category_id       int64  
 6   shop_city              int64  
 7   shop_type              int64  
 8   item_category_type     int64  
 9   item_category_subtype  int64  
 10  item_cnt_month         float64
dtypes: float64(1), int64(10)
memory usage: 307.4 MB


In [52]:
import lightgbm as lgb


params = {'metric': 'rmse',
          'objective': 'mse',
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise' : True,
          'random_state': 10}

# Prepare training and validation datasets with categorical features
train_data = lgb.Dataset(
    train[(train['date_block_num']  < 33)],

    label=train[(train['date_block_num'] < 33)]['item_cnt_month'],
    categorical_feature=cat_features
)

valid_data = lgb.Dataset(
    train[train['date_block_num'] == 33],
    label=train[train['date_block_num'] == 33]['item_cnt_month'],
    categorical_feature=cat_features,
    reference=train_data 
)




In [43]:
valid_data.data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218904 entries, 3444485 to 3663388
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date_block_num         218904 non-null  object 
 1   shop_id                218904 non-null  object 
 2   item_id                218904 non-null  object 
 3   month                  218904 non-null  object 
 4   days_in_month          218904 non-null  int64  
 5   item_category_id       218904 non-null  int64  
 6   shop_city              218904 non-null  int64  
 7   shop_type              218904 non-null  int64  
 8   item_category_type     218904 non-null  int64  
 9   item_category_subtype  218904 non-null  int64  
 10  item_cnt_month         218904 non-null  float64
dtypes: float64(1), int64(6), object(4)
memory usage: 20.0+ MB


In [53]:
# Train model
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1500,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
)

[LightGBM] [Info] Total Bins 546
[LightGBM] [Info] Number of data points in the train set: 3444485, number of used features: 11
[LightGBM] [Info] Start training from score 0.292880
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.745843	valid_1's rmse: 0.655132
[200]	training's rmse: 0.504707	valid_1's rmse: 0.443528
[300]	training's rmse: 0.334136	valid_1's rmse: 0.294522


KeyboardInterrupt: 

In [None]:
df = proc.group_by_month(splits[0][0])
print(df)

proc.add_revenue(df)

        date_block_num  shop_id  item_id  item_cnt_month_  purch_cnt_month
0                    0        2       27              1.0                1
1                    0        2       33              1.0                1
2                    0        2      317              1.0                1
3                    0        2      438              1.0                1
4                    0        2      471              2.0                2
...                ...      ...      ...              ...              ...
990148              23       59    22076              2.0                2
990149              23       59    22087              8.0                6
990150              23       59    22088              9.0                7
990151              23       59    22091             10.0                5
990152              23       59    22092              3.0                3

[990153 rows x 5 columns]


KeyError: 'item_price'

In [None]:
def preprocess_dataframe(df, proc):
    df = proc.group_by_month(df)

    df = proc.add_month_and_days(df, month_column='date_block_num')

    df = proc.add_cat_features(df)

    cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
    df = proc.encode_categorical_features(df, cat_features=cat_features)


    

    df = proc.add_lag_features(df, 
                               time_column = 'date_block_num', 
                               feature_column='purch_cnt_month', 
                               group_level=['shop_id', 'item_cnt_month_', 'item_id'], 
                               lags=[1,2,3])
    
    df = proc.add_lag_features(df, 
                            time_column = 'date_block_num', 
                            feature_column='purch_cnt_month', 
                            group_level=['shop_id', 'item_cnt_month_', 'item_id'], 
                            lags=[1,2])
    
    df = df.drop(columns=['item_name', 'item_category_name', 'shop_name', 'purch_cnt_month'], errors='ignore')

    return df

In [None]:
preprocess_dataframe(splits[0][0], proc)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month_,month,days_in_month,item_category_id,item_category_type,item_category_subtype,shop_city,shop_type,purch_cnt_month_lag_1_x,purch_cnt_month_lag_2_x,purch_cnt_month_lag_3,purch_cnt_month_lag_1_y,purch_cnt_month_lag_2_y
0,0,2,27,1.0,1,31,19,5,10,0,6,0.0,0.0,0.0,0.0,0.0
1,0,2,33,1.0,1,31,37,11,1,0,6,0.0,0.0,0.0,0.0,0.0
2,0,2,317,1.0,1,31,45,12,21,0,6,0.0,0.0,0.0,0.0,0.0
3,0,2,438,1.0,1,31,45,12,21,0,6,0.0,0.0,0.0,0.0,0.0
4,0,2,471,2.0,1,31,49,12,35,0,6,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990148,23,59,22076,2.0,12,31,30,8,50,27,6,0.0,0.0,0.0,0.0,0.0
990149,23,59,22087,8.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0
990150,23,59,22088,9.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0
990151,23,59,22091,10.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0


In [None]:
from tqdm import tqdm
prep_splits = []

for train, val, test in tqdm(splits):
    train = preprocess_dataframe(train, proc)
    val = preprocess_dataframe(val, proc)
    test = preprocess_dataframe(test, proc)
    
    prep_splits.append((train, val, test))

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:41<00:00,  4.65s/it]


In [None]:
from sklearn.metrics import root_mean_squared_error
from sklearn.base import clone
from tqdm import tqdm



def get_rmse_for_model(model, prep_splits):
    models = list()
    for train_set, _, _ in tqdm(prep_splits):
        X_train = train_set.drop(columns=['item_cnt_month_'])
        y_train = train_set['item_cnt_month_']
        
        model = clone(model)
        model.fit(X_train, y_train)
        
        models.append(model)

    return validator.validate(models, prep_splits, "item_cnt_month_")
    

In [None]:
# SVR  из sklearn не использует gpu, не дождался потом пофикшу.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

models = [LinearRegression(fit_intercept=True),
        #   "SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),"
          RandomForestRegressor(n_estimators=5, max_depth=2, random_state=42),
          GradientBoostingRegressor(n_estimators=5, max_depth=2, random_state=42)]



for model in models:
    print(model.__class__.__name__)
    print(get_rmse_for_model(model, prep_splits))



# PS  очень противно работать с моделями из sklearn, КАК ЖЕ МЕДЛЕННО ОНИ РАБОТАЮТ
# думаю стоит наконец забыть про эту мертвую библиотеку
# мб перепистать это с норм моделями 

LinearRegression


100%|██████████| 9/9 [00:04<00:00,  2.22it/s]


              model      rmse  train_months  val_months  test_months
0  LinearRegression  2.697763        990153       37904        35057
1  LinearRegression  2.746446        983909       35057        34913
2  LinearRegression  2.969355        976864       34913        29668
3  LinearRegression  2.681369        962429       29668        29331
4  LinearRegression  2.683248        949623       29331        29783
5  LinearRegression  2.551211        936540       29783        31576
6  LinearRegression  2.705672        920825       31576        32551
7  LinearRegression  3.047690        905737       32551        28860
8  LinearRegression  2.824403        891815       28860        30268
RandomForestRegressor


100%|██████████| 9/9 [00:19<00:00,  2.21s/it]


                   model      rmse  train_months  val_months  test_months
0  RandomForestRegressor  2.723645        990153       37904        35057
1  RandomForestRegressor  2.760198        983909       35057        34913
2  RandomForestRegressor  2.970060        976864       34913        29668
3  RandomForestRegressor  2.689123        962429       29668        29331
4  RandomForestRegressor  2.678613        949623       29331        29783
5  RandomForestRegressor  2.558840        936540       29783        31576
6  RandomForestRegressor  2.698434        920825       31576        32551
7  RandomForestRegressor  3.095327        905737       32551        28860
8  RandomForestRegressor  3.069812        891815       28860        30268
GradientBoostingRegressor


100%|██████████| 9/9 [00:27<00:00,  3.10s/it]

                       model      rmse  train_months  val_months  test_months
0  GradientBoostingRegressor  2.689677        990153       37904        35057
1  GradientBoostingRegressor  2.740385        983909       35057        34913
2  GradientBoostingRegressor  2.983503        976864       34913        29668
3  GradientBoostingRegressor  2.672377        962429       29668        29331
4  GradientBoostingRegressor  2.674052        949623       29331        29783
5  GradientBoostingRegressor  2.541664        936540       29783        31576
6  GradientBoostingRegressor  2.704927        920825       31576        32551
7  GradientBoostingRegressor  3.058925        905737       32551        28860
8  GradientBoostingRegressor  2.822756        891815       28860        30268





In [None]:
# попробуем что-то поприкольней optuna + xgboost

import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import root_mean_squared_error


def create_objective(X_train, y_train, X_val, y_val):
    def objective(trial):
        param = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'eta': trial.suggest_float('eta', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
        }

        model = xgb.XGBRegressor(**param)
        model.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False)

        preds = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, preds)
        
        return rmse
    return objective

In [None]:
models = list()
for train, val, test in tqdm(prep_splits):
    X_train = train.drop(columns=['item_cnt_month_'])
    y_train = train['item_cnt_month_']
    X_val = val.drop(columns=['item_cnt_month_'])
    y_val = val['item_cnt_month_']

    objective = create_objective(X_train, y_train, X_val, y_val)
    
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=10)

    model = xgb.XGBRegressor(**study.best_params).fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    models.append(model)




  0%|          | 0/9 [00:00<?, ?it/s]

[I 2025-08-21 11:43:36,137] A new study created in memory with name: no-name-5548d2ef-9621-4cd6-810a-071e9959439f
[I 2025-08-21 11:43:37,915] Trial 0 finished with value: 2.485625545019066 and parameters: {'eta': 0.25163823716055983, 'max_depth': 5, 'subsample': 0.7187495845368105, 'colsample_bytree': 0.7081492367563205, 'min_child_weight': 7, 'lambda': 0.0032132950600982464, 'alpha': 4.607483858278837e-06}. Best is trial 0 with value: 2.485625545019066.
[I 2025-08-21 11:43:39,701] Trial 1 finished with value: 2.4484782242282512 and parameters: {'eta': 0.20643986791642144, 'max_depth': 7, 'subsample': 0.6482272578924979, 'colsample_bytree': 0.7076939759082905, 'min_child_weight': 10, 'lambda': 5.564197972996599e-06, 'alpha': 4.824752629196655e-06}. Best is trial 1 with value: 2.4484782242282512.
[I 2025-08-21 11:43:40,753] Trial 2 finished with value: 2.8835247096453127 and parameters: {'eta': 0.024813698585918732, 'max_depth': 3, 'subsample': 0.7710528448677052, 'colsample_bytree': 0.

In [None]:
validation_results = validator.validate(models, prep_splits, "item_cnt_month_")
# думаю достойно попробую засабмитить 

validation_results

Unnamed: 0,model,rmse,train_months,val_months,test_months
0,XGBRegressor,2.176334,990153,37904,35057
1,XGBRegressor,2.205024,983909,35057,34913
2,XGBRegressor,2.540444,976864,34913,29668
3,XGBRegressor,2.123136,962429,29668,29331
4,XGBRegressor,2.123818,949623,29331,29783
5,XGBRegressor,2.057477,936540,29783,31576
6,XGBRegressor,2.080635,920825,31576,32551
7,XGBRegressor,2.542696,905737,32551,28860
8,XGBRegressor,2.324976,891815,28860,30268


In [None]:
train_monthly_agg = data_storage['train'].groupby(['shop_id', 'item_id', 'date_block_num']).agg(
    item_cnt_month=('item_cnt_day', 'sum')
).reset_index()

In [None]:
test = data_storage["test"]
test["date_block_num"] = 34

In [None]:
import pandas as pd
import numpy as np

def add_lags_to_test(test: pd.DataFrame, n_lags: list = [1, 2, 3, 6, 12]) -> pd.DataFrame:

    final_test_df = test.copy()
    final_test_df['date_block_num'] = 34
    
    for lag in n_lags:
        lag_df = train_monthly_agg.copy()
        
        lag_df['date_block_num'] += lag
        lag_col_name = f'item_cnt_month_lag_{lag}'
        lag_df.rename(columns={'item_cnt_month': lag_col_name}, inplace=True)
        
        lag_df.drop_duplicates(subset=['shop_id', 'item_id', 'date_block_num'], inplace=True)
        
        final_test_df = pd.merge(final_test_df, lag_df, on=['shop_id', 'item_id', 'date_block_num'], how='left')

    lag_cols = [f'item_cnt_month_lag_{lag}' for lag in n_lags]
    final_test_df[lag_cols] = final_test_df[lag_cols].fillna(0)
    
    return final_test_df

In [None]:
df = add_lags_to_test(test, n_lags=list(range(1, 13)))
proc.add_item_price_to_test(data_storage['train'], df)
df = proc.add_month_and_days(df, month_column='date_block_num')
df = proc.add_cat_features(df)

cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
df = proc.encode_categorical_features(df, cat_features=cat_features)

df = df.drop(columns=['item_name', 'item_category_name', 'shop_name'], errors='ignore')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["item_price"].fillna(test_df["item_price"].mean(), inplace=True)
