In [6]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

load_dotenv()

YC_ACCESS_KEY_ID = os.getenv("YC_ACCESS_KEY_ID")
YC_SECRET_ACCESS_KEY = os.getenv("YC_SECRET_ACCESS_KEY")
YC_ENDPOINT_URL = os.getenv("YC_ENDPOINT_URL")
YC_BUCKET_NAME = os.getenv("YC_BUCKET_NAME")

In [7]:
session = boto3.session.Session()
s3_client = session.client(
    service_name='s3',
    endpoint_url=YC_ENDPOINT_URL,
    aws_access_key_id=YC_ACCESS_KEY_ID,
    aws_secret_access_key=YC_SECRET_ACCESS_KEY
)

In [8]:
import gzip

file_names = ["item_categories.csv.gzip", "items.csv.gzip", "sample_submission.csv.gzip", "shops.csv.gzip", "test.csv.gzip", "train.csv.gzip"]
data_location = "filtered_data/"

data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    data_storage[file_name[: len(file_name) - 9]] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')

data_storage["train"]['date'] = pd.to_datetime(data_storage["train"]['date'], format = '%Y-%m-%d')



In [55]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

from src.data_preprocessor import DataPreprocessor
from src.validation_schema import TimeSeriesRollingValidator


proc = DataPreprocessor(data_storage)
validator = TimeSeriesRollingValidator(data_storage['train'],  'date_block_num', train_window=24, test_window=1)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
#splits = validator.split_data_rolling()
data_storage['train'] = proc.group_by_month(data_storage['train'])

In [46]:
data_storage['train'] = proc.add_revenue(data_storage['train'])

In [48]:
data_storage['train'] = proc.add_month_and_days(data_storage['train'])

In [50]:
data_storage['train'] = proc.add_cat_features(data_storage['train'])

cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
data_storage['train'] = proc.encode_categorical_features(data_storage['train'], cat_features=cat_features)


In [51]:
data_storage['train'] 

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month_unclipped,purch_cnt_month_,item_price,item_cnt_month_,revenue,month,days_in_month,item_name,item_category_id,item_category_name,item_category_type,item_category_subtype,shop_name,shop_city,shop_type
0,0,2,27,1.0,1,2499.0,1.0,2499.0,1,31,"007 Legends [PS3, русская версия]",19,Игры - PS3,5,10,"Адыгея ТЦ ""Мега""",0,6
1,0,2,33,1.0,1,499.0,1.0,499.0,1,31,1+1 (BD),37,Кино - Blu-Ray,11,1,"Адыгея ТЦ ""Мега""",0,6
2,0,2,317,1.0,1,299.0,1.0,299.0,1,31,1С:Аудиокниги. Мединский В. Мифы о России. О р...,45,Книги - Аудиокниги 1С,12,23,"Адыгея ТЦ ""Мега""",0,6
3,0,2,438,1.0,1,299.0,1.0,299.0,1,31,1С:Аудиотеатр. Лучшие произведения русских пис...,45,Книги - Аудиокниги 1С,12,23,"Адыгея ТЦ ""Мега""",0,6
4,0,2,471,2.0,2,399.0,2.0,798.0,1,31,1С:Бухгалтерия 8 (ред.3.0) как на ладони. Изд ...,49,Книги - Методические материалы 1С,12,39,"Адыгея ТЦ ""Мега""",0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310059,33,59,22087,6.0,3,119.0,6.0,714.0,10,31,Элемент питания DURACELL LR03-BC2,83,Элементы питания,19,64,"Ярославль ТЦ ""Альтаир""",27,6
1310060,33,59,22088,2.0,2,119.0,2.0,238.0,10,31,Элемент питания DURACELL LR06-BC2,83,Элементы питания,19,64,"Ярославль ТЦ ""Альтаир""",27,6
1310061,33,59,22091,1.0,1,179.0,1.0,179.0,10,31,Элемент питания DURACELL TURBO LR 03 2*BL,83,Элементы питания,19,64,"Ярославль ТЦ ""Альтаир""",27,6
1310062,33,59,22100,1.0,1,629.0,1.0,629.0,10,31,Энциклопедия Adventure Time,42,"Книги - Артбуки, энциклопедии",12,19,"Ярославль ТЦ ""Альтаир""",27,6


In [13]:
data_storage['train']['purch_cnt_month_unclipped'].nunique()

31

In [78]:
proc.group_by_month(splits[0][0])

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month_,purch_cnt_month,item_price
0,0,2,27,1.0,1,2499.0
1,0,2,33,1.0,1,499.0
2,0,2,317,1.0,1,299.0
3,0,2,438,1.0,1,299.0
4,0,2,471,2.0,2,399.0
...,...,...,...,...,...,...
990148,23,59,22076,2.0,2,58.0
990149,23,59,22087,8.0,6,99.0
990150,23,59,22088,9.0,7,99.0
990151,23,59,22091,10.0,5,139.0


In [69]:
df = proc.group_by_month(splits[0][0])
print(df)

proc.add_revenue(df)

        date_block_num  shop_id  item_id  item_cnt_month_  purch_cnt_month
0                    0        2       27              1.0                1
1                    0        2       33              1.0                1
2                    0        2      317              1.0                1
3                    0        2      438              1.0                1
4                    0        2      471              2.0                2
...                ...      ...      ...              ...              ...
990148              23       59    22076              2.0                2
990149              23       59    22087              8.0                6
990150              23       59    22088              9.0                7
990151              23       59    22091             10.0                5
990152              23       59    22092              3.0                3

[990153 rows x 5 columns]


KeyError: 'item_price'

In [52]:
def preprocess_dataframe(df, proc):
    df = proc.group_by_month(df)

    df = proc.add_month_and_days(df, month_column='date_block_num')

    df = proc.add_cat_features(df)

    cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
    df = proc.encode_categorical_features(df, cat_features=cat_features)


    

    df = proc.add_lag_features(df, 
                               time_column = 'date_block_num', 
                               feature_column='purch_cnt_month', 
                               group_level=['shop_id', 'item_cnt_month_', 'item_id'], 
                               lags=[1,2,3])
    
    df = proc.add_lag_features(df, 
                            time_column = 'date_block_num', 
                            feature_column='purch_cnt_month', 
                            group_level=['shop_id', 'item_cnt_month_', 'item_id'], 
                            lags=[1,2])
    
    df = df.drop(columns=['item_name', 'item_category_name', 'shop_name', 'purch_cnt_month'], errors='ignore')

    return df

In [53]:
preprocess_dataframe(splits[0][0], proc)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month_,month,days_in_month,item_category_id,item_category_type,item_category_subtype,shop_city,shop_type,purch_cnt_month_lag_1_x,purch_cnt_month_lag_2_x,purch_cnt_month_lag_3,purch_cnt_month_lag_1_y,purch_cnt_month_lag_2_y
0,0,2,27,1.0,1,31,19,5,10,0,6,0.0,0.0,0.0,0.0,0.0
1,0,2,33,1.0,1,31,37,11,1,0,6,0.0,0.0,0.0,0.0,0.0
2,0,2,317,1.0,1,31,45,12,21,0,6,0.0,0.0,0.0,0.0,0.0
3,0,2,438,1.0,1,31,45,12,21,0,6,0.0,0.0,0.0,0.0,0.0
4,0,2,471,2.0,1,31,49,12,35,0,6,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990148,23,59,22076,2.0,12,31,30,8,50,27,6,0.0,0.0,0.0,0.0,0.0
990149,23,59,22087,8.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0
990150,23,59,22088,9.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0
990151,23,59,22091,10.0,12,31,83,19,59,27,6,0.0,0.0,0.0,0.0,0.0


In [55]:
from tqdm import tqdm
prep_splits = []

for train, val, test in tqdm(splits):
    train = preprocess_dataframe(train, proc)
    val = preprocess_dataframe(val, proc)
    test = preprocess_dataframe(test, proc)
    
    prep_splits.append((train, val, test))

  0%|          | 0/9 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:41<00:00,  4.65s/it]


In [56]:
from sklearn.metrics import root_mean_squared_error
from sklearn.base import clone
from tqdm import tqdm



def get_rmse_for_model(model, prep_splits):
    models = list()
    for train_set, _, _ in tqdm(prep_splits):
        X_train = train_set.drop(columns=['item_cnt_month_'])
        y_train = train_set['item_cnt_month_']
        
        model = clone(model)
        model.fit(X_train, y_train)
        
        models.append(model)

    return validator.validate(models, prep_splits, "item_cnt_month_")
    

In [57]:
# SVR  из sklearn не использует gpu, не дождался потом пофикшу.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

models = [LinearRegression(fit_intercept=True),
        #   "SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),"
          RandomForestRegressor(n_estimators=5, max_depth=2, random_state=42),
          GradientBoostingRegressor(n_estimators=5, max_depth=2, random_state=42)]



for model in models:
    print(model.__class__.__name__)
    print(get_rmse_for_model(model, prep_splits))



# PS  очень противно работать с моделями из sklearn, КАК ЖЕ МЕДЛЕННО ОНИ РАБОТАЮТ
# думаю стоит наконец забыть про эту мертвую библиотеку
# мб перепистать это с норм моделями 

LinearRegression


100%|██████████| 9/9 [00:04<00:00,  2.22it/s]


              model      rmse  train_months  val_months  test_months
0  LinearRegression  2.697763        990153       37904        35057
1  LinearRegression  2.746446        983909       35057        34913
2  LinearRegression  2.969355        976864       34913        29668
3  LinearRegression  2.681369        962429       29668        29331
4  LinearRegression  2.683248        949623       29331        29783
5  LinearRegression  2.551211        936540       29783        31576
6  LinearRegression  2.705672        920825       31576        32551
7  LinearRegression  3.047690        905737       32551        28860
8  LinearRegression  2.824403        891815       28860        30268
RandomForestRegressor


100%|██████████| 9/9 [00:19<00:00,  2.21s/it]


                   model      rmse  train_months  val_months  test_months
0  RandomForestRegressor  2.723645        990153       37904        35057
1  RandomForestRegressor  2.760198        983909       35057        34913
2  RandomForestRegressor  2.970060        976864       34913        29668
3  RandomForestRegressor  2.689123        962429       29668        29331
4  RandomForestRegressor  2.678613        949623       29331        29783
5  RandomForestRegressor  2.558840        936540       29783        31576
6  RandomForestRegressor  2.698434        920825       31576        32551
7  RandomForestRegressor  3.095327        905737       32551        28860
8  RandomForestRegressor  3.069812        891815       28860        30268
GradientBoostingRegressor


100%|██████████| 9/9 [00:27<00:00,  3.10s/it]

                       model      rmse  train_months  val_months  test_months
0  GradientBoostingRegressor  2.689677        990153       37904        35057
1  GradientBoostingRegressor  2.740385        983909       35057        34913
2  GradientBoostingRegressor  2.983503        976864       34913        29668
3  GradientBoostingRegressor  2.672377        962429       29668        29331
4  GradientBoostingRegressor  2.674052        949623       29331        29783
5  GradientBoostingRegressor  2.541664        936540       29783        31576
6  GradientBoostingRegressor  2.704927        920825       31576        32551
7  GradientBoostingRegressor  3.058925        905737       32551        28860
8  GradientBoostingRegressor  2.822756        891815       28860        30268





In [58]:
# попробуем что-то поприкольней optuna + xgboost

import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import root_mean_squared_error


def create_objective(X_train, y_train, X_val, y_val):
    def objective(trial):
        param = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'eta': trial.suggest_float('eta', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
        }

        model = xgb.XGBRegressor(**param)
        model.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False)

        preds = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, preds)
        
        return rmse
    return objective

In [59]:
models = list()
for train, val, test in tqdm(prep_splits):
    X_train = train.drop(columns=['item_cnt_month_'])
    y_train = train['item_cnt_month_']
    X_val = val.drop(columns=['item_cnt_month_'])
    y_val = val['item_cnt_month_']

    objective = create_objective(X_train, y_train, X_val, y_val)
    
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=10)

    model = xgb.XGBRegressor(**study.best_params).fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    models.append(model)




  0%|          | 0/9 [00:00<?, ?it/s]

[I 2025-08-21 11:43:36,137] A new study created in memory with name: no-name-5548d2ef-9621-4cd6-810a-071e9959439f
[I 2025-08-21 11:43:37,915] Trial 0 finished with value: 2.485625545019066 and parameters: {'eta': 0.25163823716055983, 'max_depth': 5, 'subsample': 0.7187495845368105, 'colsample_bytree': 0.7081492367563205, 'min_child_weight': 7, 'lambda': 0.0032132950600982464, 'alpha': 4.607483858278837e-06}. Best is trial 0 with value: 2.485625545019066.
[I 2025-08-21 11:43:39,701] Trial 1 finished with value: 2.4484782242282512 and parameters: {'eta': 0.20643986791642144, 'max_depth': 7, 'subsample': 0.6482272578924979, 'colsample_bytree': 0.7076939759082905, 'min_child_weight': 10, 'lambda': 5.564197972996599e-06, 'alpha': 4.824752629196655e-06}. Best is trial 1 with value: 2.4484782242282512.
[I 2025-08-21 11:43:40,753] Trial 2 finished with value: 2.8835247096453127 and parameters: {'eta': 0.024813698585918732, 'max_depth': 3, 'subsample': 0.7710528448677052, 'colsample_bytree': 0.

In [60]:
validation_results = validator.validate(models, prep_splits, "item_cnt_month_")
# думаю достойно попробую засабмитить 

validation_results

Unnamed: 0,model,rmse,train_months,val_months,test_months
0,XGBRegressor,2.176334,990153,37904,35057
1,XGBRegressor,2.205024,983909,35057,34913
2,XGBRegressor,2.540444,976864,34913,29668
3,XGBRegressor,2.123136,962429,29668,29331
4,XGBRegressor,2.123818,949623,29331,29783
5,XGBRegressor,2.057477,936540,29783,31576
6,XGBRegressor,2.080635,920825,31576,32551
7,XGBRegressor,2.542696,905737,32551,28860
8,XGBRegressor,2.324976,891815,28860,30268


In [14]:
train_monthly_agg = data_storage['train'].groupby(['shop_id', 'item_id', 'date_block_num']).agg(
    item_cnt_month=('item_cnt_day', 'sum')
).reset_index()

In [15]:
test = data_storage["test"]
test["date_block_num"] = 34

In [16]:
import pandas as pd
import numpy as np

def add_lags_to_test(test: pd.DataFrame, n_lags: list = [1, 2, 3, 6, 12]) -> pd.DataFrame:

    final_test_df = test.copy()
    final_test_df['date_block_num'] = 34
    
    for lag in n_lags:
        lag_df = train_monthly_agg.copy()
        
        lag_df['date_block_num'] += lag
        lag_col_name = f'item_cnt_month_lag_{lag}'
        lag_df.rename(columns={'item_cnt_month': lag_col_name}, inplace=True)
        
        lag_df.drop_duplicates(subset=['shop_id', 'item_id', 'date_block_num'], inplace=True)
        
        final_test_df = pd.merge(final_test_df, lag_df, on=['shop_id', 'item_id', 'date_block_num'], how='left')

    lag_cols = [f'item_cnt_month_lag_{lag}' for lag in n_lags]
    final_test_df[lag_cols] = final_test_df[lag_cols].fillna(0)
    
    return final_test_df

In [17]:
df = add_lags_to_test(test, n_lags=list(range(1, 13)))
proc.add_item_price_to_test(data_storage['train'], df)
df = proc.add_month_and_days(df, month_column='date_block_num')
df = proc.add_cat_features(df)

cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
df = proc.encode_categorical_features(df, cat_features=cat_features)

df = df.drop(columns=['item_name', 'item_category_name', 'shop_name'], errors='ignore')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df["item_price"].fillna(test_df["item_price"].mean(), inplace=True)
