# Data loading

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import metrics
import lightgbm as lgb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

In [2]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

load_dotenv()

YC_ACCESS_KEY_ID = os.getenv("YC_ACCESS_KEY_ID")
YC_SECRET_ACCESS_KEY = os.getenv("YC_SECRET_ACCESS_KEY")
YC_ENDPOINT_URL = os.getenv("YC_ENDPOINT_URL")
YC_BUCKET_NAME = os.getenv("YC_BUCKET_NAME")

In [3]:
session = boto3.session.Session()
s3_client = session.client(
    service_name='s3',
    endpoint_url=YC_ENDPOINT_URL,
    aws_access_key_id=YC_ACCESS_KEY_ID,
    aws_secret_access_key=YC_SECRET_ACCESS_KEY
)

In [4]:
import gzip

file_names = ["item_categories.csv.gzip", "items.csv.gzip", "sample_submission.csv.gzip", "shops.csv.gzip", "test.csv.gzip", "train.csv.gzip"]
data_location = "filtered_data/"

data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    data_storage[file_name[: len(file_name) - 9]] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')

data_storage["train"]['date'] = pd.to_datetime(data_storage["train"]['date'], format = '%Y-%m-%d')



# Data preporation tools

In [5]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

from src.data_preprocessor import DataPreprocessor

proc = DataPreprocessor(data_storage)

In [6]:
sales           = data_storage['train']
items           = data_storage['items']
item_categories = data_storage['item_categories']
shops           = data_storage['shops']

In [7]:
def create_testlike_train(df):
    matrix = []
    min_date = df['date'].min()
    for i in range(df['date_block_num'].min(), df['date_block_num'].max()+1):
        shops = df[df['date_block_num'] == i]['shop_id'].unique()
        items = df[df['date_block_num'] == i]['item_id'].unique()
        month_start = min_date + pd.tseries.offsets.DateOffset(months = i)
        matrix.append( np.array( list(product([i],[month_start],shops,items))))
        
    df_new = pd.DataFrame(np.vstack(matrix),columns = ['date_block_num','month_start','shop_id','item_id'])
    pivot = pd.pivot_table(df, 
                            values = ['item_cnt_day'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'sum').reset_index()
    pivot2 = pd.pivot_table(df[df['item_cnt_day']>0], 
                            values = ['item_cnt_day'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'count').reset_index()
    pivot2.rename(columns={'item_cnt_day': 'purch_cnt_month'}, inplace=True)
    
    df_new = df_new.merge(right = pivot, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    df_new = df_new.merge(right = pivot2, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    
    df_new.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)
    df_new['item_cnt_month'] = df_new['item_cnt_month'].clip(0,20)
    df_new['item_cnt_month'].fillna(0, inplace=True)

    df_new['date_block_num'] = df_new['date_block_num'].astype('int8')
    df_new['shop_id'] = df_new['shop_id'].astype('int8')
    df_new['item_id'] = df_new['item_id'].astype('int16')
    df_new['item_cnt_month'] = df_new['item_cnt_month'].astype('float32')
    return df_new

In [8]:
%%time
df = create_testlike_train(sales)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_new['item_cnt_month'].fillna(0, inplace=True)


CPU times: user 16.5 s, sys: 1.94 s, total: 18.4 s
Wall time: 18.5 s


In [9]:
df = proc.add_month_and_days(df)

df = proc.add_cat_features(df)

cat_features = ['shop_city', 'shop_type', 'item_category_type', 'item_category_subtype']
df = proc.encode_categorical_features(df, cat_features=cat_features)
df = df[df['date_block_num'] > 20]

In [13]:
columns_to_exclude = [
                      'item_cnt_month',
                      'purch_cnt_month',
                      'month_start',
                      'shop_name',
                      'item_category_name',
                      'item_name'
                     ]
cat_features = ['month',
                'shop_id',
                'shop_city',
                'shop_type',
                'item_category_id',
                'item_category_type',
                'item_category_subtype',
                'days_in_month'
               ]

train_data = df.drop(columns_to_exclude, axis=1)

# Обучение модели 

In [None]:
from sklearn.metrics import root_mean_squared_error

def get_objective(splits, validator, categorical_features):
    def objective(trial):
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
            'random_state': 42,
            'n_jobs': -1
        }
    
        models = list()
        for train_set, val_set, test_set in splits:
            X_train = train_set.drop(columns=['item_cnt_month_'])
            y_train = train_set['item_cnt_month_']
            
            X_val = val_set.drop(columns=['item_cnt_month_'])
            y_val = val_set['item_cnt_month_']
            

            model = lgb.LGBMRegressor(**params)
            model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(10, verbose=False)],
            categorical_feature=categorical_features
            )
            
            models.append(model)

        return validator.validate(models, splits, "item_cnt_month")['rmse'].mean()

    return objective


In [None]:
wrapped_objective = get_objective(X_train, X_val, y_train, y_val, categorical_features)

# Создаем Optuna study и запускаем оптимизацию
# Теперь optimize() будет вызывать wrapped_objective
study = optuna.create_study(direction='minimize')
study.optimize(wrapped_objective, n_trials=50)

# Выводим лучшие гиперпараметры
print("Лучшие гиперпараметры:")
print(study.best_params)

In [None]:
params = {'metric': 'rmse',
          'objective': 'mse',
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise' : True,
          'random_state': 10}

train_data = lgb.Dataset(
    df[(df['date_block_num'] >= 19) & (df['date_block_num'] < 33)].drop(columns_to_exclude, axis=1),
    label=df[(df['date_block_num'] >= 19) & (df['date_block_num'] < 33)]['item_cnt_month'],
    categorical_feature=cat_features
)

valid_data = lgb.Dataset(
    df[df['date_block_num'] == 33].drop(columns_to_exclude, axis=1),
    label=df[df['date_block_num'] == 33]['item_cnt_month'],
    categorical_feature=cat_features,
    reference=train_data
)

lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1500,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation(100)]
)

[LightGBM] [Info] Total Bins 514
[LightGBM] [Info] Number of data points in the train set: 2711610, number of used features: 10
[LightGBM] [Info] Start training from score 0.295886
Training until validation scores don't improve for 10 rounds
[100]	training's rmse: 1.05425	valid_1's rmse: 0.921306


KeyboardInterrupt: 

# Validation schema

In [14]:
from src.validation_schema import TimeSeriesRollingValidator

validator = TimeSeriesRollingValidator(train_data, 'date_block_num', train_window=8, test_window=1)


In [15]:
splits = validator.split_data_rolling()

In [21]:
splits[0][0]

Unnamed: 0,date_block_num,shop_id,item_id,month,days_in_month,item_category_id,item_category_type,item_category_subtype,shop_city,shop_type
5589650,21,26,6183,10,31,24,5,17,12,6
5749133,21,50,9175,10,31,43,12,21,22,6
5749134,21,50,8598,10,31,55,13,2,22,6
5749135,21,50,8599,10,31,55,13,2,22,6
5749136,21,50,8605,10,31,40,11,4,22,6
...,...,...,...,...,...,...,...,...,...,...
7326636,28,14,18925,5,31,40,11,4,7,6
7326631,28,14,18701,5,31,55,13,2,7,6
7326637,28,14,18994,5,31,40,11,4,7,6
7326638,28,14,1851,5,31,23,5,16,7,6


In [None]:
from sklearn.linear_model import LinearRegression
models = list()
for train_set, val_set, test_set in splits:
    X_train = train_set.drop(columns=['item_cnt_month'])
    y_train = train_set['item_cnt_month']
    
    X_val = val_set.drop(columns=['item_cnt_month'])
    y_val = val_set['item_cnt_month']
    
    X_test = test_set.drop(columns=['item_cnt_month'])
    y_test = test_set['item_cnt_month']
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    models.append(model)

TypeError: float() argument must be a string or a real number, not 'Timestamp'

[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 3191064, number of used features: 10
[LightGBM] [Info] Start training from score 0.295206
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.05694	valid_1's rmse: 0.921393


KeyboardInterrupt: 