In [1]:
import numpy as np
import pandas as pd
from typing import Generator, Tuple
from sklearn.linear_model import LinearRegression
from scripts import validation as tsvm

In [2]:
# Time-series splitter that is handed to the model and used for the split sanity checks.
# NOTE(review): the units of these sizes (presumably distinct `date__month` values)
# are defined in scripts.validation, not in this notebook — confirm there.
validator = tsvm.ValidationTimeSeriesSplit(
    min_train_size=3,
    val_size=1,
    gap=0,
    test_size=1
)

In [3]:
# Hyper-parameters forwarded to tsvm.XGBModel via its `params` argument.
# NOTE(review): how each key is consumed is decided inside scripts.validation.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'early_stopping_rounds': 100,
    'eval_metric': 'rmse',
    'enable_categorical': True,
}

# The model shares the splitter configured above.
model = tsvm.XGBModel(validator=validator, params=xgb_params)

In [4]:
# Daily sales records used for training. The aggregation cell below reads the columns
# date__month, shop_id, item_id, item_cnt_day, item_price, item_category_id, shop_name,
# date__day_of_month, date__month_of_year and date__year from this frame.
# Both paths are relative to the kernel's working directory.
da = pd.read_csv('sales_train_complete.csv')
# Kaggle test set; later cells use its ID, shop_id and item_id columns.
kaggle_test = pd.read_csv('../competitive-data-science-predict-future-sales/test.csv')

In [5]:
# Collapse the daily sales rows into one row per (month index, shop, item).
# Named aggregation produces the final column names and column order directly,
# so no separate rename / reorder step is needed. The former
# `if 'date' in df_monthly.columns` guard was removed: the aggregated frame only
# ever contains the group keys and the columns listed here, so it could never fire.
df_monthly = (
    da.groupby(['date__month', 'shop_id', 'item_id'])
    .agg(
        # Last price in the group, in the row order of `da`.
        item_price=('item_price', 'last'),
        # Monthly target: total units sold.
        item_cnt_month=('item_cnt_day', 'sum'),
        item_category_id=('item_category_id', 'first'),
        shop_name=('shop_name', 'first'),
        # Number of daily rows with a non-null day (not the number of distinct days).
        sales_days_in_month=('date__day_of_month', 'count'),
        date__month_of_year=('date__month_of_year', 'first'),
        date__year=('date__year', 'first'),
    )
    .reset_index()
)

In [6]:
# Inspect the aggregated frame: one row per (date__month, shop_id, item_id).
df_monthly

Unnamed: 0,date__month,shop_id,item_id,item_price,item_cnt_month,item_category_id,shop_name,sales_days_in_month,date__month_of_year,date__year
0,0,2,33,499.0,1.0,37,"Адыгея ТЦ ""Мега""",1,1,2013
1,0,2,482,3300.0,1.0,73,"Адыгея ТЦ ""Мега""",1,1,2013
2,0,2,491,600.0,1.0,73,"Адыгея ТЦ ""Мега""",1,1,2013
3,0,2,839,3300.0,1.0,73,"Адыгея ТЦ ""Мега""",1,1,2013
4,0,2,1007,449.0,3.0,67,"Адыгея ТЦ ""Мега""",3,1,2013
...,...,...,...,...,...,...,...,...,...,...
540592,33,59,22087,119.0,6.0,83,"Ярославль ТЦ ""Альтаир""",3,10,2015
540593,33,59,22088,119.0,2.0,83,"Ярославль ТЦ ""Альтаир""",2,10,2015
540594,33,59,22091,179.0,1.0,83,"Ярославль ТЦ ""Альтаир""",1,10,2015
540595,33,59,22100,629.0,1.0,42,"Ярославль ТЦ ""Альтаир""",1,10,2015


In [7]:
def prepare_test_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the feature frame for the month that follows the last training month.

    Each test row is enriched with:

    * ``item_price`` - the price of that exact (shop, item) pair in the last
      training month, falling back to the most recent price of the item in any
      shop; NaN when the item never appears in ``train_df``.
    * ``item_category_id`` - looked up by ``item_id``; NaN for unseen items.
    * ``date__month`` / ``date__month_of_year`` / ``date__year`` - the calendar
      position of the month after ``train_df['date__month'].max()``.

    Parameters
    ----------
    train_df : pd.DataFrame
        Needs the columns ``date__month``, ``shop_id``, ``item_id``,
        ``item_price``, ``item_category_id`` and ``date__year``.
        ``date__month`` is treated as a running month index in which a
        multiple of 12 is a January.
    test_df : pd.DataFrame
        Needs the columns ``ID``, ``shop_id`` and ``item_id``.

    Returns
    -------
    pd.DataFrame
        One row per row of ``test_df`` with the columns ``ID``, ``date__month``,
        ``shop_id``, ``item_id``, ``item_price``, ``item_category_id``,
        ``date__month_of_year`` and ``date__year``.

    Raises
    ------
    ValueError
        If ``train_df`` has no rows.
    """
    if train_df.empty:
        raise ValueError("train_df is empty; cannot derive the month to predict")

    last_month = train_df['date__month'].max()
    last_month_data = train_df[train_df['date__month'] == last_month]

    # One category per item. De-duplicating on item_id alone guarantees the
    # merge below cannot multiply test rows if an item carries two categories.
    item_features = (
        train_df[['item_id', 'item_category_id']]
        .drop_duplicates(subset='item_id')
    )

    # Price of the exact shop/item pair in the last training month
    # (unique per pair so the left merge keeps one row per test row).
    shop_item_prices = (
        last_month_data[['shop_id', 'item_id', 'item_price']]
        .drop_duplicates(subset=['shop_id', 'item_id'], keep='last')
    )
    test_enhanced = test_df.merge(
        shop_item_prices,
        on=['shop_id', 'item_id'],
        how='left'
    )

    # Fallback: most recent price of the item in any shop. The stable sort makes
    # "last" mean "latest month" regardless of the incoming row order.
    last_prices = (
        train_df.sort_values('date__month', kind='stable')
        .groupby('item_id')['item_price']
        .last()
        .rename('item_price_last')
        .reset_index()
    )
    test_enhanced = test_enhanced.merge(last_prices, on='item_id', how='left')
    test_enhanced['item_price'] = test_enhanced['item_price'].fillna(
        test_enhanced['item_price_last']
    )

    test_enhanced = test_enhanced.merge(
        item_features,
        on='item_id',
        how='left'
    )

    # Calendar fields of the month being predicted. The modulo is taken on the
    # *next* month so that December rolls over to January (1) instead of 13,
    # and the year advances together with that roll-over.
    next_month = last_month + 1
    next_month_of_year = next_month % 12 + 1
    next_year = last_month_data['date__year'].iloc[0]
    if next_month_of_year == 1:
        next_year = next_year + 1

    test_enhanced['date__month'] = next_month
    test_enhanced['date__year'] = next_year
    test_enhanced['date__month_of_year'] = next_month_of_year

    return test_enhanced[[
        'ID',
        'date__month',
        'shop_id',
        'item_id',
        'item_price',
        'item_category_id',
        'date__month_of_year',
        'date__year'
    ]]

In [8]:
# Build the feature rows for the month after the last training month.
prepared_test_df = prepare_test_data(df_monthly, kaggle_test)

# Remove the columns that are not used as model features. Rebinding the name and
# passing errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed columns.
df_monthly = df_monthly.drop(columns=['sales_days_in_month', 'shop_name'], errors='ignore')

# 'ID' only identifies the submission row, so it is left out of the frame passed to predict.
prepared_df = prepared_test_df.drop(columns='ID')
prepared_df

Unnamed: 0,date__month,shop_id,item_id,item_price,item_category_id,date__month_of_year,date__year
0,34,5,5037,1499.0,19.0,11,2015
1,34,5,5320,,,11,2015
2,34,5,5233,1199.0,19.0,11,2015
3,34,5,5232,1199.0,23.0,11,2015
4,34,5,5268,,,11,2015
...,...,...,...,...,...,...,...
214195,34,45,18454,99.0,55.0,11,2015
214196,34,45,16188,1359.0,64.0,11,2015
214197,34,45,15757,229.0,55.0,11,2015
214198,34,45,19648,79.2,40.0,11,2015


In [9]:
# Re-display the training frame after 'sales_days_in_month' and 'shop_name' were dropped.
df_monthly

Unnamed: 0,date__month,shop_id,item_id,item_price,item_cnt_month,item_category_id,date__month_of_year,date__year
0,0,2,33,499.0,1.0,37,1,2013
1,0,2,482,3300.0,1.0,73,1,2013
2,0,2,491,600.0,1.0,73,1,2013
3,0,2,839,3300.0,1.0,73,1,2013
4,0,2,1007,449.0,3.0,67,1,2013
...,...,...,...,...,...,...,...,...
540592,33,59,22087,119.0,6.0,83,10,2015
540593,33,59,22088,119.0,2.0,83,10,2015
540594,33,59,22091,179.0,1.0,83,10,2015
540595,33,59,22100,629.0,1.0,42,10,2015


In [10]:
# Sanity checks on the time-series validation setup before fitting.
# NOTE(review): what each check actually verifies is defined in scripts.validation.

# `test_idx` is returned with the splits but is not used again in this cell.
splits, test_idx = validator.split(df_monthly)

is_valid_splits = validator.validate_splits(df_monthly, splits)

has_minimum_data = validator.check_minimum_data(df_monthly)

# Only these three columns are checked against the target.
features = ['shop_id', 'item_id', 'item_price']
target = 'item_cnt_month'
has_leakage = validator.check_target_leakage(features, target, df_monthly)

# `ks` is returned next to the flag but is not reported below.
is_validation_sufficient, ks = validator.check_validation_sufficiency(df_monthly)


is_data_adequate = validator.check_data_adequacy(df_monthly)
# NOTE(review): the recorded run printed "Is there target leakage? True" — confirm
# whether True means leakage was detected, and investigate before trusting the scores.
print(f"\tVALIDATION:\nAre splits valid? {is_valid_splits}\nHas enough data for validation? {has_minimum_data}\nIs there target leakage? {has_leakage}\nIs validation representative? {is_validation_sufficient}\nIs there enough data for stable validation? {is_data_adequate}")

	VALIDATION:
Are splits valid? True
Has enough data for validation? True
Is there target leakage? True
Is validation representative? True
Is there enough data for stable validation? True


In [11]:
# Fit on the monthly frame; the recorded output logs one line per validation window.
# NOTE(review): the recorded repr names `scripts.validation_1.XGBModel`, while the import
# cell loads `scripts.validation` — re-run on a fresh kernel to confirm which module
# produced the stored output.
model.fit(df_monthly)

Window 1 - Train: 0 to 24310 | Val: 24311 to 32105 | Best Iteration: 9 | RMSE: 2.7905
Window 2 - Train: 0 to 32105 | Val: 32106 to 40000 | Best Iteration: 31 | RMSE: 2.6094
Window 3 - Train: 0 to 40000 | Val: 40001 to 48481 | Best Iteration: 37 | RMSE: 2.3529
Window 4 - Train: 0 to 48481 | Val: 48482 to 57568 | Best Iteration: 311 | RMSE: 2.1172
Window 5 - Train: 0 to 57568 | Val: 57569 to 67303 | Best Iteration: 273 | RMSE: 2.4693
Window 6 - Train: 0 to 67303 | Val: 67304 to 76596 | Best Iteration: 112 | RMSE: 2.9411
Window 7 - Train: 0 to 76596 | Val: 76597 to 86665 | Best Iteration: 223 | RMSE: 3.9224
Window 8 - Train: 0 to 86665 | Val: 86666 to 97686 | Best Iteration: 8 | RMSE: 3.4474
Window 9 - Train: 0 to 97686 | Val: 97687 to 112524 | Best Iteration: 364 | RMSE: 3.8842
Window 10 - Train: 0 to 112524 | Val: 112525 to 125241 | Best Iteration: 41 | RMSE: 2.7312
Window 11 - Train: 0 to 125241 | Val: 125242 to 137640 | Best Iteration: 499 | RMSE: 2.4031
Window 12 - Train: 0 to 137640

<scripts.validation_1.XGBModel at 0x1a95efa42f0>

In [12]:
# NOTE(review): the displayed prepared_df has NaN item_price / item_category_id for some
# rows, and item_category_id is float there but integer in df_monthly — confirm the
# model handles both as intended.
predictions = model.predict(prepared_df)

In [13]:
# Attach the predictions and strip the lookup keys from the frame that is written out.
# Rebinding with assign/drop(errors='ignore') instead of mutating in place keeps this
# cell re-runnable: a second execution no longer raises KeyError on the dropped columns.
kaggle_test = (
    kaggle_test
    .assign(item_cnt_month=predictions)
    .drop(columns=['shop_id', 'item_id'], errors='ignore')
)

In [14]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame row index out of the CSV.
kaggle_test.to_csv('test_submission.csv', index=False)
