In [1]:
#Library imports and environment setup
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

BASE = r"C:\Users\danao\Downloads\competitive-data-science-predict-future-sales"
TEST_BLOCK = 34

In [2]:
#Load CSV files and parse dates
sales = pd.read_csv(BASE + r"\sales_train.csv")
items = pd.read_csv(BASE + r"\items.csv")
shops = pd.read_csv(BASE + r"\shops.csv")
cats  = pd.read_csv(BASE + r"\item_categories.csv")
test  = pd.read_csv(BASE + r"\test.csv")

sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

In [3]:
#Aggregate daily sales to monthly shop–item level
monthly = (
    sales
    .groupby(['date_block_num','shop_id','item_id'], as_index=False)
    .agg({
        'item_cnt_day':'sum',
        'item_price':'mean'
    })
    .rename(columns={'item_cnt_day':'item_cnt_month',
                     'item_price':'item_price_month'})
)

monthly['item_cnt_month'] = monthly['item_cnt_month'].clip(0,20)

In [4]:
#Build shop–item–month grid to block 32
unique_shops = sales['shop_id'].unique()
unique_items = pd.concat([
    sales['item_id'],
    test['item_id']
]).unique()

grid = []

for block in range(0,33):   # Only build up to 33
    for shop in unique_shops:
        block_items = monthly[(monthly['date_block_num']==block) &
                              (monthly['shop_id']==shop)]['item_id'].unique()
        combined_items = np.union1d(block_items, test['item_id'].unique())
        grid.append(np.array(
            np.meshgrid(np.array([block]), np.array([shop]), combined_items)
        ).T.reshape(-1,3))

grid = np.vstack(grid)

full = pd.DataFrame(grid, columns=['date_block_num','shop_id','item_id'])
full = full.astype({'date_block_num':'int16','shop_id':'int16','item_id':'int32'})

print(full.shape)

(11000446, 3)


In [5]:
#Merge monthly sales into the full grid
full = full.merge(monthly, on=['date_block_num','shop_id','item_id'], how='left')

print("Complete")

Complete


In [6]:
#Calendar features and missing prices
full['month'] = full['date_block_num'] % 12
full['year']  = full['date_block_num'] // 12
full['item_price_month'] = full['item_price_month'].fillna(0)

In [7]:
#Apply lag feature function
def add_lag(df, cols, target, lags):
    df = df.sort_values(cols + ['date_block_num'])
    for lag in lags:
        df[f"{target}_lag_{lag}"] = (
            df.groupby(cols)[target].shift(lag)
        )
    return df

full = add_lag(full, ['shop_id','item_id'], 'item_cnt_month', [1,2,3])
full = add_lag(full, ['item_id'], 'item_price_month', [1])

In [8]:
#Compute price trend feature
full['price_trend'] = full['item_price_month'] - full['item_price_month_lag_1']
full['price_trend'] = full['price_trend'].fillna(0)

In [9]:
#Revenue features and shop revenue lags
sales['revenue'] = sales['item_price'] * sales['item_cnt_day']

shop_rev = (
    sales.groupby(['date_block_num','shop_id'])['revenue'].sum().reset_index()
)

full = full.merge(shop_rev, on=['date_block_num','shop_id'], how='left')

full = add_lag(full, ['shop_id'], 'revenue', [1])

In [10]:
#Attach item categories and mean-encoded sales features
monthly = monthly.merge(
    items[['item_id', 'item_category_id']],
    on='item_id',
    how='left'
)

item_mean = (
    monthly.groupby('item_id')['item_cnt_month']
    .mean()
    .rename('item_mean')
)

shop_mean = (
    monthly.groupby('shop_id')['item_cnt_month']
    .mean()
    .rename('shop_mean')
)

cat_mean = (
    monthly.groupby('item_category_id')['item_cnt_month']
    .mean()
    .rename('cat_mean')
)
item_mean = monthly.groupby('item_id')['item_cnt_month'].mean().rename('item_mean')
shop_mean = monthly.groupby('shop_id')['item_cnt_month'].mean().rename('shop_mean')
cat_mean  = monthly.groupby('item_category_id')['item_cnt_month'].mean().rename('cat_mean')

full = full.merge(items[['item_id','item_category_id']], on='item_id', how='left')
full = full.merge(item_mean, on='item_id', how='left')
full = full.merge(shop_mean, on='shop_id', how='left')
full = full.merge(cat_mean, on='item_category_id', how='left')

for col in full.columns:
    if full[col].isna().any():
        full[col] = full[col].fillna(0)

In [11]:
#Create train/validation split and feature matrices
train = full[full['date_block_num'] < 33]
val   = full[full['date_block_num'] == 32]

TARGET = 'item_cnt_month'
features = [c for c in full.columns if c not in ['item_cnt_month', 'date_block_num']]

X_train, y_train = train[features], train[TARGET]
X_val,   y_val   = val[features],   val[TARGET]

print("Complete")

Complete


In [32]:
#Define evaluation metric and train XGBoost model
def rmse_clip(y_true, y_pred):
    y_true = np.clip(y_true, 0, 20)
    y_pred = np.clip(y_pred, 0, 20)
    return np.sqrt(mean_squared_error(y_true, y_pred))

recent_train = train[train['date_block_num'] >= 12]  # keep last ~2 years
X_train = recent_train[features]
y_train = recent_train['item_cnt_month']

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)

model = XGBRegressor(
    max_depth=8,
    min_child_weight=10,
    n_estimators=1600,
    learning_rate=0.02,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha = 0.2,
    reg_lambda = 1.2,
    objective='reg:squarederror',
    n_jobs=4,
    tree_method='hist'
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50    #This kernel takes a long time to run so this function just helps to see the progress
)

pred = model.predict(X_val)
print("VAL RMSE:", rmse_clip(y_val, pred))

Train shape: (6797997, 16)
Val shape: (308782, 16)
[0]	validation_0-rmse:0.96690
[50]	validation_0-rmse:0.58364
[100]	validation_0-rmse:0.49632
[150]	validation_0-rmse:0.47050
[200]	validation_0-rmse:0.45805
[250]	validation_0-rmse:0.45087
[300]	validation_0-rmse:0.44469
[350]	validation_0-rmse:0.43941
[400]	validation_0-rmse:0.43468
[450]	validation_0-rmse:0.43105
[500]	validation_0-rmse:0.42762
[550]	validation_0-rmse:0.42438
[600]	validation_0-rmse:0.42121
[650]	validation_0-rmse:0.41856
[700]	validation_0-rmse:0.41575
[750]	validation_0-rmse:0.41307
[800]	validation_0-rmse:0.41052
[850]	validation_0-rmse:0.40826
[900]	validation_0-rmse:0.40638
[950]	validation_0-rmse:0.40451
[1000]	validation_0-rmse:0.40265
[1050]	validation_0-rmse:0.40071
[1100]	validation_0-rmse:0.39897
[1150]	validation_0-rmse:0.39737
[1200]	validation_0-rmse:0.39575
[1250]	validation_0-rmse:0.39432
[1300]	validation_0-rmse:0.39276
[1350]	validation_0-rmse:0.39117
[1400]	validation_0-rmse:0.38942
[1450]	validati

In [34]:
#Construct test features and generate predictions
monthly_sales = (
    sales
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

test_features = test.copy()

test_features['date_block_num'] = TEST_BLOCK
test_features['month'] = 11      # November 2015
test_features['year'] = 2015

lags = [1, 2, 3, 6]
for lag in lags:
    lag_block = TEST_BLOCK - lag

    lag_df = (
        monthly_sales[monthly_sales['date_block_num'] == lag_block]
        [['shop_id', 'item_id', 'item_cnt_month']]
        .copy()
    )
    lag_df = lag_df.rename(
        columns={'item_cnt_month': f'item_cnt_month_lag_{lag}'}
    )

    test_features = test_features.merge(
        lag_df,
        on=['shop_id', 'item_id'],
        how='left'
    )

for lag in lags:
    col = f'item_cnt_month_lag_{lag}'
    if col in test_features.columns:
        test_features[col] = test_features[col].fillna(0)

for col in features:
    if col not in test_features.columns:
        test_features[col] = 0  # neutral default

X_test = test_features[features]

print("X_test shape:", X_test.shape)

test_pred = model.predict(X_test)
test_pred = np.clip(test_pred, 0, 20)

X_test shape: (214200, 16)


In [36]:
#Create and save kaggle submission file
submission = pd.DataFrame({
    'ID': test['ID'].astype(int),
    'item_cnt_month': test_pred
})

submission.to_csv(r"C:\Users\danao\OneDrive\Desktop\submission10.csv", index=False)
print("Saved submission")

Saved submission
