In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import metrics
import lightgbm as lgb

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

In [2]:
import os
import boto3
from dotenv import load_dotenv
import io 
import pandas as pd

load_dotenv()

YC_ACCESS_KEY_ID = os.getenv("YC_ACCESS_KEY_ID")
YC_SECRET_ACCESS_KEY = os.getenv("YC_SECRET_ACCESS_KEY")
YC_ENDPOINT_URL = os.getenv("YC_ENDPOINT_URL")
YC_BUCKET_NAME = os.getenv("YC_BUCKET_NAME")

In [3]:
session = boto3.session.Session()
s3_client = session.client(
    service_name='s3',
    endpoint_url=YC_ENDPOINT_URL,
    aws_access_key_id=YC_ACCESS_KEY_ID,
    aws_secret_access_key=YC_SECRET_ACCESS_KEY
)

In [4]:
import gzip

file_names = ["item_categories.csv.gzip", "items.csv.gzip", "sample_submission.csv.gzip", "shops.csv.gzip", "test.csv.gzip", "sales_train.csv.gzip"]
data_location = "compressed_data/"

data_storage = dict()
for file_name in file_names:
    response = s3_client.get_object(Bucket=YC_BUCKET_NAME, Key=f'{data_location}{file_name}')
    data_storage[file_name[: len(file_name) - 9]] = pd.read_csv(io.BytesIO(response['Body'].read()), compression='gzip')


In [5]:
# import the data
DATA_FOLDER = '/kaggle/input/competitive-data-science-predict-future-sales'

sales           = data_storage['sales_train']
items           = data_storage['items']
item_categories = data_storage['item_categories']
shops           = data_storage['shops']
test            = data_storage['test']

In [6]:
sales['date'] = pd.to_datetime(sales['date'], format = '%d.%m.%Y')

In [7]:
# exclude shops not in test
sales = sales[sales['shop_id'].isin(test['shop_id'].unique())]

In [8]:
sales['revenue'] = sales['item_price']*sales['item_cnt_day']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales['revenue'] = sales['item_price']*sales['item_cnt_day']


In [9]:
# cerate test-like train - add rows for all shops&items&periods product. For combinations not in original data fill 0

def create_testlike_train(df):
    matrix = []
    min_date = df['date'].min()
    for i in range(df['date_block_num'].min(), df['date_block_num'].max()+1):
        shops = df[df['date_block_num'] == i]['shop_id'].unique()
        items = df[df['date_block_num'] == i]['item_id'].unique()
        month_start = min_date + pd.tseries.offsets.DateOffset(months = i)
        matrix.append( np.array( list(product([i],[month_start],shops,items))))
    df_new = pd.DataFrame(np.vstack(matrix),columns = ['date_block_num','month_start','shop_id','item_id'])
    pivot = pd.pivot_table(df, 
                            values = ['item_cnt_day','revenue'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'sum').reset_index()
    pivot2 = pd.pivot_table(df[df['item_cnt_day']>0], 
                            values = ['item_cnt_day'], 
                            index = ['date_block_num','shop_id','item_id'], 
                            aggfunc = 'count').reset_index()
    pivot2.rename(columns={'item_cnt_day': 'purch_cnt_month'}, inplace=True)
    
    df_new = df_new.merge(right = pivot, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    df_new = df_new.merge(right = pivot2, how = 'left', on = ['date_block_num','shop_id','item_id'], sort = False)
    
    df_new.rename(columns={'item_cnt_day': 'item_cnt_month_uncl'}, inplace=True)
    df_new['item_cnt_month_uncl'] = df_new['item_cnt_month_uncl'].fillna(0)
    df_new['item_cnt_month'] = df_new['item_cnt_month_uncl'].clip(0,20)
    df_new['revenue'] = df_new['revenue'].fillna(0)
    df_new['purch_cnt_month'] = df_new['purch_cnt_month'].fillna(0)
    df_new['ID'] = -1
    return df_new

In [10]:
%%time
df = create_testlike_train(sales)

CPU times: user 12.7 s, sys: 1.59 s, total: 14.2 s
Wall time: 14.3 s


In [11]:
test['date_block_num'] = df['date_block_num'].max()+1
test['month_start'] = df['month_start'].max() + pd.tseries.offsets.DateOffset(months = 1)
test['item_cnt_month'] = 0
test['item_cnt_month_uncl'] = 0
test['revenue'] = 0
test['purch_cnt_month'] = 0

#test['sh_it_key'] = test['shop_id'].astype(str) + ['-']*len(test['shop_id']) + test['item_id'].astype(str)
#train_key = list(set(sales['shop_id'].astype(str) + ['-']*len(sales['shop_id']) + sales['item_id'].astype(str)))
#test['was_in_s_it_sh'] = test['sh_it_key'].apply(lambda x: 1 if x in train_key else 0)
#test.drop('sh_it_key', inplace=True, axis=1)

In [12]:
# concat train and test to a single df
df = df[['ID','date_block_num','month_start','shop_id','item_id','item_cnt_month_uncl','item_cnt_month','purch_cnt_month','revenue']]
test = test[['ID','date_block_num','month_start','shop_id','item_id','item_cnt_month_uncl','item_cnt_month','purch_cnt_month','revenue']]
df = pd.concat([df,test], ignore_index=True, join = 'inner')
#del(test)
df.shape

(8812244, 9)

In [13]:
df['ID'] = df['ID'].astype('int32')
df['date_block_num'] = df['date_block_num'].astype('int8')
df['shop_id'] = df['shop_id'].astype('int8')
df['item_id'] = df['item_id'].astype('int16')
df['item_cnt_month'] = df['item_cnt_month'].astype('float32')
df['item_cnt_month_uncl'] = df['item_cnt_month_uncl'].astype('float32')
df['revenue'] = df['revenue'].astype('float32')
df['purch_cnt_month'] = df['purch_cnt_month'].astype('float32')
#df['was_in_s_it_sh'] = df['was_in_s_it_sh'].astype('int8')

In [14]:
#add months and days in month features
df['month'] = df['month_start'].dt.month.astype('int8')
#df['year'] = df['month_start'].dt.year.astype('int16')
df.drop(['month_start'], axis = 1, inplace = True)

days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
df['days_in_m'] = (df['month']-1).map(days).astype('int8')

In [15]:
# add city and type features
shops['shop_city'] = shops['shop_name'].apply(lambda x: x.split()[0])
shops['shop_type'] = shops['shop_name'].apply(lambda x: x.split()[1])

In [16]:
# add item categories features
item_categories['split'] = item_categories['item_category_name'].str.split('-')
item_categories['item_category_type'] = item_categories['split'].map(lambda x: x[0].strip())
item_categories['item_category_subtype'] = item_categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_categories.drop('split', axis = 1, inplace = True)

df = df.merge(items, 
              how='left', 
              on='item_id').merge(item_categories, 
                                  how ='left', 
                                  on='item_category_id').merge(shops, how = 'left', on='shop_id')

df['item_category_id'] = df['item_category_id'].astype('int8')

In [17]:
# encode categorical features
features_to_encode=['shop_city',
                    'shop_type',
                    'item_category_type',
                    'item_category_subtype']
def encode_cat_features(df,features_to_encode):
    for feat in features_to_encode:
        df[feat+'_encoded'] = LabelEncoder().fit_transform( df[feat] )
    df.drop(features_to_encode, axis = 1, inplace = True)
    return df

df = encode_cat_features(df,features_to_encode)

df.drop(['item_category_name','shop_name','item_name'], axis = 1, inplace = True)

In [18]:
# add mean encoded features
def add_mean_encoded_feat(df):
    pivot_it = pd.pivot_table(df, values = ['item_cnt_month',], index = ['item_id','date_block_num'], aggfunc = ['sum','count']).reset_index()
    pivot_it.columns = ['item_id','date_block_num','item_cnt_month_sum','item_cnt_month_cnt']
    pivot_it['lagged_it_mean'] = ((pivot_it.groupby(['item_id'])['item_cnt_month_sum'].cumsum() - pivot_it['item_cnt_month_sum'])/(pivot_it.groupby(['item_id'])['item_cnt_month_cnt'].cumsum() - pivot_it['item_cnt_month_cnt'])).fillna(0)
    pivot_it.drop(['item_cnt_month_sum','item_cnt_month_cnt'], axis = 1, inplace = True)
    df =  df.merge(right = pivot_it, how = 'left', on = ['item_id','date_block_num'], sort = False)
    
    pivot_sh_it = pd.pivot_table(df, values = 'item_cnt_month', index = ['shop_id','item_id','date_block_num'], aggfunc = 'sum').reset_index()
    pivot_sh_it['lagged_sh_it_mean'] = ((pivot_sh_it.groupby(['shop_id','item_id'])['item_cnt_month'].cumsum() - pivot_sh_it['item_cnt_month'])/(pivot_sh_it.groupby(['shop_id','item_id'])['item_cnt_month'].cumcount())).fillna(0)
    pivot_sh_it.drop(['item_cnt_month'], axis = 1, inplace = True)
    df =  df.merge(right = pivot_sh_it, how = 'left', on = ['shop_id','item_id','date_block_num'], sort = False)
    df['lagged_it_mean'] = df['lagged_it_mean'].astype('float32')
    df['lagged_sh_it_mean'] = df['lagged_sh_it_mean'].astype('float32')
    return df

In [19]:
%%time
df = add_mean_encoded_feat(df)

CPU times: user 5.41 s, sys: 1.03 s, total: 6.44 s
Wall time: 6.45 s


In [20]:
#add lag features
def add_lag_feat(df, col_to_agg, group_levels, n_lags, aggfunc = 'mean', clip = False):
    new_col_title_code = '_'.join([x for x in group_levels if x != 'date_block_num'])
    pivot = pd.pivot_table(df, values = col_to_agg, index = group_levels, aggfunc = aggfunc).reset_index()        
    pivot.rename(columns = {col_to_agg[0] : col_to_agg[0] + '_' + aggfunc}, inplace = True)
    idx_cols = ['date_block_num','shop_id','item_id']
    cols = list(set(idx_cols+group_levels))
    df_tech = df[cols].copy()
    df_tech = df_tech.merge(right = pivot, how = 'left', on = group_levels, sort = False)
    list_of_new_col = [] 
    for lag in n_lags:
        df_to_shift = df_tech[idx_cols+[col_to_agg[0] + '_' + aggfunc]].copy()
        df_to_shift['date_block_num'] = df_to_shift['date_block_num'] + lag
        df_to_shift.rename(columns={col_to_agg[0] + '_' + aggfunc : col_to_agg[0]+'_'+new_col_title_code + '_' + aggfunc+ '_lag_'+str(lag)}, inplace=True)
        list_of_new_col.append(col_to_agg[0]+'_'+new_col_title_code + '_' + aggfunc+ '_lag_'+str(lag))
        df= df.merge(right = df_to_shift, how = 'left', on = idx_cols, sort = False)
    for col in list_of_new_col:
        df[col] = df[col].fillna(0).astype('float32')
        if clip:
            df[col] = df[col].clip(0,20)
    return df

In [21]:
# %%time
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_id','item_id'], 
#                   n_lags = [1,2,3], 
#                   aggfunc = 'sum')

# df = add_lag_feat(df, 
#                   col_to_agg = ['purch_cnt_month'],
#                   group_levels = ['date_block_num','shop_id','item_id'], 
#                   n_lags = [1,2], 
#                   aggfunc = 'sum')

In [22]:
# df['it_cnt_sh_it_lag_avg'] = df[['item_cnt_month_shop_id_item_id_sum_lag_1', 
#                                  'item_cnt_month_shop_id_item_id_sum_lag_2', 
#                                  'item_cnt_month_shop_id_item_id_sum_lag_3']].mean(skipna=True, axis=1)

# df['it_cnt_sh_it_lag_grad'] = df['item_cnt_month_shop_id_item_id_sum_lag_1']/df['item_cnt_month_shop_id_item_id_sum_lag_2']

# df['it_cnt_sh_it_lag_avg'] = df['it_cnt_sh_it_lag_avg'].astype('float32')
# df['it_cnt_sh_it_lag_grad'] = df['it_cnt_sh_it_lag_grad'].replace([np.inf, -np.inf], np.nan).fillna(0).astype('float32')

In [23]:
# %%time
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num'], 
#                   n_lags = [1])

# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','item_id'], 
#                   n_lags = [1,2])
                   
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_id'], 
#                   n_lags = [1,2])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['revenue'],
# #                   group_levels = ['date_block_num','shop_id'], 
# #                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','item_category_id'], 
#                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_id','item_category_id'], 
#                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_id','item_category_type_encoded'], 
#                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_id','item_category_subtype_encoded'], 
#                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_city_encoded'], 
# #                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_city_encoded','item_id'], 
#                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_city_encoded','item_category_id'], 
# #                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_city_encoded','item_category_type_encoded'], 
# #                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_city_encoded','item_category_subtype_encoded'], 
# #                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_type_encoded'], 
# #                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','shop_type_encoded','item_id'], 
#                   n_lags = [1])
# # df = add_lag_feat(df, 
# #                   col_to_agg = ['item_cnt_month'],
# #                   group_levels = ['date_block_num','shop_type_encoded','item_category_subtype_encoded'], 
# #                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','item_category_type_encoded'], 
#                   n_lags = [1])
# df = add_lag_feat(df, 
#                   col_to_agg = ['item_cnt_month'],
#                   group_levels = ['date_block_num','item_category_subtype_encoded'], 
#                   n_lags = [1])

In [24]:
columns_to_exclude = ['ID',
                      'item_cnt_month',
                      'item_cnt_month_uncl',
                      'revenue',
                      'purch_cnt_month',
                      #'months_since_sh_it_first_s',
                      #'months_since_it_first_s',
                      #'months_since_sh_first_s', 
                      #'avg_price_global', 
#                      'avg_price_mnth_lag1', 
#                      'avg_price_mnth_lag2',
#                      'avg_price_mnth_sh_lag1', 
#                      'avg_price_mnth_sh_lag2'
#                      'avg_price_mnth_to_gl', 
#                      'avg_price_mnth_sh_to_gl', 
#                      'months_from_sh_it_last_s',
#                      'months_from_it_last_s',
                      'lagged_sh_it_mean',
                      'lagged_it_mean',
#                      'it_had_sales_before'
#                      'sh_it_had_sales_before'
                     ]
cat_features = ['month',
#                'year',
                'shop_id',
                'shop_city_encoded',
                'shop_type_encoded',
                'item_category_id',
                'item_category_type_encoded',
                'item_category_subtype_encoded',
                'days_in_m'
               ]

In [30]:
df.columns

Index(['ID', 'date_block_num', 'shop_id', 'item_id', 'item_cnt_month_uncl',
       'item_cnt_month', 'purch_cnt_month', 'revenue', 'month', 'days_in_m',
       'item_category_id', 'shop_city_encoded', 'shop_type_encoded',
       'item_category_type_encoded', 'item_category_subtype_encoded',
       'lagged_it_mean', 'lagged_sh_it_mean'],
      dtype='object')

In [38]:
print(train_data.data.columns)


Index(['date_block_num', 'shop_id', 'item_id', 'month', 'days_in_m',
       'item_category_id', 'shop_city_encoded', 'shop_type_encoded',
       'item_category_type_encoded', 'item_category_subtype_encoded'],
      dtype='object')


In [36]:
# modeling
params = {'metric': 'rmse',
          'objective': 'mse',
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise' : True,
          'random_state': 10}

# Prepare training and validation datasets with categorical features
train_data = lgb.Dataset(
    df[(df['date_block_num'] >= 19) & (df['date_block_num'] < 33)].drop(columns_to_exclude, axis=1),
    label=df[(df['date_block_num'] >= 19) & (df['date_block_num'] < 33)]['item_cnt_month'],
    categorical_feature=cat_features
)

valid_data = lgb.Dataset(
    df[df['date_block_num'] == 33].drop(columns_to_exclude, axis=1),
    label=df[df['date_block_num'] == 33]['item_cnt_month'],
    categorical_feature=cat_features,
    reference=train_data
)




In [39]:
# Train model
lgb_model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1500,
    valid_sets=[train_data, valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
)

[LightGBM] [Info] Total Bins 524
[LightGBM] [Info] Number of data points in the train set: 3224048, number of used features: 10
[LightGBM] [Info] Start training from score 0.301487
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.11226	valid_1's rmse: 0.996967
[200]	training's rmse: 1.06147	valid_1's rmse: 0.961695


KeyboardInterrupt: 