In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns 

from itertools import product

from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

In [2]:
class ETL:
    def __init__(self):
        self.common_data_path = '../input/competitive-data-science-predict-future-sales'

        self.sales_train = None
        self.shops = None
        self.test = None
        self.item_categories = None
        self.items = None

    def extract(self) -> None:
        self.sales_train = pd.read_csv(f'{self.common_data_path}/sales_train.csv')
        self.shops = pd.read_csv(f'{self.common_data_path}/shops.csv')
        self.test = pd.read_csv(f'{self.common_data_path}/test.csv')
        self.item_categories = pd.read_csv(f'{self.common_data_path}/item_categories.csv')
        self.items = pd.read_csv(f'{self.common_data_path}/items.csv')

    def transform(self) -> None:
        self.sales_train.date = pd.to_datetime(self.sales_train.date)
        self.sales_train['month'] = self.sales_train.date.dt.to_period('M')

        self.sales_train = self.sales_train[self.sales_train.item_cnt_day > -2]
        self.sales_train = self.sales_train[self.sales_train.item_price < 300_000]
        self.sales_train = self.sales_train[self.sales_train.month < '2015-11']

        id_of_duplicated_shops = {10: 11, 0: 57, 1: 58, 40: 39}

        for k, v in id_of_duplicated_shops.items():
            self.shops = self.shops[self.shops.shop_id != k]
            self.sales_train.loc[self.sales_train.shop_id == k, 'shop_id'] = v

        self.test.loc[self.test.shop_id == 10, 'shop_id'] = 11

        self.shops.shop_name = self.shops.shop_name.map(lambda x: x.lstrip('!'))

        self.shops['city'] = self.shops.shop_name.str.split(' ').map(lambda x: x[0])
        self.shops['category'] = self.shops.shop_name.str.split(' ').map(lambda x: x[1])

        shop_enc = LabelEncoder()
        self.shops['shop_category'] = shop_enc.fit_transform(self.shops['category'])
        shop_city_enc = LabelEncoder()
        self.shops['shop_city'] = shop_city_enc.fit_transform(self.shops['city'])
        self.shops = self.shops[['shop_id', 'shop_category', 'shop_city']]

        self.sales_train = self.sales_train.drop_duplicates(
            subset=['date', 'date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day'],
            keep='last')

        self.items.item_name = self.items.item_name.map(lambda x: x.lstrip('!*/'))

        same_items_regardless_case = self.items[self.items.item_name.map(lambda x: x.lower()).duplicated()]
        fully_same = self.items.groupby(self.items.item_name, as_index=False).size().query('size > 1')
        fully_same_item_names = fully_same.item_name.values

        self.items = self.items[self.items.item_id != 12]
        self.sales_train = self.sales_train[self.sales_train.item_id != 12]

        same_items_regardless_case_names = same_items_regardless_case[~same_items_regardless_case.item_name.isin(
            fully_same_item_names)].item_name

        same_items = self.items[self.items.item_name.map(lambda x: x.lower()).isin(
            same_items_regardless_case_names.map(lambda x: x.lower()))]

        same_items_upper_case = same_items[same_items.item_name.str.endswith('(Регион)')]
        same_items_lower_case = same_items[same_items.item_name.str.endswith('(регион)')]

        self.items = self.items[~self.items.item_id.isin(same_items_lower_case.item_id)]

        self.sales_train.loc[self.sales_train.item_id.isin(same_items_lower_case.item_id), 'item_id'] = \
            self.sales_train[self.sales_train.item_id.isin(same_items_lower_case.item_id)].item_id.map(lambda x: x - 1)

        self.test.loc[self.test.item_id.isin(same_items_lower_case.item_id), 'item_id'] = \
            self.test[self.test.item_id.isin(same_items_lower_case.item_id)].item_id.map(lambda x: x - 1)

        self.items = self.items[self.items.item_id != 13012]
        self.sales_train.loc[self.sales_train.item_id == 13012, 'item_id'] = 13011
        self.test.loc[self.test.item_id == 13012, 'item_id'] = 13011

        self.items['name1'], self.items['name2'] = self.items['item_name'].str.split('[', 1).str
        self.items['name1'], self.items['name3'] = self.items['item_name'].str.split('(', 1).str

        self.items['name2'] = self.items['name2'].str.replace('\W+', ' ').str.lower()
        self.items['name3'] = self.items['name3'].str.replace('\W+', ' ').str.lower()

        self.items = self.items.fillna('0')

        self.items['name2'] = LabelEncoder().fit_transform(self.items['name2'])
        self.items['name3'] = LabelEncoder().fit_transform(self.items['name3'])

        self.items.drop(['item_name', 'name1'], axis=1, inplace=True)

        self.item_categories['type_code'] = self.item_categories['item_category_name'].apply(
            lambda x: x.split(' ')[0]).astype(str)

        categories = []
        for cat in self.item_categories['type_code'].unique():
            if len(self.item_categories[self.item_categories['type_code'] == cat]) > 3:
                categories.append(cat)
        self.item_categories['type_code'] = self.item_categories['type_code'].apply(
            lambda c: c if c in categories else 'other')

        self.item_categories['type_code'] = LabelEncoder().fit_transform(self.item_categories['type_code'])
        self.item_categories['subcat'] = self.item_categories['item_category_name'].apply(lambda x: x.split('-')).apply(
            lambda x: x[1].strip() if len(x) >= 2 else x[0].strip())

        self.item_categories['subcat'] = LabelEncoder().fit_transform(self.item_categories['subcat'])
        self.item_categories.drop('item_category_name', axis=1, inplace=True)

    def load(self):
        
        return self.sales_train, self.shops, self.test, self.item_categories, self.items

In [4]:
etl = ETL()
etl.extract()
etl.transform()

sales, shops, test, item_categories, items = etl.load()



In [10]:
class RollingWindowCV(object):
    
    def __init__(self, train_period=12, test_period=1, gap=1):
        self.n_splits = 0
        self.train_period = train_period
        self.test_period = test_period
        self.gap = gap
    
    def split(self, data, 
              date_column='date_block_num', 
              target_column='item_cnt_month', 
              clip_range: list = [0, 20]):

        try:
            data[date_column]
        except:
            raise KeyError(date_column)
        
        start_train = int(data[date_column].min())
        end_train = start_train + self.train_period
        start_test = end_train + self.gap
        end_test = start_test + self.test_period

        while end_test < data[date_column].max():
            train_indices = list(data[(data[date_column] >= start_train) & 
                                     (data[date_column] < end_train)].index)


            test_indices = list(data[(data[date_column] >= start_test) &
                                    (data[date_column] < end_test)].index)
            
            print("Train period:", start_train,"-" , end_train, ", Test period", start_test, "-", end_test,
                  "# train records", len(train_indices), ", # test records", len(test_indices))

            start_train = end_train + 1
            end_train = start_train + self.train_period
            start_test = end_train + self.gap
            end_test = start_test + self.test_period
            
            self.n_splits += 1
            
            train_data = data.loc[train_indices]
            X_train = train_data.drop([target_column], axis=1).fillna(0)
            y_train = train_data[target_column].clip(*clip_range)
            
            valid_data = data.loc[test_indices]
            X_valid = valid_data.drop([target_column], axis=1).fillna(0)
            y_valid = valid_data[target_column].clip(*clip_range)
            
            yield X_train, y_train, X_valid, y_valid
    
    
    def get_n_splits(self):
        return self.n_splits 

In [11]:
class TrainEngineering:
    def __init__(self, data):
        self.data: pd.DataFrame = data
        self.train: pd.DataFrame = None
        
    def create_train(self,
                     cols: list, 
                     data_column: str, 
                     unique_cols: list):
        train = []
        for data_block in range(self.data[data_column].nunique()):
            train_temp = self.data[self.data[data_column] == data_block]
            train.append(np.array(list(product([data_block], *[train_temp[unique_col].unique() 
                                                                  for unique_col in unique_cols]))))

        train = pd.DataFrame(np.vstack(train), columns=cols)
        self.train = train.sort_values(cols)
        
    def add_aggregated_data(self,
                            groupby_cols: list,
                            agg_col_funcs: dict,
                            agg_cols_name: list):
        aggregated_data = self.data.groupby(groupby_cols).agg(agg_col_funcs)
        aggregated_data.columns = agg_cols_name
        aggregated_data.reset_index(inplace=True)
        
        self.merge_tables([(groupby_cols, aggregated_data)])
        self.train = self.train.fillna(0)
        
    def add_lags(self, periods, lag_cols, 
                 date_block_column='date_block_num', 
                 agg_cols=['date_block_num', 'shop_id', 'item_id']):
        for lag_col in lag_cols:
            aggregated = self.train[[*agg_cols, lag_col]]
            for per in periods:
                shifted = aggregated.copy()
                shifted.columns = [*agg_cols, lag_col + "_lag_" + str(per)]
                shifted[date_block_column] = shifted[date_block_column] + per
                self.merge_tables([(agg_cols, shifted)])
                self.train = self.train.fillna(0)
        
    def merge_tables(self, table_keys: list):
        for key, table in table_keys:
            self.train = pd.merge(self.train, table, on=key, how='left')
    
    def concat(self, data, cols: list):
        self.train = pd.concat([self.train, data], 
                               ignore_index=True, 
                               sort=False, 
                               keys=cols).fillna(0)
            
    def get_train(self):
        return self.train

In [12]:
class TestEngineering:
    def __init__(self, data):
        self.test: pd.DataFrame = data
            
    def select_features(self, cols: list):
        self.test = self.test[cols]
        
    def add_column(self, col: str, values: any):
        self.test[col] = values

    def get_test(self):
        return self.test

In [18]:
class ModelCV:
    def __init__(self,
                 train_data: pd.DataFrame,
                 test_data: pd.DataFrame,
                 items: pd.DataFrame, 
                 item_categories: pd.DataFrame, 
                 shops: pd.DataFrame,
                 train_eng_class, 
                 test_eng_class, 
                 cv_class):
        self.train_data = train_data
        self.test_data = test_data
        self.items = items
        self.item_categories = item_categories
        self.shops = shops
        self.train_eng_class = train_eng_class
        self.test_eng_class = test_eng_class
        self.cv_class = cv_class
        
        self.train_eng = None
        self.test_eng = None
        
    def create_train_table(self):
        self.train_eng = self.train_eng_class(self.train_data)

        self.train_eng.create_train(['date_block_num', 'shop_id', 'item_id'], 
                               'date_block_num', 
                               ['shop_id', 'item_id'])

        self.train_eng.add_aggregated_data(['date_block_num', 'shop_id', 'item_id'], 
                                      {'item_cnt_day': 'sum'},
                                      ['item_cnt_month'])
        
        self.test_eng = self.test_eng_class(self.test_data)
        
        self.test_eng.select_features(['shop_id', 'item_id'])
        self.test_eng.add_column('date_block_num', self.train_data.date_block_num.max() + 1)
        
        self.train_eng.concat(self.test_eng.get_test(), ['date_block_num', 'shop_id', 'item_id'])
        self.train = self.train_eng.get_train()
        
        self.train_eng.merge_tables([('item_id', self.items), 
                        ('item_category_id', self.item_categories),
                        ('shop_id', self.shops)])
        
        self.train_eng.add_lags([1, 2, 12], ['item_cnt_month'])
        
        self.train_data = self.train_eng.get_train()
        
    def get_splits(self, traget_col: str):
        cv = self.cv_class()
        for X_train, y_train, X_valid, y_valid in cv.split(self.train_data):
            if X_train is not None:
                yield X_train, y_train, X_valid, y_valid
            else: 
                raise StopIteration

In [19]:
tscv = ModelCV(sales, test, items, 
               item_categories, shops, TrainEngineering, 
               TestEngineering, RollingWindowCV)
tscv.create_train_table()
models = []
for X_train, y_train, X_valid, y_valid in tscv.get_splits('item_cnt_month'):

    model = XGBRegressor(
       max_depth=8,
       n_estimators=1000,
       min_child_weight=0.8, 
       colsample_bytree=0.8, 
       subsample=0.8, 
       eta=0.1,
       used_ram_limit= "13gb",
    )

    model.fit(
       X_train, 
       y_train, 
       eval_metric='rmse',
       eval_set=[(X_train, y_train), (X_valid, y_valid)], 
       verbose=True, 
       early_stopping_rounds=10
    )
    
    error = np.sqrt(mean_squared_error(y_valid, model.predict(X_valid), squared=False))
    
    models.append((error, model))

Train period: 0 - 12 , Test period 13 - 14 # train records 4486019 , # test records 326922




Parameters: { "used_ram_limit" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:1.24281	validation_1-rmse:1.20520
[1]	validation_0-rmse:1.20639	validation_1-rmse:1.14942
[2]	validation_0-rmse:1.17516	validation_1-rmse:1.10182
[3]	validation_0-rmse:1.15738	validation_1-rmse:1.06735
[4]	validation_0-rmse:1.13390	validation_1-rmse:1.03659
[5]	validation_0-rmse:1.12106	validation_1-rmse:1.01505
[6]	validation_0-rmse:1.11025	validation_1-rmse:0.99971
[7]	validation_0-rmse:1.10107	validation_1-rmse:0.98829
[8]	validation_0-rmse:1.08599	validation_1-rmse:0.97679
[9]	validation_0-rmse:1.07957	validation_1-rmse:0.97308
[10]	validation_0-rmse:1.06838	validation_1-rmse:0.96575
[11]	validation_0-rmse:1.05805	validation_1-rmse:0.96221
[12]	



Parameters: { "used_ram_limit" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:1.21356	validation_1-rmse:1.03962
[1]	validation_0-rmse:1.18009	validation_1-rmse:1.00231
[2]	validation_0-rmse:1.14284	validation_1-rmse:0.96772
[3]	validation_0-rmse:1.11842	validation_1-rmse:0.94205
[4]	validation_0-rmse:1.09770	validation_1-rmse:0.92216
[5]	validation_0-rmse:1.08117	validation_1-rmse:0.90550
[6]	validation_0-rmse:1.05776	validation_1-rmse:0.88839
[7]	validation_0-rmse:1.04928	validation_1-rmse:0.88132
[8]	validation_0-rmse:1.03856	validation_1-rmse:0.87233
[9]	validation_0-rmse:1.02108	validation_1-rmse:0.86124
[10]	validation_0-rmse:1.00616	validation_1-rmse:0.85249
[11]	validation_0-rmse:0.99397	validation_1-rmse:0.84495
[12]	

In [20]:
final_model =  min(models, key = lambda score: score[0])[1]

In [23]:
train = tscv.train_eng.get_train()

In [24]:
X_test = train[train.date_block_num == 34]

In [25]:
X_test = X_test.drop('item_cnt_month', axis=1)

In [27]:
X_test = X_test.fillna(0)

In [28]:
predictions = final_model.predict(X_test)

In [29]:
submission = {'ID': test.index, 'item_cnt_month': predictions}
submission = pd.DataFrame(submission)

In [30]:
submission.to_csv('submission.csv', index=False)