In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import tensorflow as tf

# Misc

## Send a push notification with Pushbullet

In [None]:
import requests
import json
def pushbullet_message(title, body):
    msg = {"type": "note", "title": title, "body": body}
    TOKEN = 'o.eE2zQwESe8DAJM0IlglDUdkTpmYkYT2L'
    resp = requests.post('https://api.pushbullet.com/v2/pushes', 
                         data=json.dumps(msg),
                         headers={'Authorization': 'Bearer ' + TOKEN,
                                  'Content-Type': 'application/json'})
    if resp.status_code != 200:
        raise Exception('Error',resp.status_code)
    else:
        print ('Message sent')

# Data loading and saving snippets

In [2]:
item_categories = pd.read_csv("data/item_categories.csv")
items = pd.read_csv("data/items.csv")
sales_train = pd.read_csv("data/sales_train.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")
shops = pd.read_csv("data/shops.csv")
test = pd.read_csv("data/test.csv")

In [4]:
# Save in hdf5 format
item_categories.to_hdf("data/item_categories.h5",key="item_categories",mode="w")
items.to_hdf("data/items.h5","items",mode="w")
sales_train.to_hdf("data/sales_train.h5","sales_train",mode="w")
sample_submission.to_hdf("data/sample_submission.h5","sample_submission",mode="w")
shops.to_hdf("data/shops.h5","shops",mode="w")
test.to_hdf("data/test.h5","test",mode="w")

In [86]:
# Load from hdf5 format (separate files for simplicity)
item_categories = pd.read_hdf("data/item_categories.h5")
items = pd.read_hdf("data/items.h5")
sales_train = pd.read_hdf("data/sales_train.h5")
sample_submission = pd.read_hdf("data/sample_submission.h5")
shops = pd.read_hdf("data/shops.h5")
test = pd.read_hdf("data/test.h5")

In [2]:
# Convert the item categories xls to csv while adding integer encoding of categories
cat = pd.read_excel('data_eng/item_categories_enhanced.xls', header=1)
cat['supercategory_id'] = cat['supercategory'].factorize()[0]
cat['platform_id'] = cat['platform'].factorize()[0]
cat.to_csv('data_eng/item_categories_enhanced.csv', index=False)

In [4]:
# pickles
def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

# Data cleaning snippets

## Duplicate shops

In [None]:
# From https://www.kaggle.com/dlarionov/feature-engineering-xgboost

# Якутск Орджоникидзе, 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

# From https://www.kaggle.com/tylerssssss/feature-engineering-lightgbm

train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57

train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58

train.loc[train.shop_id == 40, 'shop_id'] = 39
test.loc[test.shop_id == 40, 'shop_id'] = 39

## Outliers

In [None]:
# Remove outliers and 1 item with a negative sales price
sales_train = sales_train[(sales_train.item_price < 500000) & (sales_train.item_price > 0)]
sales_train = sales_train[sales_train.item_cnt_day < 1001]

# Feature engineering snippets

## First word & artist name extractor

In [1]:
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
import re

def add_name_features(
    matrix, factorize=True, items=items, fillna_value=-99, feature_name="artist_name_or_first_word",
    prune_quantile=0.25
):
    # This extracts artist names for music categories and adds them as a feature.
    def extract_artist(st):

        st = st.strip()
        if st.startswith("V/A"):
            artist = "V/A"
        elif st.startswith("СБ"):
            artist = "СБ"
        else:
            # Retrieves artist names using the double space or all uppercase pattern
            mus_artist_dubspace = re.compile(r".{2,}?(?=\s{2,})")
            match_dubspace = mus_artist_dubspace.match(st)
            mus_artist_capsonly = re.compile(r"^([^a-zа-я]+\s)+")
            match_capsonly = mus_artist_capsonly.match(st)
            candidates = [match_dubspace, match_capsonly]
            candidates = [m[0] for m in candidates if m is not None]
            # Sometimes one of the patterns catches some extra words so choose the shortest one
            if len(candidates):
                artist = min(candidates, key=len)
            else:
                # If neither of the previous patterns found something, use the dot-space pattern
                mus_artist_dotspace = re.compile(r".{2,}?(?=\.\s)")
                match = mus_artist_dotspace.match(st)
                if match:
                    artist = match[0]
                else:
                    artist = ""
        artist = artist.upper()
        artist = re.sub(r"[^A-ZА-Я ]||\bTHE\b", "", artist)
        artist = re.sub(r"\s{2,}", " ", artist)
        artist = artist.strip()
        return artist

    def first_word(string):
        # This cleans the string of special characters, excess spaces and stopwords then extracts the first word
        all_stopwords = stopwords.words("russian")
        all_stopwords = all_stopwords + stopwords.words("english")
        string = re.sub(r"[^\w\s]", "", string)
        string = re.sub(r"\s{2,}", " ", string)
        tokens = string.lower().split()
        tokens = [t for t in tokens if t not in all_stopwords]
        token = tokens[0] if len(tokens) > 0 else ""
        return token

    music_categories = [55, 56, 57, 58, 59, 60]
    items.loc[items.item_category_id.isin(music_categories), feature_name] = items.loc[items.item_category_id.isin(music_categories), "item_name"].apply(
        extract_artist
    )

    items.loc[items[feature_name] == "", feature_name] = "other music"

    items.loc[~items.item_category_id.isin(music_categories), feature_name] = items.loc[~items.item_category_id.isin(music_categories), "item_name"].apply(
        first_word
    )

    items.loc[items[feature_name] == "", feature_name] = "other non-music"

    if factorize:
        items[feature_name] = items[feature_name].factorize(na_sentinel=fillna_value)[0]

    matrix = matrix.merge(items[["item_id", feature_name]], on="item_id", how="left",)
    if fillna_value is not None:
        matrix[feature_name] = matrix[feature_name].fillna(fillna_value)
    
    if prune_quantile>0:  # replace low-occurence words / artists by a category marker
        vc = matrix['artist_name_or_first_word'].value_counts().rename('first_word_vcs')
        matrix = matrix.merge(vc, left_on='artist_name_or_first_word', right_index=True, how='left')
        prune_mask = matrix.first_word_vcs<=matrix['first_word_vcs'].quantile(prune_quantile)
        matrix.loc[prune_mask, 'artist_name_or_first_word'] = -matrix.loc[prune_mask, 'item_category_id']
        matrix = matrix.drop(columns='first_word_vcs')
    return matrix

NameError: name 'items' is not defined

## Create bag of word representations of item names

In [29]:
lexical_features = False
if lexical_features is not False:
    # Item name correction code
    if (dev_skip == False) & (lexical_features == True):
        import re

        def name_correction(x):
            x = x.lower()
            x = x.partition("[")[0]
            x = x.partition("(")[0]
            x = re.sub("[^A-Za-z0-9А-Яа-я]+", " ", x)
            x = x.replace("  ", " ")
            x = x.strip()
            return x

        items["item_name"] = items["item_name"].apply(lambda x: name_correction(x))

        # Function to create matrix of token representations of item names
        def create_item_id_bow_matrix(items):
            # Adds word token features to the items dataframe, taken from the item names

            # Fit a count vectorizer and create the bag of words matrix
            from nltk.corpus import stopwords
            from sklearn.feature_extraction.text import CountVectorizer

            russian_stopwords = stopwords.words("russian")
            vectorizer = CountVectorizer(stop_words=russian_stopwords)
            X = vectorizer.fit_transform(items.loc[:, "item_name"])

            # Convert the matrix to pandas sparse matrix format and concatenate it to the items matrix
            X = pd.DataFrame.sparse.from_spmatrix(X)
            items = pd.concat([items, X], axis=1)

            # Create appropriate column names for the token features
            print(f"{len(vectorizer.vocabulary_)} words")
            featuremap = {
                col: "token_" + token
                for col, token in zip(
                    range(len(vectorizer.vocabulary_)), vectorizer.get_feature_names()
                )
            }
            items = items.rename(columns=featuremap)

            return items

        items_bow = create_item_id_bow_matrix(items)
        items_bow = items_bow.drop(columns=["item_name", "item_category_id"])
        # Drop any tokens which only occur once
        items_bow = items_bow.drop(columns=items_bow.columns[(items_bow.sum(axis=0) < 2)])
        if bow_features:
            df = df.merge(items_bow, on="item_id", how="left")

17297 words


In [8]:
list(map(list, zip((1,2), (3,4), (5,6))))

[[1, 3, 5], [2, 4, 6]]

## Item name similarity feature
Calculates the mean sales of the n items with names most similar to each item, lagged by 1 month

In [48]:
# Create itemname similarity matrix
if (dev_skip == False) & (lexical_features == True):
    if load_data == False:
        # Convert to sparse matrix, drop first item_id column (sparse matrix multiplication much faster than dense)
        from scipy.sparse import csr_matrix

        ib = csr_matrix(items_bow.to_numpy()[:, 1:])
        # Compute dot product of BOW token matrix
        itemname_similarity = np.dot(ib, ib.T)
        itemname_similarity = pd.DataFrame.sparse.from_spmatrix(
            itemname_similarity, index=items.item_id, columns=items.item_id
        )
    else:
        itemname_similarity = pd.read_hdf("data/itemname_similarity.h5")

    del items_bow

In [None]:
import multiprocessing.dummy as mp # Using .dummy version of multiprocessing module because thread-based but not process-based parallelism worked on Windows

if (dev_skip==False) & (lexical_features==True):
    
    neighbours_list = [1,3]
    load_filename = f'data/similar_item_features/closest_itemname_ME_1.pk1.gz'
    save_filename = f'data/similar_item_features/closest_itemname_ME_1.pk1.gz'
    
    create_new = True 
    if load_data==True:
        create_new = False
        try:
            dfstore = pd.read_pickle(load_filename)
        except FileNotFoundError:
            print("Saved features not found, creating from scratch.")
            create_new = True
    if create_new==True:
        
        dfstore = df.loc[:,['date_block_num','item_id','item_cnt_month']]
        
        # This works faster with a dense array for some reason
        if pd.api.types.is_sparse(itemname_similarity[0]):
            itsim = itemname_similarity.sparse.to_dense()
        else:
            itsim = itemname_similarity
        
        for lag in lags:

            storelist = []
            
            for date_block_num in tqdm(range(3,35)):
                
                print(f"Date block {date_block_num}, lag {lag}")
                
                ids = dfstore.loc[dfstore.date_block_num==date_block_num,'item_id'].unique()

                # Get subset of items in lag month
                ids_prev = dfstore.loc[dfstore.date_block_num==(date_block_num-lag),'item_id'].unique()
                
                # Define a function to be used with the map function. Returns a dictionary of feature values for the current item_id, date_block_num and lag
                def sim_name_target(item_id):
                    tmpdict = {'date_block_num': date_block_num, 'item_id': item_id}
                    # Make set of ids without current item
                    ids_other = ids_prev[ids_prev!=item_id]
                    # Get similarity values for current item
                    itsim_prev = itsim.loc[item_id,ids_other]
                    
                    for neighbours in neighbours_list:
                        
                        item_featurename = f'item_id-{neighbours}closestitemname-ME-lag{lag}'

                        # Get top n similar items
                        itsim_prev_largest = itsim_prev.nlargest(neighbours)
                        # Remove zero values
                        itsim_prev_largest = itsim_prev_largest.loc[itsim_prev_largest>0]
                        # Get target values for similar items, save
                        targets_mean = dfstore.loc[(dfstore.date_block_num==(date_block_num-lag)) & (dfstore.item_id.isin(itsim_prev_largest.index)),['item_cnt_month']].mean()
                        tmpdict[item_featurename] = targets_mean.item()
                    return tmpdict

                pool = mp.Pool()
                tmplist = pool.map(sim_name_target, list(ids))
                # Add returned dictionary list to store list
                storelist = storelist + tmplist
            
            # Convert list of dicts to dataframe
            dftmp = pd.DataFrame(storelist)
            # Merge results with dfstore and save
            dfstore = dfstore.merge(dftmp, how='left', on=['date_block_num', 'item_id'])
            filename = f'data/similar_item_features/similar_items_ME_lag_{lag}.pk1.gz'
            dfstore.to_pickle(filename)

        try: 
            dfstore = dfstore.drop(columns='item_cnt_month')
        except KeyError:
            pass
        dfstore = reduce_mem_usage(dfstore)
        dfstore.to_pickle(save_filename)

    df = df.merge(dfstore, how="left", on=["date_block_num", "item_id"])
    
    df.to_pickle("checkpoints/dfcheckpoint_6.pk1")

## Shop cities and item subcategories
(from https://www.kaggle.com/dlarionov/feature-engineering-xgboost) \
\
Observations:

* Each shop_name starts with the city name.
* Each category contains type and subtype in its name.

In [89]:
from sklearn.preprocessing import LabelEncoder

shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shop_labels = shops[['shop_id','city_code']]

item_categories['split'] = item_categories['item_category_name'].str.split('-')
item_categories['type'] = item_categories['split'].map(lambda x: x[0].strip())
item_categories['type_code'] = LabelEncoder().fit_transform(item_categories['type'])
# if subtype is nan then type
item_categories['subtype'] = item_categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_categories['subtype_code'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_cat_labels = item_categories[['item_category_id','type_code', 'subtype_code']]

shop_labels.to_csv('data/shop_labels.csv')
shop_labels.to_hdf('data/shop_labels.h5','shop_labels', mode='w')
item_cat_labels.to_csv('data/item_cat_labels.csv')
item_cat_labels.to_hdf('data/item_cat_labels.h5','item_cat_labels', mode='w')

## Lagged feature (e.g. target)

In [46]:
def add_lag_feature(df, feature_series, grouping_fields, lag, fillna=None):
    '''
    Lag a feature in "feature_series" by a specific amount "lag" and add it to datatable "df".
    Missing values caused by inconsistencies between items and shops in different date blocks
    are filled by the method specified by "fillna".
    
    Arguments:
    fillna: Missing value fill method, options "none", "ffill_only" (fill with last available value),
    "ffill_bfill" (fill with last present value then fill remaining values with next available value)
    
    '''
    feature_series = feature_series.copy()
    
    feature_name = feature_series.name + '-lag_' + str(lag)
    feature_series.name = feature_name
    
    feature_series = feature_series.reset_index()
    
    if type(grouping_fields) not in (str, list):
        raise TypeError
    if len(grouping_fields)==0:
        feature_series['date_block_num'] += lag
        return df.merge(feature_series, on=['date_block_num'], how='left')
    if type(grouping_fields)==str:
        grouping_fields = [grouping_fields]
        
    feature_series['date_block_num'] += lag
    
    df = df.merge(feature_series, on=['date_block_num'] + grouping_fields, how='left')
    
    if fillna == 'ffill_bfill':
        df[feature_name] = df.groupby(grouping_fields)[feature_name].fillna(method='ffill').fillna(method='bfill')
    elif fillna == 'ffill_only':
        df[feature_name] = df.groupby(grouping_fields)[feature_name].fillna(method='ffill')
    elif type(fillna) is int or type(fillna) is float:
        df[feature_name] = df.groupby(grouping_fields)[feature_name].fillna(fillna)
    elif fillna == None:
        pass
    else:
        raise ValueError
    
    return df

In [None]:
# Create series of target (item_cnt_month) then add lagged versions with add_lag_feature
if dev_skip==False:
    targetseries = df.loc[:,['date_block_num', 'item_id', 'shop_id', 'item_cnt_month']]
    targetseries = targetseries.set_index(['date_block_num', 'item_id', 'shop_id'])
    targetseries = targetseries['item_cnt_month']
    for lag in tqdm([1,2,3,12]):
        df = add_lag_feature(df, targetseries, ['item_id', 'shop_id'], lag, fillna=0)
    del(targetseries)

## Mean encoding
Use the rolling method somehow or just use the mean for each month and lag the feature by 1?

In [None]:
# Mean encode items, shops and categories by month and then lag with the add_lags function
item_ME = train.groupby(['date_block_num','item_id']).item_cnt_month.mean().rename('item_id_ME')
shop_ME = train.groupby(['date_block_num','shop_id']).item_cnt_month.mean().rename('shop_id_ME')
item_cat_ME = train.groupby(['date_block_num','item_category_id']).item_cnt_month.mean().rename('item_category_id_ME')
date_block_num_ME = train.groupby(['date_block_num']).item_cnt_month.mean().rename('date_block_num_ME')
shop_category_id_ME = df.groupby(['date_block_num','shop_id','item_category_id']).item_cnt_month.mean().rename('shop_id-item_category_id-ME')
shop_type_ME = df.groupby(['date_block_num','shop_id','type_code']).item_cnt_month.mean().rename('shop_id-type_code-ME')
shop_subtype_ME = df.groupby(['date_block_num','shop_id','subtype_code']).item_cnt_month.mean().rename('shop_id-subtype_code-ME')

lags = [1,2,3,12]

for lag in lags:
    train = add_lag_feature(train, item_ME, 'item_id', lag, fillna='ffill_bfill')
    train = add_lag_feature(train, shop_ME, 'shop_id', lag, fillna='ffill_bfill')
    train = add_lag_feature(train, item_cat_ME, 'item_category_id', lag, fillna='ffill_bfill')
    train = add_lag_feature(train, date_block_num_ME, '', lag, fillna='ffill_bfill')
    df = add_lag_feature(df, shop_category_id_ME, ['shop_id','item_category_id'], lag, fillna='ffill_bfill')
    df = add_lag_feature(df, shop_type_ME, ['shop_id','type_code'], lag, fillna='ffill_bfill')
    df = add_lag_feature(df, shop_subtype_ME, ['shop_id','subtype_code'], lag, fillna='ffill_bfill')

train

## Per-month rolling mean encoding features

In [None]:
def create_full_train_test(sales_train, test):
    # Create a train set with all items and all shops in all date blocks.
    # Aggregate sales to the month level
    sales_train_grouped = sales_train.groupby(
        ["date_block_num", "item_id", "shop_id"]
    ).agg({"item_cnt_day": "sum"})
    sales_train_grouped = sales_train_grouped.rename(
        columns={"item_cnt_day": "item_cnt_month"}
    )

    # Create sets of items, shops and date blocks
    item_ids = set(sales_train.item_id).union(set(test.item_id))
    shop_ids = set(sales_train.shop_id).union(set(test.shop_id))
    date_block_nums = set(sales_train.date_block_num).union(set([34]))

    # Create all permutations as indexes
    indexdataframe = pd.DataFrame(
        np.array(list(itertools.product(date_block_nums, item_ids, shop_ids))),
        columns=["date_block_num", "item_id", "shop_id"],
    )

    m = indexdataframe.merge(
        sales_train_grouped, how="left", on=["date_block_num", "item_id", "shop_id"]
    )
    m.item_cnt_month = m.item_cnt_month.fillna(0)

    shop_first_sales = (
        train.groupby("shop_id").date_block_num.min().rename("shop_first_month")
    )
    shop_last_sales = (
        train.groupby("shop_id").date_block_num.max().rename("shop_last_month")
    )
    item_first_sales = (
        train.groupby("item_id").date_block_num.min().rename("item_first_month")
    )
    m = m.merge(shop_first_sales, on="shop_id", how="left")
    m = m.merge(shop_last_sales, on="shop_id", how="left")
    m = m.merge(item_first_sales, on="item_id", how="left")

    mask = (
        (m.date_block_num < m.shop_first_month)
        | (m.date_block_num < m.item_first_month)
        | (m.date_block_num > m.shop_last_month)
    )

    m = m.drop(columns=['shop_first_month', 'shop_last_month', 'item_first_month'])
    
    m.loc[mask, "item_cnt_month"] = np.nan

    return m

In [None]:
m = create_full_train_test(train, test)

## Manual expanding mean feature (not using pandas expanding method

In [None]:
aggs = {"item_cnt_month":"mean"}
newname = 'shop_id_item_category_id_expanding_ME'
groupers = ["shop_id", "item_category_id"]
features = []
for dbn in tqdm(range(1, 35)):
    ids = matrix.query(f"date_block_num=={dbn}")['item_id'].unique()
    f_temp = (
        matrix.loc[(matrix.date_block_num<dbn) & (matrix.item_id.isin(ids)), :]
        .groupby(groupers)
        .agg(aggs)
        .rename(columns={'item_cnt_month':newname})
    )
    f_temp["date_block_num"] = dbn
    features.append(f_temp)
features = pd.concat(features).reset_index()

features = fu.reduce_mem_usage(features)

matrix = matrix.merge(features, on=["date_block_num"] + groupers, how="left")

The custom functions commented out below should have been replaced with the general function "add_rolling_ME"

In [None]:
# def add_rolling_item_shop_ME(matrix, window = 3, min_periods=1, ewm=False):
#     ''' Creates a rolling or exponential weighted mean item_cnt_month feature for each
#     item_id-shop_id combination and merges it with a feature matrix. Assumes the existence of complete
#     matrix of monthly shop-item counts "m". Parameter "window" acts as the halflife parameter if
#     ewm=True'''
#     im = m.groupby(['shop_id', 'item_id','date_block_num']).item_cnt_month.mean()
#     if ewm:
#         imr = im.groupby(['shop_id', 'item_id']).ewm(halflife=window, min_periods=min_periods).mean()
#         feat_name = f"shop_item_ME_ewm_hl_{window}"
#     else:
#         imr = im.groupby(['shop_id', 'item_id']).rolling(window=window, min_periods=min_periods).mean()
#         feat_name = f"shop_item_ME_rol_win_{window}"
#     im = im.reset_index()
#     im[feat_name] = imr.to_numpy()
#     im = im.drop(columns='item_cnt_month')
#     im['date_block_num'] += 1
#     matrix = matrix.merge(im, on=['date_block_num', 'shop_id', 'item_id'], how='left')
#     return matrix

In [None]:
# def add_rolling_item_ME(matrix, window = 3, min_periods=1, ewm=False):
#     ''' Creates a rolling mean item_cnt_month feature for each item_id and merges it
#     with a feature matrix. Assumes the existence of complete matrix of monthly shop-item
#     counts "m" '''
#     im = m.groupby(['item_id','date_block_num']).item_cnt_month.mean()
#     if ewm:
#         imr = im.groupby(['item_id']).ewm(halflife=window, min_periods=min_periods).mean()
#         feat_name = f"item_id_ME_ewm_hl_{window}"
#     else:
#         imr = im.groupby(['item_id']).rolling(window=window, min_periods=min_periods).mean()
#         feat_name = f"item_id_ME_rol_win_{window}"
#     im = im.reset_index()
#     im[feat_name] = imr.to_numpy()
#     im = im.drop(columns='item_cnt_month')
#     im['date_block_num'] += 1
#     matrix = matrix.merge(im, on=['date_block_num', 'item_id'], how='left')
#     return matrix

In [22]:
def add_rolling_ME(
    matrix, features, window=3, min_periods=1, ewm=False, source_matrix=None
):
    """Add a rolling mean item_cnt_month feature for a specificed categorical feature
    or features. Calculates using the feature matrix and therefore does not insert zeros
    for features with zero sales in a specific month (use feature specific rolling ME
    functions for rolling item or item-shop features)"""
    if type(features) != list:
        raise TypeError("features argument must be a list")
    if source_matrix is None:
        source_matrix = matrix
    im = source_matrix.groupby(features + ["date_block_num"]).item_cnt_month.mean()
    if ewm:
        imr = im.groupby(features).ewm(halflife=window, min_periods=min_periods).mean()
        feat_name = f"{'_'.join(features)}_ME_ewm_hl_{window}"
    else:
        imr = im.groupby(features).rolling(window=window, min_periods=min_periods).mean()
        feat_name = f"{'_'.join(features)}_ME_rol_win_{window}"
    im = im.reset_index()
    im[feat_name] = imr.to_numpy()
    im = im.drop(columns="item_cnt_month")
    im["date_block_num"] += 1
    matrix = matrix.merge(im, on=["date_block_num"] + features, how="left")
    return matrix

In [None]:
windows = [2, 6, 12]
for window in tqdm(windows, desc="shop_item"):
    matrix = add_rolling_ME(
        matrix, features=["shop_id", "item_id"], window=window, source_matrix=m
    )
for window in tqdm(windows):
    matrix = add_rolling_ME(matrix, features=["item_id"], window=window, source_matrix=m)
for features in tqdm(
    [
        "shop_id",
        "supercategory_id",
        "platform_id",
        "item_category_id",
        "digital",
        ["shop_id", "item_category_id"],
    ]
):
    for window in tqdm(windows):
        matrix = add_rolling_ME(matrix, features, window)

In [None]:
windows = [1]
for window in tqdm(windows):
    matrix = add_rolling_ME(
        matrix, features=["shop_id", "item_id"], window=window, ewm=True, source_matrix=m
    )
for window in tqdm(windows):
    matrix = add_rolling_ME(
        matrix, features=["item_id"], window=window, ewm=True, source_matrix=m
    )
for feature in tqdm(
    ["shop_id", "supercategory_id", "platform_id", "item_category_id", "digital"]
):
    for window in tqdm(windows):
        matrix = add_rolling_ME(matrix, feature, window, ewm=True)

In [None]:
del(m, windows)

## Simple periodicity feature (i.e. date block mod 12)

In [78]:
# Seasonality feature 
train['date_block_month'] = train.date_block_num % 12

## Days in current month

In [79]:
days_in_month = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
train['days_in_month'] = train['date_block_month'].map(days_in_month)

## Mean item price in previous month
Mean is calculated as the mean of all item sales rather than the mean of the mean price at each shop \
Also possible to include further lags, giving models opportunity to calculate changes in price over time

In [130]:
item_revenue_all_shops = df.groupby(['date_block_num','item_id'])['item_revenue_month'].sum()
item_count_all_shops = df.groupby(['date_block_num','item_id'])['item_cnt_month'].sum()
mean_item_price_all_shops = item_revenue_all_shops / item_count_all_shops
mean_item_price_all_shops = mean_item_price_all_shops.rename('mean_item_price_all_shops')

lags = [1,2,3,12]

# Missing entries backfilled then forward filled
for lag in lags:
    df = add_lag_feature(df, mean_item_price_all_shops, 'item_id', lag, fillna='ffill_bfill')

## Time since first / last sale of an item, or an item in a shop

Time since first appearance of an item can be calculated as a simple groupby operation.\
Other "time since" features can not be calculated in this way however. Use one of the other routines which also calculates this feature.

In [9]:
# Months since first appearance of item, calculated by a groupby-transform operation (fast!)
train['item_since_first_sale'] = train.groupby('item_id')['date_block_num'].transform(lambda x: x - x.min())
train['item_since_first_sale'] = train['item_since_first_sale'].fillna(0)

This cell calculates the time since the first sale of an item in a specific shop.\
Slow, better options are available.

In [None]:
# Time since first appearance of item at a specific shop
store = pd.DataFrame()
ts = time.time()
gruppy = train[['date_block_num','shop_id','item_id','item_cnt_month']].groupby(['shop_id','item_id'])
for group, items in gruppy:
    items['shop_item_since_first_sale'] = items.date_block_num - items.date_block_num[items.item_cnt_month>0].min()
    store = store.append(items)
items['shop_item_since_first_sale'] = items['shop_item_since_first_sale'].fillna(0)
time.time() - ts

Row iterating code for calculating time from first and last shop-item sale combination.\
Faster version using iteration over a groupby object is available.

In [None]:
matrix = train.copy()
ts = time.time()

# Shop / item combination section
cachefirst_shop_item = {}
cachelast_shop_item = {}
matrix['shop_item_last_sale'] = -1
matrix['shop_item_first_sale'] = -1
matrix['shop_item_last_sale'] = matrix['shop_item_last_sale'].astype(np.int8)
matrix['shop_item_first_sale'] = matrix['shop_item_first_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cachefirst_shop_item:
        if row.item_cnt_month!=0:
            cachefirst_shop_item[key] = row.date_block_num
            cachelast_shop_item[key] = row.date_block_num
    else:
        last_date_block_num = cachelast_shop_item[key]
        first_date_block_num = cachefirst_shop_item[key]
        matrix.at[idx, 'shop_item_last_sale'] = row.date_block_num - last_date_block_num
        matrix.at[idx, 'shop_item_first_sale'] = row.date_block_num - first_date_block_num
        if row.item_cnt_month!=0:
            cachelast_shop_item[key] = row.date_block_num
time.time() - ts

#### Fast code for making time since first / last item sale features
Groupby iterator -> dictionary list -> feature dataframe -> merge

In [None]:
ts = time.time()

cachefirst_item = {}
cachelast_item = {}

storelist = []

# Use a groupby object to iterate over all date block / item combinations in the training data
gruppy = train[['date_block_num','shop_id','item_id','item_cnt_month']].groupby(['date_block_num','item_id'])

for group, items in gruppy:
    
    date_block_num = group[0]
    item_id = group[1]
    
    # Use item_id as a unique key for the cache dictionaries
    key = item_id
    if key not in cachefirst_item:
        cachefirst_item[key] = date_block_num
        cachelast_item[key] = date_block_num
    else:
        last_date_block_num = cachelast_item[key]
        first_date_block_num = cachefirst_item[key]
        # Features are stored in a list of dictionaries which is used to construct a dataframe, as this is much faster than appending features row-wise
        tempdict = {'date_block_num':date_block_num,
                   'item_id':item_id,
                   'item_last_sale': date_block_num - last_date_block_num,
                    'item_first_sale': date_block_num - first_date_block_num}
        cachelast_item[key] = date_block_num
        storelist.append(tempdict)

# Create a dataframe from the list
df = pd.DataFrame(storelist)

# Merge the new dataframe with main train frame
train = train.merge(df, on=['date_block_num','item_id'], how='left')

# Fill missing values with 0
train[['item_last_sale','item_first_sale']] = train[['item_last_sale','item_first_sale']].fillna(0)

time.time() - ts

#### Fast code for making time since first / last shop-item sale features

In [None]:
ts = time.time()

# Shop / item combination section
cachefirst_shop_item = {}
cachelast_shop_item = {}

storelist = []

# Use a groupby object to iterate over all date block / shop / item combinations in the training data
gruppy = train[['date_block_num','shop_id','item_id','item_cnt_month']].groupby(['date_block_num','shop_id','item_id'])

for group, items in tqdm(gruppy):
    
    date_block_num = group[0]
    shop_id = group[1]
    item_id = group[2]
    
    # Use item_id-shop_id combination as a unique key for the cache dictionaries
    key = str(item_id)+' '+str(shop_id)
    if key not in cachefirst_shop_item:
        # Update caches if item_cnt_month > 0
        # item_cnt_month is at final position in items frame (each group is a single row)
        if items.iat[0,-1]>0:
            cachefirst_shop_item[key] = date_block_num
            cachelast_shop_item[key] = date_block_num

    else:
        last_date_block_num = cachelast_shop_item[key]
        first_date_block_num = cachefirst_shop_item[key]
        # Features are stored in a list of dictionaries which is used to construct a dataframe, as this is much faster than appending features row-wise
        tempdict = {'date_block_num':date_block_num,
                    'shop_id':shop_id,
                   'item_id':item_id,
                   'shop_item_last_sale': date_block_num - last_date_block_num,
                    'shop_item_first_sale': date_block_num - first_date_block_num}
        if items.iat[0,-1]>0:
            cachelast_shop_item[key] = date_block_num
        storelist.append(tempdict)
        
# Create a dataframe from the list
df = pd.DataFrame(storelist)

# Merge the new dataframe with main train frame
train = train.merge(df, on=['date_block_num','shop_id','item_id'], how='left')

# Fill missing values with 0
train[['shop_item_last_sale','shop_item_first_sale']] = train[['shop_item_last_sale','shop_item_first_sale']].fillna(0)

del storelist, cachelast_shop_item, cachefirst_shop_item

time.time() - ts

## Shop age (date blocks since first appearance of shop)

In [None]:
# Months since first appearance of item, calculated by a groupby-transform operation (fast!)
df['shop_age'] = df.groupby('shop_id')['date_block_num'].transform(lambda x: x - x.min())

## Percentage change in an aggregate feature over a specified period
(e.g. percentage change in total shop revenue compared to 3 months previously)
Feature lagged by one month to maintain validity of time splits

Shop revenue change month on month \
Total item sales month on month

In [None]:
def add_pct_change(df, group_feats, quantity='item_cnt_month', agg_function='sum', periods=1, fill_method=None, fill_inf=True, clip_value=None):
    '''
    Adds a column of month-to-month proportion change values for an aggregate sum of a specific feature
    "quantity" grouped by feature "group_feats".
    
    Arguments:
    df (Dataframe): dataframe to add feature to
    group_feats (str or list): column or list of columns to group items by when generating aggregate features
    quantity (str): column to generate aggregate feature on
    agg_function (str): aggregate function to use, possible values "sum" or "mean"
    periods (int or str): time deltas to use when calculating percentage changes of the aggregate feature
    fill_method: argument passed to the fill_method argument of pandas.Series.pct_change(). NOTE this does
    not prevent NaNs existing in the generated column
    clip_value: positive absolute value used to set the negative and positive clip boundaries of the new feature
    '''
    
    # Put string arguments for arguments "periods" and "group_feats" into lists
    if type(periods) not in (int, list):
        raise TypeError
    if type(periods)==int:
        periods = [periods]
        
    if type(group_feats) not in (str, list):
        raise TypeError
    if type(group_feats)==str:
        group_feats = [group_feats]

    # group_feats_full: list of grouping columns + time column
    group_feats_full = ['date_block_num'] + group_feats
    
    # Create a template index of all group feature values in all months (original df misses levels in months where they have no sales)
    idx = pd.MultiIndex.from_product([df[col].unique() for col in group_feats_full],
                                    names=group_feats_full)
    template = pd.DataFrame(index=idx)
    template = template.sort_index()
    
    # Create aggregate feature and merge the results with the template
    if agg_function=='sum':
        aggs = df.groupby(group_feats_full)[quantity].sum()
    elif agg_function=='mean':
        aggs = df.groupby(group_feats_full)[quantity].mean()
    else:
        raise ValueError("Invalid agg_function!")
    template = template.merge(aggs, on=group_feats_full, how='left')
    
    # Generate pct_change feature for each specified period, clip values then add to df with a lag of 1
    for period in periods:
        feat_name = '-'.join(group_feats + [quantity] + [agg_function] + ['delta'] + [str(period)])
        template[feat_name] = (template.groupby(group_feats)[quantity]
                                .transform(lambda x: x.pct_change(periods=period, fill_method=fill_method)))
        
        # Sometimes inf values are created because of anomalies
        if fill_inf:
            template.loc[template[feat_name]==np.inf,feat_name] = np.nan
            template[feat_name] = template.groupby(group_feats)[feat_name].fillna(method=fill_method)
        
        # Censor large values
        if clip_value is not None:
            template[feat_name] = template[feat_name].clip(lower=-clip_value, upper=clip_value)
        
        # Add to df with a lag of 1
        if 'date_block_num' not in template.columns:
            template = template.reset_index()
        template['date_block_num'] += 1
        df = df.merge(template.loc[:,['date_block_num'] + group_feats + [feat_name]], on=group_feats_full, how='left')
    
    return df

## (Lagged) sales of items with item_id 1 above or 1 below item

In [49]:
def add_neighbor_item_features(matrix, feature, group= ['item_id']):
    # Item_id minus 1
    f = matrix.groupby(["date_block_num"] + group)[feature].agg("mean")
    name_minus = "minus1_" + feature
    f.name = name_minus
    f = f.reset_index(["date_block_num"] + group)
    f["item_id"] = f["item_id"] + 1
    f['date_block_num'] += 1
    matrix = matrix.merge(f, on=["date_block_num"] + group, how='left')
    matrix[name_minus] = matrix[name_minus].fillna(99)
    # Item_id plus 1
    name_plus = "plus1_" + feature
    f = f.rename(columns={name_minus:name_plus})
    f["item_id"] = f["item_id"] - 2
    f['date_block_num']
    matrix = matrix.merge(f, on=["date_block_num"] + group, how='left')
    matrix[name_plus] = matrix[name_plus].fillna(99)
    return matrix

## Select features according to feature importances in a trained lightgbm booster

In [None]:
def dropxfeatures(booster, n_drop):
    ''' Return a list of columns with the bottom n_drop feature importances removed.'''
    allfeats = booster.feature_name_
    featimportances = booster.feature_importances_
    idx = featimportances.argsort()[n_drop:]
    return list(np.array(allfeats)[idx])

## Split a musical item name into artist and title strings from musical items in the Kaggle "predict future sales" competition

In [46]:
def artist_title_split(item_name):
    '''
    Splits an item name string for a music item into an artist and title string.
    Not perfect.
    '''
    import re

    def name_correction(x):
        x = x.partition('[')[0]
        x = x.partition('(')[0]
        x = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', x)
        x = x.replace('  ', ' ')
        x = x.strip()
        return x

    def findnumericorvolidx(word_list, start):
        # Find the first string item in a list which is either numeric or begins with "Vol"
        numeric_or_vol_list = [word.isnumeric() | (word[:3]=='Vol') for word in word_list]
        try:
            title_start_idx = numeric_or_vol_list[start:].index(True) + start
        except ValueError:
            title_start_idx = None
        return title_start_idx

    name_clean = name_correction(item_name)
#     print(name_clean)
    word_list = name_clean.split()
    # Planet music case
    if name_clean[:12] == "Planet Music":
        artist = "Planet Music"
        title = " ".join(word_list[2:])
    # V/A case
    elif (name_clean[:3] == "V A"):
        title_start_idx = findnumericorvolidx(word_list, 2)
        artist = " ".join(word_list[2:title_start_idx])
        if title_start_idx==None:
            title=None
        else:
            title = " ".join(word_list[title_start_idx:])
    # СБ case
    elif (name_clean[:2] == "СБ"):
        title_start_idx = findnumericorvolidx(word_list, 1)
        artist = " ".join(word_list[1:title_start_idx])
        if title_start_idx==None:
            title=None
        else:
            title = " ".join(word_list[title_start_idx:])

    # Single word after name cleaning case       
    elif len(word_list)==1:
        artist = word_list[0]
        title = None

    # Normal case
    else:
#         start_points = [((len(word)>1) & (word.istitle())) | word.isnumeric() for word in word_list]
        start_points = [(word.istitle() | word.isnumeric() | word.islower()) for word in word_list]
#         start_points = [(word.istitle() | word.isnumeric()) & (len(word)>1) for word in word_list]

        try:
            title_start_idx = start_points[1:].index(True) + 1
        except ValueError:
            title_start_idx = len(start_points)-1

        artist = " ".join(word_list[:title_start_idx])
        title = " ".join(word_list[title_start_idx:])
    
    return (artist, title)

Try out the above function below

music_categories = [55, 56, 57, 58, 59]
music_items = items.loc[items.item_category_id.isin(music_categories),:].copy()
# music_items.to_csv('music_items.csv')

item_id = 17972
item_name = music_items.loc[music_items.item_id==item_id,'item_name'].item()

print(item_name)
artist, title = artist_title_split(item_name)
    
print(f"Artist: {artist}, title: {title}")
print(f"Just artist: {artist_title_split(item_name)[0]}")

## Rolling and exponential mean encodings

The moving and rolling average features are generated from the day-level sales_train dataframe which has additional zero item_cnt_day placeholder entries at the beginning and end of each set of unique shop-item combinations. The following cell creates or loads this.

In [None]:
def create_train_with_start_endpoints(train):
    # Need datetime index for this, moving averages are calculated per-day
    train["date"] = pd.to_datetime(train["date"], format="%d.%m.%Y")
    train = train.set_index("date", drop=False)
    # We want to create averages until the end of the shop's life,
    # for all item_id-shop combinations add zero item count items
    # with the last date that an item was sold in the shop
    storelist1 = []
    storelist2 = []
    start_last_month = pd.Timestamp("2015-10-01")
    end_last_month = pd.Timestamp("2015-10-31")
    for shop_id in tqdm(train.shop_id.unique()):
        first_shop_sale_date = train.loc[
            train.shop_id == shop_id, "date"
        ].min()
        last_shop_sale_date = train.loc[
            train.shop_id == shop_id, "date"
        ].max()
        # If last sale is in last month of training data, add placeholder
        # at the end of the month so timeseries extends to the beginning
        # of the test data
        if last_shop_sale_date >= start_last_month:
            last_shop_sale_date = end_last_month
        first_shop_sale_date_block_num = train.loc[
            train.shop_id == shop_id, "date_block_num"
        ].min()
        last_shop_sale_date_block_num = train.loc[
            train.shop_id == shop_id, "date_block_num"
        ].max()
        for item_id in train[train.shop_id == shop_id].item_id.unique():
            first_item_sale_date = train.loc[
                train.item_id == item_id, "date"
            ].min()
            first_item_sale_date_block_num = train.loc[
                train.item_id == item_id, "date_block_num"
            ].min()
            endpoint_null_sale = {
                "date": last_shop_sale_date,
                "shop_id": shop_id,
                "item_id": item_id,
                "item_cnt_day": 0.0,
                "date_block_num": last_shop_sale_date_block_num,
            }
            startpoint_date = max(
                first_shop_sale_date, first_item_sale_date
            )
            startpoint_date_block_num = max(
                first_shop_sale_date_block_num,
                first_item_sale_date_block_num,
            )
            startpoint_null_sale = {
                "date": startpoint_date,
                "shop_id": shop_id,
                "item_id": item_id,
                "item_cnt_day": 0.0,
                "date_block_num": startpoint_date_block_num,
            }
            storelist1.append(endpoint_null_sale)
            storelist2.append(startpoint_null_sale)
    # Create a dataframe of the endpoints from the list of dictionaries
    endpoint_items = pd.DataFrame(storelist1)
    endpoint_items = endpoint_items.set_index("date", drop=False)
    startpoint_items = pd.DataFrame(storelist2)
    startpoint_items = startpoint_items.set_index("date", drop=False)
    # Add the endpoints to the original dataset
    train_extra = pd.concat([startpoint_items, train, endpoint_items])
    return train_extra

train_extra = create_train_with_start_endpoints(train)

### Remove negative sales values from dataframe used to create rolling average features
These can cause some strange negative mean monthly sales values when items are returned long after they are purchased, so I'm guessing it's better to remove these.

In [169]:
train_extra = train_extra[train_extra.item_cnt_day>=0]

### Define rolling average functions

In [170]:
def rolling_average(s, readout_dates, cutoff=None):
    """ Calculates rolling averages on specified readout dates from a sparse dataseries s,
    assuming zero sales on days with no entries.

    Arguments
    ---------
    s: pandas series of scalar data with datetime index
    readout_dates: an array of pandas datetimes which specify the dates on which the value of the rolling average will be calculated
    cutoff: scalar, date offset from readout point beyond which events do not contribute to readouts.
    """
    if cutoff is None:
        cutoff = np.inf

    # Convert event dates numpy datetime64 datatype with day resolution
    event_dates = s.index.to_numpy("datetime64[D]").reshape((-1, 1))
    readout_dates = readout_dates.to_numpy("datetime64[D]").reshape((1, -1))

    # Create a n_event_dates * n_readout_dates array of date offsets between the readout dates and event dates
    offsets = (readout_dates - event_dates).astype("int")
    # Create a binary mask for the values used to calculate readout values (i.e. no future timepoints or past timepoints beyond the cutoff)
    selection_mask = (offsets >= 0) & (offsets < cutoff)
    # Calculate the window length as either the maximum window length or (readout date) - (start of series)
    window_lengths = offsets[0, :].clip(max=cutoff)
    normalization_factors = window_lengths
    # Create a len(series) x len(readout_dates) masked array of series values by tiling
    values = np.broadcast_to(
        s.to_numpy().reshape((-1, 1)), shape=offsets.shape
    )
    values = ma.array(data=values, mask=~selection_mask)
    # Sume the unmasked values and divide by the window length to get the mean
    readout_values = values.sum(axis=0) / normalization_factors
    readout_values = readout_values.data
    # Return a dataframe
    readout_series = pd.Series(
        data=readout_values, index=readout_dates.ravel(), name=s.name
    )
    return readout_series


def rolling_average_monthly_readout(series, cutoff=None):
    '''Creates a set of monthly readout points for an series and calls
    the rolling average function using the readout points as an argument.'''

    idx = series.index
    readout_dates = pd.date_range(
        start=idx.min(), end=idx.max() + MonthEnd(0), freq="M"
    )
    readout_series = rolling_average(series, readout_dates, cutoff=cutoff)
    return readout_series

In [171]:
def rm_shop_item(cutoff, name):
    ''' Returns a multiindex series of rolling mean series of all shop_id-item_id combinations
    with window length "cutoff" and name "name" '''

    def get_resampled(shop_id, item_id, cutoff):
        s = train_extra.loc[
            (train_extra["item_id"] == item_id) & (train_extra["shop_id"] == shop_id),
            "item_cnt_day",
        ]
        s = rolling_average_monthly_readout(s, cutoff=cutoff)

        return {"item_cnt_day": s, "item_id": item_id, "shop_id": shop_id}

    storelist = []
    for shop_id in tqdm(train_extra.shop_id.unique()):
        for item_id in train_extra[train_extra.shop_id == shop_id].item_id.unique():
            storelist.append(get_resampled(shop_id, item_id, cutoff))
    pool = mp.Pool()
    storelist2 = list(pool.map(pd.DataFrame, storelist))
    feat_frame = pd.concat(storelist2)
    feat_frame = feat_frame.rename(columns={"item_cnt_day": name})
    dbms = train_extra.resample("m").date_block_num.mean()
    feat_frame = feat_frame.merge(dbms, how="left", left_index=True, right_index=True)
    feat_frame = feat_frame.set_index(["item_id", "shop_id", "date_block_num"])
    feat_series = feat_frame[name]
    return feat_series

In [172]:
def rm_single_feature(cutoff, encode_feature, return_name):
    """ Encodes shop_id as a rolling mean of item_cnt_day,
    sampled at the end of each month, with a window size of cutoff days.

    Parameters
    ----------
    cutoff: int, window length in days
    encode_feature: str, return_name of feature to be encoded
    return_name: name of feature that will be returned
    """

    def get_resampled(encode_feature, feature_id, cutoff):
        s = train_extra.loc[(train_extra[encode_feature] == feature_id), "item_cnt_day"]
        s = rolling_average_monthly_readout(s, cutoff=cutoff)

        return {"item_cnt_day": s, encode_feature: feature_id}

    storelist = []
    for feature_id in tqdm(train_extra[encode_feature].unique()):
        storelist.append(get_resampled(encode_feature, feature_id, cutoff))
    pool = mp.Pool()
    storelist2 = list(pool.map(pd.DataFrame, storelist))
    feat_frame = pd.concat(storelist2)
    feat_frame = feat_frame.rename(columns={"item_cnt_day": return_name})
    dbms = train_extra.resample("m").date_block_num.mean()
    feat_frame = feat_frame.merge(dbms, how="left", left_index=True, right_index=True)
    feat_frame = feat_frame.set_index([encode_feature, "date_block_num"])
    feat_series = feat_frame[return_name]
    return feat_series

In [173]:
def item_id_group_merge(df, feat_series):
    ''' Groups by date_block_num and item_id, merges with dataframe with lag 1 '''
    feat_series = feat_series[
        feat_series.index.get_level_values("shop_id").isin(df.shop_id.unique())
    ]
    feat_series = feat_series.groupby(["date_block_num", "item_id"]).mean()
    feat_series.name = 'rm_item_' + feat_series.name[feat_series.name.index('win'):]
    df = add_lag_feature(df, feat_series, "item_id", 1, fillna=0, optimize_mem=False)
    return df

## Now the same but with exponential moving average features

In [None]:
def exp_moving_average(s, readout_dates, alpha=None, halflife=None, cutoff=None):
    """ Calculates exonential moving averages as specified readout dates as a closed form function of the daily sales data,
    assuming zero sales on days with no entries. Either alpha or halflife parameters must be provided, see
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html for definition.
    
    Arguments
    s: pandas series of scalar data with datetime index
    readout_dates: an array of pandas datetimes which specify the dates on which the value of the moving average will be calculated
    alpha: scalar
    halflife: scalar
    cutoff: scalar, date offset from readout point beyond which events do not contribute to readouts, to potentially speed up calculation over long time series.
    """
    if cutoff is None:
        cutoff = np.inf
    if halflife is not None:
        alpha = 1 - np.e ** (-np.log(2) / halflife)
    if alpha is None:
        raise ValueError("Either alpha or halflife parameters must be provided")
    # Convert event dates numpy datetime64 datatype with day resolution
    event_dates = s.index.to_numpy("datetime64[D]").reshape((-1, 1))
    readout_dates = readout_dates.to_numpy("datetime64[D]").reshape((1, -1))

    # Create a n_event_dates * n_readout_dates array of date offsets between the readout dates and event dates
    offsets = (readout_dates - event_dates).astype("int")
    # Create a binary mask for the values used to calculate readout values (i.e. no future timepoints or past timepoints beyond the cutoff)
    selection_mask = (offsets >= 0) & (offsets < cutoff)
    # Create a masked array to hold the values of coefficients of the series values for the calculation of moving averages at the readoutpoints
    coeffs = ma.zeros(shape=offsets.shape)
    coeffs.mask = ~selection_mask
    # Calculate the coefficients of the past values as a function of their offset (i.e. the number of timepoints they are in the past)
    # See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html for equation
    coeffs[selection_mask] = (1 - alpha) ** offsets[selection_mask]
    # For the definition of the normalization factor see # See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html for equation
    # Calculate the length of the window being averaged over as larger of either the cutoff delay or the time since the first value in the series
    window_lengths = offsets[0, :].clip(max=cutoff)
    # For the closed form formula for the sum of 0 to n powers of a value, see https://en.wikipedia.org/wiki/Geometric_series#Closed-form_formula
    normalization_factors = (1 - (1 - alpha) ** (window_lengths + 1)) / (1 - (1 - alpha))
    # Create the weighted values as the product of original values and coefficients, then normalize to get the readout values
    weighted_values = coeffs * s.to_numpy().reshape((-1, 1))
    readout_values = weighted_values.sum(axis=0) / normalization_factors
    readout_values = readout_values.data
    # Return a dataframe
    readout_series = pd.Series(
        data=readout_values, index=readout_dates.ravel(), name=s.name
    )
    return readout_series


def moving_average_monthly_readout(series, alpha=None, halflife=None, cutoff=None):
    """Creates a set of monthly readout points for an series and calls
    the moving average function using the readout points as an argument."""
    idx = series.index
    readout_dates = pd.date_range(start=idx.min(), end=idx.max() + MonthEnd(0), freq="M")
    readout_series = exp_moving_average(
        series, readout_dates, alpha=alpha, halflife=halflife, cutoff=cutoff
    )
    return readout_series


def ema_shop_item(halflife, name):
    """ Returns a multiindex series of exponential moving mean series for all shop_id-item_id combinations,
    with window length "cutoff" and name "name" """

    def get_resampled(shop_id, item_id, halflife):
        s = train_extra.loc[
            (train_extra["item_id"] == item_id) & (train_extra["shop_id"] == shop_id),
            "item_cnt_day",
        ]
        s = moving_average_monthly_readout(s, halflife=halflife)
        return {"item_cnt_day": s, "item_id": item_id, "shop_id": shop_id}

    storelist = []
    for shop_id in tqdm(train_extra.shop_id.unique()):
        for item_id in train_extra[train_extra.shop_id == shop_id].item_id.unique():
            storelist.append(get_resampled(shop_id, item_id, halflife))
    pool = mp.Pool()
    storelist2 = list(pool.map(pd.DataFrame, storelist))
    feat_frame = pd.concat(storelist2)
    feat_frame = feat_frame.rename(columns={"item_cnt_day": name})
    dbms = train_extra.resample("m").date_block_num.mean()
    feat_frame = feat_frame.merge(dbms, how="left", left_index=True, right_index=True)
    feat_frame = feat_frame.set_index(["item_id", "shop_id", "date_block_num"])
    feat_series = feat_frame[name]
    return feat_series


def ema_single_feature(halflife, encode_feature, return_name):
    """ Encodes shop_id as a rolling mean of item_cnt_day,
    sampled at the end of each month, with a window size of cutoff days.

    Parameters
    ----------
    cutoff: int, window length in days
    encode_feature: str, return_name of feature to be encoded
    return_name: name of feature that will be returned
    """

    def get_resampled(encode_feature, feature_id, halflife):
        s = train_extra.loc[(train_extra[encode_feature] == feature_id), "item_cnt_day"]
        s = moving_average_monthly_readout(s, halflife=halflife)

        return {"item_cnt_day": s, encode_feature: feature_id}

    storelist = []
    for feature_id in tqdm(train_extra[encode_feature].unique()):
        storelist.append(get_resampled(encode_feature, feature_id, halflife))
    pool = mp.Pool()
    storelist2 = list(pool.map(pd.DataFrame, storelist))
    feat_frame = pd.concat(storelist2)
    feat_frame = feat_frame.rename(columns={"item_cnt_day": return_name})
    dbms = train_extra.resample("m").date_block_num.mean()
    feat_frame = feat_frame.merge(dbms, how="left", left_index=True, right_index=True)
    feat_frame = feat_frame.set_index([encode_feature, "date_block_num"])
    feat_series = feat_frame[return_name]
    return feat_series


def item_id_group_merge(df, feat_series):
    """ Groups by date_block_num and item_id, merges with dataframe with lag 1 """
    feat_series = feat_series[
        feat_series.index.get_level_values("shop_id").isin(df.shop_id.unique())
    ]
    feat_series = feat_series.groupby(["date_block_num", "item_id"]).mean()
    feat_series.name = "ema_item_" + feat_series.name[feat_series.name.index("hl") :]
    df = add_lag_feature(df, feat_series, "item_id", 1, fillna=0, optimize_mem=False)
    return df

## Similar name item sales feature for new items

In [8]:
def item_name_similarity_frame(itemsdf):
    # Makes a dataframe of similarity values between item names calculated with FuzzyWuzzy
    # itemsdf must have a supercategory_id field
    import re
    from itertools import combinations

    import scipy.sparse as sp
    from fuzzywuzzy import fuzz

    def strip_sq_brackets(string):
        return re.sub(r"\[.*?\]", "", string)

    def strip_rd_brackets(string):
        return re.sub(r"\(.*?\)", "", string)

    items["item_name"] = items["item_name"].apply(strip_sq_brackets).apply(strip_rd_brackets)

    itnames = items.item_name.to_list()
    itsupcats = items.supercategory_id.to_list()
    pairs = combinations(items.index, 2)
    sims = sp.dok_matrix((len(itnames), len(itnames)), dtype=np.int8)

    for id1, id2 in tqdm(
        pairs, total=len(itnames) * (len(itnames) - 1) / 2, desc="Calculating similarity values"
    ):
        if itsupcats[id1] != itsupcats[id2]:
            pass
        else:
            sims[id1, id2] = fuzz.token_sort_ratio(itnames[id1], itnames[id2], force_ascii=False)

    csims = sims.tocsr()
    csims = csims + csims.T
    csims = pd.DataFrame.sparse.from_spmatrix(csims)
    return csims


def make_sim_item_features(matrix, sim_frame, return_fields, sim_thresh=50, max_n=5, max_item_age=12):
    storelist = []
    mean_fields = [s + "_all_shops" for s in return_fields]
    for date_block_num in tqdm(range(3, 35), "Generating similar item name features"):
        for item_age in np.sort(matrix.loc[matrix.date_block_num == date_block_num, "item_age"].unique()):

            def get_sim_item_features(item_ids):
                slist = []
                for item_id in item_ids:
                    sim_items = sim_frame.loc[item_id, item_ids_past].nlargest(max_n)
                    sim_items = sim_items[sim_items>sim_thresh]
                    if len(sim_items)==0:
                        pass
                    else:
                        sim_item_values = (
                            past_months.loc[(slice(None), sim_items.index), return_fields]
                            .groupby("shop_id")[return_fields]
                            .mean()
                        )
                        sim_item_values[mean_fields] = sim_item_values.mean()
                        sim_item_values["item_id"] = item_id
                        slist.append(sim_item_values)
                sframe = pd.concat(slist)
                sframe["date_block_num"] = date_block_num
                sframe = sframe.reset_index()
                return sframe
            if item_age < date_block_num:
                if item_age >= max_item_age:
                    past_months = (
                        matrix.query(f"date_block_num<{date_block_num} & date_block_num>1 & item_age>={item_age}")
                        .groupby(["shop_id", "item_id"])[return_fields]
                        .mean()
                    )
                    item_ids_past = past_months.index.get_level_values("item_id").unique()
                    item_ids = matrix.query(
                        f"date_block_num=={date_block_num} & item_age>={item_age}"
                    ).item_id.unique()
                    sframe_new = get_sim_item_features(item_ids)
                    storelist = storelist + [sframe_new]
                    break
                else:
                    past_months = (
                        matrix.query(f"date_block_num<{date_block_num} & date_block_num>1 & item_age=={item_age}")
                        .groupby(["shop_id", "item_id"])[return_fields]
                        .mean()
                    )
                    item_ids_past = past_months.index.get_level_values("item_id").unique()
                    item_ids = matrix.query(
                        f"date_block_num=={date_block_num} & item_age=={item_age}"
                    ).item_id.unique()
                    sframe_new = get_sim_item_features(item_ids)
                    storelist = storelist + [sframe_new]
    sim_item_features = pd.concat(storelist)
    sim_item_field_names = {s: "sim_item_name_" + s for s in return_fields + mean_fields}
    sim_item_features = sim_item_features.rename(columns=sim_item_field_names)
    return sim_item_features


def add_sim_item_features(matrix, sim_item_features, fill_na_val=None):
    oldcols = matrix.columns
    matrix = matrix.merge(sim_item_features, on=["date_block_num", "shop_id", "item_id"], how="left")
    newcols = matrix.columns.difference(oldcols)
    if fill_na_val is not None:
        matrix[newcols] = matrix[newcols].fillna(fill_na_val)
    return matrix

## Extract artist name or comic type with regex and add as feature

### Artist name feature for music categories and comic type feature for the books - comics category

In [51]:
def add_name_features(matrix):
    # This extracts artist names for music categories and adds them as a feature.
    # Assumes that the items table is in data/items.csv
    def extract_artist(st):
        import re

        st = st.strip()
        if st.startswith("V/A"):
            artist = "V/A"
        elif st.startswith("СБ"):
            artist = "СБ"
        else:
            # Retrieves artist names using the double space or all uppercase pattern
            mus_artist_dubspace = re.compile(r".{2,}?(?=\s{2,})")
            match_dubspace = mus_artist_dubspace.match(st)
            mus_artist_capsonly = re.compile(r"^([^a-zа-я]+\s)+")
            match_capsonly = mus_artist_capsonly.match(st)
            candidates = [match_dubspace, match_capsonly]
            candidates = [m[0] for m in candidates if m is not None]
            # Sometimes one of the patterns catches some extra words so choose the shortest one
            if len(candidates):
                artist = min(candidates, key=len)
            else:
                # If neither of the previous patterns found something, use the dot-space pattern
                mus_artist_dotspace = re.compile(r".{2,}?(?=\.\s)")
                match = mus_artist_dotspace.match(st)
                if match:
                    artist = match[0]
                else:
                    artist = ""
        artist = artist.upper()
        artist = re.sub(r"[^A-ZА-Я ]||\bTHE\b", "", artist)
        artist = re.sub(r"\s{2,}", " ", artist)
        artist = artist.strip()
        return artist

    items = pd.read_csv("data/items.csv")
    music_categories = [55, 56, 57, 58, 59, 60]
    items.loc[
        items.item_category_id.isin(music_categories), "item_name_extra"
    ] = items.loc[items.item_category_id.isin(music_categories), "item_name"].apply(
        extract_artist
    )

    def get_comic_type(string):
        # The item_name in the comic category starts with the type of comic (manga etc)
        string = string.strip()
        brk = string.find(" ")
        if brk > 0:
            comic_type = string[:brk]
        else:
            comic_type = ""
        return comic_type

    comic_category = [47]
    items.loc[items.item_category_id.isin(comic_category), "item_name_extra"] = items.loc[
        items.item_category_id.isin(comic_category), "item_name"
    ].apply(get_comic_type)

    matrix = matrix.merge(
        items.loc[
            items.item_category_id.isin(music_categories + comic_category),
            ["item_id", "item_name_extra"],
        ],
        on="item_id",
        how="left",
    )
    return matrix

#### More spare regex code, may be used later

In [172]:
# Functions for name feature processing
import re
punct_num_set = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0-9]'
sq_bracket_extract = re.compile(r'\[(.+?)\]') # Square brackets regex
rd_bracket_extract = re.compile(r'\((.+?)\)') # Round brackets regex

def remove_non_letter(string):
    return re.sub(punct_num_set, ' ', string)

def bracket_content_process(s):
    s = [s.split(',') for s in s]
    s = list(chain.from_iterable(s))
    s = [remove_non_letter(w) for w in s]
    s = [w.strip() for w in s]
    s = '|'.join(s)
    return s

# Use the two functions below to extract brackets
def extract_sq_bracket_contents(string):
    s = sq_bracket_extract.findall(string)
    s = bracket_content_process(s)
    return s

def extract_rd_bracket_contents(string):
    s = rd_bracket_extract.findall(string)
    s = bracket_content_process(s)
    return s

def strip_sq_brackets(string):
    return re.sub(r'\[.*?\]', '', string)

def strip_rd_brackets(string):
    return re.sub(r'\(.*?\)', '', string)

# redundant_terms = [
#     pc,
#     pс,
#     рс,
#     xbox,
#     ps3
#     psp
# ]

# Dataset splitting and manipulation snippets

In [None]:
def split_train_test(df, test_date_block_num):
    # Split off last month of a dataset as a test set, everything before as the train set
    test = df.loc[df.date_block_num==test_date_block_num,:]
    train = df.loc[df.date_block_num<test_date_block_num,:]
    return (train, test)

def create_full_train_test(sales_train, test):
    '''
    Create a train set with all items and all shops in all date blocks (i.e. months).
    Note this is not the exact sampling method used in the test set, as the test set has only the cartesian product of unique items and shops in the test month.
    '''
        # Aggregate sales to the month level
    sales_train_grouped = sales_train.groupby(['date_block_num','item_id','shop_id']).agg({'item_cnt_day':'sum','item_price':'mean'})
    sales_train_grouped = sales_train_grouped.rename(columns={'item_cnt_day':'item_cnt','item_price':'item_price_mean'})

    # Create sets of items, shops and date blocks
    item_ids = set(sales_train.item_id).union(set(test.item_id))
    shop_ids = set(sales_train.shop_id).union(set(test.shop_id))
    date_block_nums = set(sales_train.date_block_num).union(set([34]))

    # Create all permutations as indexes
    indexdataframe = pd.DataFrame(np.array(list(itertools.product(date_block_nums,item_ids,shop_ids))),columns=['date_block_num','item_id','shop_id'])

    fulltrain = indexdataframe.merge(sales_train_grouped,how='left',on=['date_block_num','item_id','shop_id'])
    fulltrain.item_cnt = fulltrain.item_cnt.fillna(0)

    # Verification code
    # after = fulltrain.loc[fulltrain.date_block_num<34,:].groupby('date_block_num').item_cnt.sum()
    # before = sales_train.groupby('date_block_num').item_cnt_day.sum()
    # (before == after).all()

    fulltrain = fulltrain.merge(items, on='item_id', how='left')
    fulltrain = fulltrain.drop(columns=['item_name','item_price_mean'])

    # Split train and test again
    train, test = split_train_test(fulltrain, 34)

    return train

def xysplit(train, test):
    # Split a train and test set into into x and y sets, with item_cnt as the target y variable
    y_train = train.item_cnt
    X_train = train.drop(columns=['item_cnt'])
    y_test = test.item_cnt
    X_test = test.drop(columns=['item_cnt'])
    return (X_train, y_train, X_test, y_test)

In [42]:
def create_testlike_train(sales_train, test = None):
    # Create a date_block_num / item_id / shop_id index using all combinations of item_id and shop_id occurring within each date_block
    # Optionally concatenate the test items to the end
    indexlist = []
    for i in sales_train.date_block_num.unique():
        x = itertools.product([i],
                              sales_train.loc[sales_train.date_block_num==i].item_id.unique(),
                              sales_train.loc[sales_train.date_block_num==i].shop_id.unique())
        indexlist.append(np.array(list(x)))

    df = pd.DataFrame(data=np.concatenate(indexlist, axis=0),
                      columns=['date_block_num', 'item_id', 'shop_id'])
    
    # Add revenue column to sales_train
    sales_train['item_revenue_day'] = sales_train['item_price'] * sales_train['item_cnt_day']
    
    # Aggregate item_id / shop_id item_cnts and revenue at the month level
    sales_train_grouped = sales_train.groupby(['date_block_num','item_id','shop_id']).agg(
                            item_cnt_month=pd.NamedAgg(column='item_cnt_day', aggfunc='sum'),
                            item_revenue_month=pd.NamedAgg(column='item_revenue_day', aggfunc='sum'))
    
    # Merge the grouped data with the index
    df = df.merge(sales_train_grouped,how='left',on=['date_block_num','item_id','shop_id'])

    if test is not None:
        df = pd.concat([df, test])
        df = df.drop(columns='ID')
        df['date_block_num'] = df['date_block_num'].fillna(34)
    
    # Fill empty item_cnt entries with 0
    df.item_cnt_month = df.item_cnt_month.fillna(0)
    df.item_revenue_month = df.item_revenue_month.fillna(0)
    
    return df.astype('int32')

## Function to optimize the memory use of a dataframe by downcasting datatypes

In [None]:
# Thanks You Guillaume Martin for the Awesome Memory Optimizer!
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else: df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


## monthwise time series cross validation 

In [3]:
import pandas as pd
import lightgbm as lgbm
matrix = pd.read_pickle("checkpoints/matrixcheckpoint_0.pk1")

from sklearn.model_selection import cross_validate

def time_split(start_date_block, last_date_block):
    for test_date_block in range(start_date_block, last_date_block+1):
        train_idx = matrix['date_block_num']<test_date_block
        test_idx = matrix['date_block_num']==test_date_block
        yield train_idx, test_idx

start_date_block = 30
last_date_block = 33
X = matrix.drop(columns='item_cnt_month')
y = matrix['item_cnt_month']
ts = time_split(start_date_block, last_date_block)
params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "device_type": "gpu",
    "n_jobs": 5,
}
booster = lgbm.LGBMRegressor(**params)

scores = cross_validate(booster, X, y, scoring='neg_root_mean_squared_error', cv=ts)
print(scores)

{'fit_time': array([22.60605645, 22.62392259, 22.73343515, 24.87741923]), 'score_time': array([0.34364986, 0.34560537, 0.3004148 , 0.34813166]), 'test_score': array([-2.06428627, -1.96297644, -1.59296444, -1.28655077])}


## Feature selection with scikit-learn's cross-validated recursive feature elimination (RFECV)

In [None]:
from sklearn.feature_selection import RFECV


def time_split(start_date_block, last_date_block):
    for test_date_block in range(start_date_block, last_date_block + 1):
        train_idx = matrix["date_block_num"] < test_date_block
        test_idx = matrix["date_block_num"] == test_date_block
        yield train_idx, test_idx


cat_cols = [
    "shop_id",
    "item_category_id",
    "city_code",
    "month",
]


start_date_block = 33
last_date_block = 33
X = matrix.drop(
    columns=["item_revenue_month",
    "item_price",
    "item_cnt_month_unclipped",
    "item_cnt_day_avg",
    "new_item",
    "new_shop",
    "item_age",
    "shop_age",
    "digital",
    "interaction_new_item_digital",
    "item_cnt_month",]
)
y = matrix["item_cnt_month"]
ts = time_split(start_date_block, last_date_block)
params = {
    "n_estimators": 40,
    "learning_rate": 0.1,
    "device_type": "gpu",
    'metric': 'rmse',
    "n_jobs": 11,
    'num_leaves': 1023,
    'min_child_samples':10,
    'colsample_bytree':0.7,
}
booster = lgbm.LGBMRegressor(**params)
selector = RFECV(booster, step=1, cv=ts, scoring =  'neg_root_mean_squared_error')
selector = selector.fit(X, y)

selector.ranking_

# Model implementation

In [72]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

basic = [
    "shop_id",
    "new_item",
    "new_shop",
    "item_category_id",
    "digital",
    "supercategory_id",
    "platform_id",
    "city_code",
    "month",
    "year",
]

# initialize data
# initialize Pool
train_pool = Pool(X_train, y_train, cat_features=basic)
test_pool = Pool(X_valid, cat_features=basic)

# specify the training parameters
model = CatBoostRegressor(
    #     iterations=200,
    #     learning_rate=1,
    early_stopping_rounds=30,
    loss_function="RMSE",
    task_type="GPU",
)
# train the model
model.fit(
    train_pool, eval_set=(X_valid, y_valid),
)
# make the prediction using the resulting model
preds = model.predict(test_pool)
rmse = mean_squared_error(y_valid, preds, squared=False)

print(rmse)

Learning rate set to 0.111577
0:	learn: 1.3947405	test: 1.3150546	best: 1.3150546 (0)	total: 728ms	remaining: 12m 7s
1:	learn: 1.3866864	test: 1.3036687	best: 1.3036687 (1)	total: 1.35s	remaining: 11m 14s
2:	learn: 1.3642924	test: 1.2876664	best: 1.2876664 (2)	total: 2.26s	remaining: 12m 31s
3:	learn: 1.3640395	test: 1.2875689	best: 1.2875689 (3)	total: 3.27s	remaining: 13m 34s
4:	learn: 1.3599991	test: 1.2823933	best: 1.2823933 (4)	total: 4s	remaining: 13m 15s
5:	learn: 1.3492494	test: 1.2692452	best: 1.2692452 (5)	total: 4.68s	remaining: 12m 55s
6:	learn: 1.3381118	test: 1.2601970	best: 1.2601970 (6)	total: 5.64s	remaining: 13m 19s
7:	learn: 1.3315933	test: 1.2561025	best: 1.2561025 (7)	total: 6.01s	remaining: 12m 25s
8:	learn: 1.3171010	test: 1.2472730	best: 1.2472730 (8)	total: 6.65s	remaining: 12m 12s
9:	learn: 1.3132175	test: 1.2443963	best: 1.2443963 (9)	total: 7.2s	remaining: 11m 52s
10:	learn: 1.3091346	test: 1.2418928	best: 1.2418928 (10)	total: 8.06s	remaining: 12m 4s
11:	le

# Postprocessing 

In [67]:
print(f"Mean predicted sales of digital items in non-digital shops is {(~X_test.shop_id.isin([12, 55])) & (X_test.digital==1)].item_cnt_month.mean()}")
print(f"Mean predicted sales of non-digital items in digital shop 55 is {X_test[(X_test.shop_id==55) & (X_test.digital==0)].item_cnt_month.mean()}")

Mean predicted sales of digital items in non-digital shops is 0.0032863835924329724
Mean predicted sales of non-digital items in digital shop 55 is 0.017218524949990644


In [91]:
from sklearn.metrics import mean_squared_error
X_valid['item_cnt_month'] = booster.predict(X_valid.drop(columns=dropcols))
rmse = mean_squared_error(y_valid, X_valid['item_cnt_month'].clip(0,20), squared=False)
print(rmse)

0.7060136244246081


In [93]:
X_valid.loc[(~X_valid.shop_id.isin([12, 55])) & (X_valid.digital==1), 'item_cnt_month'] = 0
X_valid.loc[(X_valid.shop_id==55) & (X_valid.digital!=1), 'item_cnt_month'] = 0

In [94]:
rmse = mean_squared_error(y_valid, X_valid['item_cnt_month'].clip(0,20), squared=False)
print(rmse)

0.7059638564032783


Optional: replace the predictions for shop 36 (if any) with the predictions for shop 37 from the same city.

In [149]:
shop37 = X_test.loc[X_test.shop_id==37,:]
X_test = X_test.loc[X_test.shop_id!=36,:]
shop37.loc[:,'shop_id'] = 36
X_test = pd.concat([X_test,shop37])