> Martin Fridrich, 03/2022 

# Final project


### Data loading and initial transformations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from itertools import product
import re
from fuzzywuzzy import fuzz

In [None]:
# Relative path of the directory holding the input CSV files.
DATA_DIR = "../data/final_project/"

# sales: training history and the test rows
sales_train = pd.read_csv(f"{DATA_DIR}sales_train.csv")
sales_test = pd.read_csv(f"{DATA_DIR}test.csv")

# additional info: item, item-category and shop tables
items = pd.read_csv(f"{DATA_DIR}items.csv")
item_categories = pd.read_csv(f"{DATA_DIR}item_categories.csv")
shops = pd.read_csv(f"{DATA_DIR}shops.csv")

In [None]:
# downcast dataframe
def optimize_numeric_dtypes(df):
    """Downcast the float and integer columns of ``df`` to smaller dtypes.

    The frame is modified in place and also returned, so the call works both
    as a statement and as ``df = optimize_numeric_dtypes(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    float_cols = df.select_dtypes("float").columns
    int_cols = df.select_dtypes("integer").columns
    # Assign column by column with ``df[col] = ...``.  A ``df.loc[:, cols] = ...``
    # assignment writes into the existing arrays on recent pandas versions and
    # therefore keeps the original wide dtype, silently undoing the downcast.
    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast="float")
    for col in int_cols:
        df[col] = pd.to_numeric(df[col], downcast="integer")
    return df

def optimize_categories(df):
    """Replace every object-dtype column of ``df`` with integer category codes.

    Each column is converted to a pandas categorical and replaced by its
    ``cat.codes`` (missing values become ``-1``).  The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns should be label-encoded.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    cat_cols = df.select_dtypes("object").columns
    # Per-column ``df[col] = ...`` replaces the column together with its dtype.
    # ``df.loc[:, cols] = ...`` sets values in place on recent pandas versions,
    # which leaves the integer codes stored inside object-dtype columns.
    for col in cat_cols:
        df[col] = df[col].astype("category").cat.codes
    return df

In [None]:
# duplicate shops: map each duplicate shop id onto the id that is kept
shop_duplicates_map = {0: 57, 1: 58, 11: 10, 40: 39}
sales_train["shop_id"] = sales_train["shop_id"].replace(shop_duplicates_map)
# datetime
sales_train["date"] = pd.to_datetime(sales_train["date"], format="%d.%m.%Y")
# keep only month blocks above 20 (i.e. drop blocks 0-20) due to inconsistency
# NOTE(review): the previous comment said "first 2 months", which does not match
# the ``> 20`` threshold -- confirm which cut-off is intended.
sales_train = sales_train[sales_train.date_block_num>20]
# filtering on test set: keep only shops that also appear in the test rows
sales_train = sales_train.loc[sales_train.shop_id.isin(sales_test["shop_id"].unique()), :]
# Drop training items with extreme or negative prices or sales counts
# (``.copy()`` so the ``revenue`` column below is added to an independent frame)
sales_train = sales_train[((sales_train["item_price"] > 0) & (sales_train["item_price"] < 75000)) &
    ((sales_train["item_cnt_day"] > 0) & (sales_train["item_cnt_day"] < 750))].copy()
# train cartesian product & inds
# The month blocks are sorted so the rows of every shop/item pair are laid out
# chronologically; the positional shifts that build the target and the lag
# features further down rely on that order.
shop_item_month = product(sales_train.shop_id.unique(), sales_train.item_id.unique(),
    sorted(sales_train.date_block_num.unique()))
sales_train_index = pd.DataFrame(list(shop_item_month),
    columns=["shop_id","item_id","date_block_num"])
# test inds: the test rows are placed in month block 34
sales_test_index = sales_test[["shop_id","item_id"]].copy()
sales_test_index["date_block_num"] = 34
# inds (``ignore_index=True`` already yields a fresh 0..n-1 index)
sales_index = pd.concat([sales_train_index, sales_test_index], ignore_index=True)
del sales_test, sales_train_index, sales_test_index
gc.collect()
# sales table
sales_train["revenue"] = sales_train["item_price"]*sales_train["item_cnt_day"]
# Aggregations are given by name: passing the builtin ``sum`` or
# ``pd.Series.count`` callables to ``agg`` is deprecated in pandas.
sales = sales_train.groupby(["shop_id", "item_id", "date_block_num"], as_index=False).agg(
    total_sold=("item_cnt_day", "sum"), sum_revenue=("revenue", "sum"),
    n_transactions=("item_cnt_day", "count"))
# left merge keeps the row order of ``sales_index``; combinations without any
# sales (and all test rows) get zeros
sales = sales_index.merge(sales, how="left",
    on=["shop_id", "item_id", "date_block_num"]).fillna(0)
sales = optimize_numeric_dtypes(sales)
del sales_train, sales_index; gc.collect()
sales.head(3)

# Feature engineering

In [None]:
# item groups based on adjacency and similarity
# keep only Latin/Cyrillic letters, digits and spaces, then collapse whitespace runs
items.item_name = items.item_name.apply(lambda x: re.sub(r"[^A-ZА-Яa-zа-я0-9 ]", "", x))
items.item_name = items.item_name.apply(lambda x: re.sub(r"\s+", " ", x))
# put every item next to the item of the previous row
item_names = pd.concat([items.item_id, items.item_name,
    items.item_id.shift().fillna(0), items.item_name.shift().fillna("")], axis=1)
item_names.columns = ["item_id", "item_name", "previous_item_id", "previous_item_name"]
# plain zip loop instead of a row-wise ``DataFrame.apply``: same scores, less overhead
item_names["similarity"] = [fuzz.ratio(name, previous) for name, previous in
    zip(item_names["item_name"], item_names["previous_item_name"])]
# a row opens a new group when its name is not similar enough to the previous one
item_names["group_start"] = item_names["similarity"] <= 75
item_group_map = item_names.loc[item_names.group_start, ["item_id"]].\
    rename(columns={"item_id": "group_start"})
item_group_map["item_sim_id"] = range(len(item_group_map))
# remap: only group-start rows receive an id from the merge, the rest of each
# group inherits it through the forward fill over items sorted by id
items = items.merge(item_group_map, how="left",
    left_on=["item_id"], right_on=["group_start"])
items = items.sort_values("item_id")
# ``Series.ffill()`` replaces the deprecated ``fillna(method="ffill")``
items["item_sim_id"] = items["item_sim_id"].ffill().astype("int")
del item_names, item_group_map; gc.collect()
# ``.copy()`` so the new column below is added to an independent frame
items = items[["item_id","item_name", "item_category_id", "item_sim_id"]].copy()
# name length
items["item_name_len"] = items.item_name.apply(len).astype("int16")
items.head(3)

In [None]:
# categories: split "parent - child" names into a parent and a child part
# ``partition`` cuts at the first " - " only and returns an empty tail when the
# separator is absent, so every name yields exactly [parent, child] -- also for
# names that contain the separator more than once.
cat_splits = item_categories.item_category_name.apply(
    lambda x: list(x.partition(" - ")[::2]))
item_categories = pd.concat([item_categories,
    pd.DataFrame(cat_splits.tolist(), columns=["parent_cat","child_cat"],
        index=item_categories.index)], axis=1)
# join on the category id explicitly rather than on whatever columns coincide
items = items.merge(item_categories, how="inner", on="item_category_id")
# return categorical encoding (on a copy, since the encoder assigns columns in place)
items = optimize_categories(items[["item_id", "item_category_id", "item_sim_id",
    "item_name_len", "parent_cat", "child_cat"]].copy())
# add categories
sales = sales.merge(items, how="inner", on="item_id")
del item_categories, items; gc.collect()
sales.head(3)

In [None]:
# date features
def working_days_count(from_date, to_date=None):
    """Count total, working and weekend days in ``[from_date, to_date)``.

    Parameters
    ----------
    from_date : timestamp-like
        First day of the interval (inclusive).
    to_date : timestamp-like, optional
        End of the interval (exclusive).  When omitted, one calendar month
        after ``from_date`` is used.

    Returns
    -------
    tuple
        ``(total_days, working_days, weekend_days)`` where working days are
        the visited days whose ``weekday()`` is below 5 (Monday-Friday) and
        weekend days are the remainder of ``total_days``.
    """
    import pandas as pd

    end = from_date + pd.DateOffset(months=1) if to_date is None else to_date
    one_day = pd.DateOffset(days=1)
    weekdays = 0
    cursor = from_date
    # walk the interval one day at a time, tallying Monday-Friday
    while cursor < end:
        if cursor.weekday() < 5:
            weekdays += 1
        cursor = cursor + one_day
    span = (end - from_date).days
    return (span, weekdays, span - weekdays)
# calendar table: one row per month block present in ``sales``
dates = pd.DataFrame({"date_block_num": sales.date_block_num.unique()})
# block x maps to the first day of the month x months after January 2013
dates["date"] = dates["date_block_num"].apply(
    lambda x: pd.to_datetime("01/01/2013") + pd.DateOffset(months=x))
dates["year"] = dates.date.dt.year
dates["month"] = dates.date.dt.month
# per-month day counts, one (total, working, weekend) tuple per row
dates = pd.concat(
    [dates,
     pd.DataFrame.from_records(dates["date"].apply(working_days_count),
         columns=["total_days", "working_days", "weekend_days"])],
    axis=1)
dates = optimize_numeric_dtypes(dates)
# attach the calendar columns to every sales row of the same month block
sales = sales.merge(
    dates[["date_block_num", "year", "month", "total_days", "working_days", "weekend_days"]],
    how="inner", on="date_block_num")
del dates; gc.collect()
sales.head(3)

In [None]:
# pricing overall and inside cat
# (aggregations are named by string: passing the builtin ``sum`` to ``agg`` is deprecated)
item_pricing = sales.groupby(["date_block_num","item_id","item_category_id"], as_index=False).\
    agg(total_sold=("total_sold", "sum"), sum_revenue=("sum_revenue", "sum"))
# months in which an item was not sold give 0/0 = NaN here
item_pricing["average_price"] = item_pricing["sum_revenue"]/item_pricing["total_sold"]
# Fill the gaps per item: carry the last known price forward, then back-fill
# whatever is still missing at the start.  The groupby ``ffill``/``bfill``/
# ``shift``/``rank`` methods return results aligned with the original rows,
# whereas ``groupby.apply`` with a like-indexed result prepends the group keys
# to the index on pandas >= 2.0, which breaks the column assignment.
item_pricing["average_price"] = item_pricing.groupby("item_id")["average_price"].ffill()
item_pricing["average_price"] = item_pricing.groupby("item_id")["average_price"].bfill()
# add price changes: ratio to the previous row of the same item, 1 where there is none
item_pricing["price_change"] = (item_pricing["average_price"] /
    item_pricing.groupby("item_id")["average_price"].shift()).fillna(1)
# add position within the category: percentile rank among the category's items that month
item_pricing["price_perc"] = item_pricing.groupby(["item_category_id", "date_block_num"])\
    ["average_price"].rank(pct=True)
item_pricing = optimize_numeric_dtypes(item_pricing[["date_block_num", "item_id", "price_change",
    "price_perc"]].copy())
sales = sales.merge(item_pricing, how="left",
    on=["date_block_num", "item_id"])
del item_pricing; gc.collect()
sales.head(3)

In [None]:
# shop location: first space-separated token of the shop name -> city,
# second token -> city code (both stripped of non-alphanumeric characters)
shops["city_id"] = shops.shop_name.apply(lambda x:
    re.sub("[^A-ZА-Яa-zа-я0-9 ]","", x.split(" ")[0]))
shops["city_code_id"] = shops.shop_name.apply(lambda x:
    re.sub("[^A-ZА-Яa-zа-я0-9 ]","", x.split(" ")[1]))
# keep wrong codes, so city and code are not the same
# shops["city_code"][shops.city=="Якутск"] = "ТЦ"
# encode on a copy, since the encoder assigns columns in place
shops = optimize_categories(shops[["shop_id", "city_id", "city_code_id"]].copy())
sales = sales.merge(shops, how="inner", on="shop_id")
del shops; gc.collect()
# target: ``total_sold`` of the next row of the same item/shop pair, 0 where
# there is no next row.  ``groupby.shift`` keeps the result aligned with the
# rows of ``sales``; the equivalent ``groupby.apply(lambda x: x.shift(-1))``
# prepends the group keys to the index on pandas >= 2.0 and no longer assigns.
sales["target"] = sales.groupby(["item_id","shop_id"])["total_sold"].shift(-1).fillna(0)
sales.head(3)

In [None]:
# time-series

# smoothed demand across various cats
# NOTE(review): every group below is a single (key, month block) cell, so the
# exponentially weighted mean runs over the rows inside one month in row order,
# not over consecutive months -- confirm this is the intended smoothing.
demand_keys = {
    "city_demand": "city_id",           # city
    "shop_demand": "shop_id",           # shop
    "cat_demand": "item_category_id",   # category
    "item_demand": "item_id",           # item
}
for demand_col, key in demand_keys.items():
    # ``transform`` returns a result aligned with the rows of ``sales``;
    # ``groupby.apply`` with a like-indexed result prepends the group keys to
    # the index on pandas >= 2.0, which breaks the column assignment.
    # The first rows of each group (fewer than ``min_periods``) come out as
    # NaN and are replaced by 0.
    sales[demand_col] = sales.groupby([key, "date_block_num"])["total_sold"].transform(
        lambda x: x.ewm(span=3, min_periods=5).mean()).fillna(0).astype("float32")
sales = optimize_numeric_dtypes(sales)

In [None]:
# lagged sales per shop/item pair and the difference to the current value
lags = [1,2,3,6]
for lag in lags:
    lag_col = f"lag{lag}_total_sold"
    # value ``lag`` rows earlier within the same shop/item pair, 0 where missing
    lagged = sales.groupby(["shop_id", "item_id"])["total_sold"].shift(lag)
    sales[lag_col] = lagged.fillna(0)
    # change of the current value relative to the lagged one
    sales[f"lag{lag}_diff"] = sales["total_sold"] - sales[lag_col]

# Modeling pipeline

In [None]:
target = ["target"]
# boolean mask over ``sales.columns``: every column except the target is a feature
features = [c not in target for c in sales.columns]

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

# ``target`` is the next row's ``total_sold`` of the same item/shop pair, and
# the test rows sit in block 34 with ``total_sold`` filled with 0.  Rows of
# block ``last_block`` therefore only have zero placeholders as target, so
# scoring on them says nothing about the model.  Hold out the block before it
# instead: its target is the ``total_sold`` of block ``last_block``.
last_block = 33
valid_block = last_block - 1
train = sales[sales.date_block_num<valid_block]
test = sales[sales.date_block_num==valid_block]
gbm = RandomForestRegressor(n_jobs=4)
# pass the target as a 1-d Series; a one-column frame makes scikit-learn warn
# about a column-vector ``y``
gbm.fit(train.loc[:,features], train.loc[:,"target"])

# root mean squared error on the fitted blocks and on the held-out block
train_rsme = np.mean((gbm.predict(train.loc[:,features])-train.loc[:,"target"])**2)**(1/2)
test_rsme = np.mean((gbm.predict(test.loc[:,features])-test.loc[:,"target"])**2)**(1/2)

In [None]:
# svm

# mlp

# random forest

# lgbm

# tpot

In [None]:
# stack

In [None]:
# validation 

In [None]:
# refitting

# Submission