> Martin Fridrich, 03/2022 

# Final project


### Data loading and initial transformations

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from itertools import product
import re
from fuzzywuzzy import fuzz

In [2]:
DATA_DIR = "../data/final_project/"
# sales
sales_train = pd.read_csv(DATA_DIR+"sales_train.csv")
sales_test = pd.read_csv(DATA_DIR+"test.csv")
# addional info
items = pd.read_csv(DATA_DIR+"items.csv")
item_categories = pd.read_csv(DATA_DIR+"item_categories.csv")
shops = pd.read_csv(DATA_DIR+"shops.csv")

In [3]:
# downcast dataframe
def optimize_numeric_dtypes(df):
    float_cols = df.select_dtypes("float").columns
    int_cols = df.select_dtypes("integer").columns
    df.loc[:,float_cols] = df.loc[:, float_cols].\
        apply(pd.to_numeric, downcast="float")
    df.loc[:,int_cols] = df.loc[:, int_cols].\
        apply(pd.to_numeric, downcast="integer")
    return df

def optimize_categories(df):
    cat_cols = df.select_dtypes("object").columns
    df.loc[:,cat_cols] = df.loc[:,cat_cols].\
        apply(lambda x: x.astype("category").cat.codes)
    return df    

In [4]:
# duplicate shops
shop_duplicates_map = {0: 57, 1: 58, 11: 10, 40: 39}
sales_train["shop_id"] = sales_train["shop_id"].replace(shop_duplicates_map)
# datetime
sales_train["date"] = pd.to_datetime(sales_train["date"], format="%d.%m.%Y")
# remove first 2 months due to inconsistency
sales_train = sales_train[sales_train.date_block_num>1]
# filtering on test set
sales_train = sales_train.loc[sales_train.shop_id.isin(sales_test["shop_id"].unique()), :]
# Drop training items with extreme or negative prices or sales counts
sales_train = sales_train[((sales_train["item_price"] > 0) & (sales_train["item_price"] < 75000)) &
    ((sales_train["item_cnt_day"] > 0) & (sales_train["item_cnt_day"] < 750))]
# train cartesian product & inds
shop_item_month = product(sales_train.shop_id.unique(), sales_train.item_id.unique(),
    sales_train.date_block_num.unique())
sales_train_index = pd.DataFrame(list(shop_item_month),
    columns=["shop_id","item_id","date_block_num"])
# test inds
sales_test_index = sales_test[["shop_id","item_id"]].copy()
sales_test_index["date_block_num"] = 34
# inds
sales_index = pd.concat([sales_train_index, sales_test_index],
    ignore_index=True).reset_index(drop=True)
del sales_test, sales_train_index, sales_test_index
gc.collect()
# sales table
sales_train["revenue"] = sales_train["item_price"]*sales_train["item_cnt_day"]
sales = sales_train.groupby(["shop_id", "item_id", "date_block_num"], as_index=False).agg(
    total_sold=("item_cnt_day", sum),  sum_revenue=("revenue",sum),
    n_transactions = ("item_cnt_day",pd.Series.count))\
        .sort_values("date_block_num")
sales = sales_index.merge(sales, how="left",
    on=["shop_id", "item_id", "date_block_num"]).fillna(0)
sales = optimize_numeric_dtypes(sales)
del sales_train, sales_index; gc.collect()
sales.head(3) 

Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions
0,5,20175,2,1.0,849.0,1.0
1,5,20175,3,0.0,0.0,0.0
2,5,20175,4,0.0,0.0,0.0


# Feature engineering

In [5]:
# item groups based on adjacency and similarity
items.item_name = items.item_name.apply(lambda x: re.sub("[^A-ZА-Яa-zа-я0-9 ]","", x))
items.item_name = items.item_name.apply(lambda x: re.sub("\\s+"," ", x))
item_names = pd.concat([items.item_id, items.item_name,
    items.item_id.shift().fillna(0), items.item_name.shift().fillna("")], axis=1)
item_names.columns = ["item_id", "item_name", "previous_item_id", "previous_item_name"]
item_names["similarity"] = item_names.apply(\
     lambda x: fuzz.ratio(x["item_name"],x["previous_item_name"]), axis=1)
item_names["group_start"] = item_names["similarity"] <= 75
item_group_map = item_names.loc[item_names.group_start,["item_id"]]
item_group_map.columns = ["group_start"]
item_group_map["group_end"] = item_group_map["group_start"].shift(-1).\
    fillna(item_group_map["group_start"].max()+1).astype("int")
item_group_map["item_sim_id"] = list(range(len(item_group_map)))
# remap
items = items.merge(item_group_map, how="left",
    left_on=["item_id"], right_on=["group_start"])
items = items.sort_values("item_id")
items["item_sim_id"] = items["item_sim_id"].fillna(method="ffill").astype("int")
del item_names, item_group_map; gc.collect()
items = items[["item_id","item_name", "item_category_id", "item_sim_id"]]
# name length
items["item_name_len"] = items.item_name.apply(len).astype("int16")
items.head(3)

Unnamed: 0,item_id,item_name,item_category_id,item_sim_id,item_name_len
0,0,ВО ВЛАСТИ НАВАЖДЕНИЯ ПЛАСТ D,40,0,29
1,1,ABBYY FineReader 12 Professional Edition Full ...,76,1,64
2,2,В ЛУЧАХ СЛАВЫ UNV D,40,2,19


In [6]:
# categories
cat_splits = item_categories.item_category_name.apply(lambda x: x.split(" - "))
item_categories = pd.concat([item_categories,
    pd.DataFrame([c if len(c)>1 else [c[0],""] for c in cat_splits],
        columns=["parent_cat","child_cat"])], axis=1)
items = items.merge(item_categories, how="inner")
# return categorical encoding
items = optimize_categories(items[["item_id", "item_category_id", "item_sim_id",
    "item_name_len", "parent_cat", "child_cat"]])
# add categories
sales = sales.merge(items, how="inner", on="item_id")
del item_categories, items; gc.collect()
sales.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,child_cat
0,5,20175,2,1.0,849.0,1.0,69,13467,28,14,55
1,5,20175,3,0.0,0.0,0.0,69,13467,28,14,55
2,5,20175,4,0.0,0.0,0.0,69,13467,28,14,55


In [7]:
# date features
def working_days_count(from_date, to_date=None):
    import pandas as pd
    if to_date is None:
        to_date=from_date+pd.DateOffset(months=1)
    temp_date=from_date
    cnt=0
    while temp_date<to_date:
        if temp_date.weekday()<5:
            cnt+=1
        temp_date+=pd.DateOffset(days=1)
    return ((to_date-from_date).days, cnt, (to_date-from_date).days-cnt)
dates = pd.DataFrame(sales.date_block_num.unique(), columns=["date_block_num"])
dates["date"] = dates["date_block_num"].apply(lambda x: pd.to_datetime("01/01/2013")+\
    pd.DateOffset(months=x))
dates["year"] = dates.date.dt.year
dates["month"] = dates.date.dt.month
dates = optimize_numeric_dtypes(pd.concat([dates, pd.DataFrame.from_records(
        dates["date"].apply(lambda x: working_days_count(x)),
    columns=["total_days", "working_days", "weekend_days"])], axis=1))
sales = sales.merge(dates[["date_block_num","year","month","total_days", "working_days",
    "weekend_days"]], how="inner", on="date_block_num")
del dates; gc.collect()
sales.head(3)    

Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,child_cat,year,month,total_days,working_days,weekend_days
0,5,20175,2,1.0,849.0,1.0,69,13467,28,14,55,2013,3,31,21,10
1,2,20175,2,0.0,0.0,0.0,69,13467,28,14,55,2013,3,31,21,10
2,41,20175,2,0.0,0.0,0.0,69,13467,28,14,55,2013,3,31,21,10


In [8]:
# pricing overall and inside cat
item_pricing = sales.groupby(["date_block_num","item_id","item_category_id"], as_index=False).\
    agg(total_sold=("total_sold", sum),  sum_revenue=("sum_revenue",sum))
item_pricing.loc[:,"average_price"] = item_pricing["sum_revenue"]/item_pricing["total_sold"]
item_pricing.loc[:,"average_price"]  = item_pricing.groupby(["item_id"])["average_price"].\
    apply(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
# add price changes
item_pricing.loc[:,"price_change"] = item_pricing.groupby(["item_id"])["average_price"].\
    apply(lambda x: x/x.shift()).fillna(1)
# add position within the category
item_pricing.loc[:,"price_perc"] = item_pricing.groupby(["item_category_id", "date_block_num"])\
    ["average_price"].apply(lambda x: x.rank(pct=True)).fillna(1)
item_pricing = optimize_numeric_dtypes(item_pricing[["date_block_num", "item_id", "price_change",
    "price_perc"]])
sales = sales.merge(item_pricing, how="left",
    on=["date_block_num", "item_id"])
del item_pricing; gc.collect()
sales.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,child_cat,year,month,total_days,working_days,weekend_days,price_change,price_perc
0,5,20175,2,1.0,849.0,1.0,69,13467,28,14,55,2013,3,31,21,10,1.0,0.452459
1,2,20175,2,0.0,0.0,0.0,69,13467,28,14,55,2013,3,31,21,10,1.0,0.452459
2,41,20175,2,0.0,0.0,0.0,69,13467,28,14,55,2013,3,31,21,10,1.0,0.452459


In [9]:
# shop location
shops["city_id"] = shops.shop_name.apply(lambda x:\
    re.sub("[^A-ZА-Яa-zа-я0-9 ]","", x.split(" ")[0]))
shops["city_code_id"] = shops.shop_name.apply(lambda x:\
    re.sub("[^A-ZА-Яa-zа-я0-9 ]","", x.split(" ")[1]))
# keep wrong codes, so city and code are not the same
# shops["city_code"][shops.city=="Якутск"] = "ТЦ"
shops = optimize_categories(shops[["shop_id", "city_id", "city_code_id"]])
sales = sales.merge(shops, how="inner", on="shop_id")
del shops; gc.collect()
sales.loc[:,"target"] = sales.groupby(["item_id","shop_id"])["total_sold"].\
    apply(lambda x: x.shift(-1)).fillna(0)
sales.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,...,year,month,total_days,working_days,weekend_days,price_change,price_perc,city_id,city_code_id,target
0,5,20175,2,1.0,849.0,1.0,69,13467,28,14,...,2013,3,31,21,10,1.0,0.452459,3,8,0.0
1,5,20083,2,2.0,298.0,2.0,40,13421,26,11,...,2013,3,31,21,10,1.0,0.331605,3,8,0.0
2,5,31,2,1.0,699.0,1.0,37,27,26,11,...,2013,3,31,21,10,1.0,0.841561,3,8,0.0


In [10]:
# time-series
# smoothed demand across various cats
# city
sales["city_demand"] = sales.groupby(["city_id","date_block_num"])["total_sold"].apply(lambda x:
    x.ewm(span=3, min_periods=5).mean()).fillna(0).astype("float32")
# shop
sales["shop_demand"] = sales.groupby(["shop_id","date_block_num"])["total_sold"].apply(lambda x:
    x.ewm(span=3, min_periods=5).mean()).fillna(0).astype("float32")
# category
sales["cat_demand"] = sales.groupby(["item_category_id","date_block_num"])["total_sold"].apply(lambda x:
    x.ewm(span=3, min_periods=5).mean()).fillna(0).astype("float32")
# item
sales["item_demand"] = sales.groupby(["item_id","date_block_num"])["total_sold"].apply(lambda x:
    x.ewm(span=3, min_periods=5).mean()).fillna(0).astype("float32")
sales = optimize_numeric_dtypes(sales)

In [12]:
lags = [1,2,3,6]
for l in lags:
    sales.loc[:,"lag"+str(l)+"_total_sold"] = sales.groupby([sales.shop_id,sales.item_id])\
        ["total_sold"].shift(l).fillna(0)
    sales.loc[:,"lag"+str(l)+"_diff"] = sales.loc[:,"total_sold"]-\
        sales.loc[:,"lag"+str(l)+"_total_sold"]

In [13]:
# push-down processed data
sales.to_csv(DATA_DIR+"features.zip")
del sales; gc.collect()

# Modeling pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, PolynomialFeatures
from category_encoders import TargetEncoder,PolynomialEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import IncrementalPCA
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression,ElasticNetCV,ElasticNet,BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [7]:
col_types = {'shop_id': 'int8', 'item_id': 'int16', 'date_block_num': 'int8',
 'total_sold': 'float32', 'sum_revenue': 'float32', 'n_transactions': 'int32',
 'item_category_id': 'int8', 'item_sim_id': 'int16', 'item_name_len': 'int16',
 'parent_cat': 'int8', 'child_cat': 'int8', 'year': 'int16', 'month': 'int8',
 'total_days': 'int8', 'working_days': 'int8', 'weekend_days': 'int8',
 'price_change': 'float32', 'price_perc': 'float32', 'city_id': 'int8',
 'city_code_id': 'int8', 'target': 'float32','city_demand': 'float32',
 'shop_demand': 'float32', 'cat_demand': 'float32', 'item_demand': 'float32',
 'lag1_total_sold': 'float32', 'lag1_diff': 'float32', 'lag2_total_sold': 'float32',
 'lag2_diff': 'float32', 'lag3_total_sold': 'float32', 'lag3_diff': 'float32',
 'lag6_total_sold': 'float32','lag6_diff': 'float32'}

sales = pd.read_csv(DATA_DIR+"features.zip",
        dtype=col_types, usecols=col_types.keys()).sample(frac=.25)
sales["price_perc"] = sales.price_perc.fillna(1)
sales.head(3)

Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,...,cat_demand,item_demand,lag1_total_sold,lag1_diff,lag2_total_sold,lag2_diff,lag3_total_sold,lag3_diff,lag6_total_sold,lag6_diff
10364384,50,19391,25,0.0,0.0,0,31,12944,45,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7290734,53,16479,4,0.0,0.0,0,55,10974,38,13,...,0.03131685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13507203,15,18309,18,0.0,0.0,0,55,12188,42,13,...,2.524355e-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# filters
b_ind = (sales.year==2015) & (sales.month==9) # 2nd stage
#c_ind = (sales.year==2015) & (sales.month==9) # eval
test_ind = (sales.year==2015) & (sales.month==10) # prediction
a_ind = ~b_ind & ~test_ind # 1st stage

target = "target"
features = [c not in [target] for c in sales.columns]
cat_cols = ["shop_id", "item_id", "item_sim_id", "parent_cat",
    "child_cat", "year", "month", "city_id", "city_code_id"]
num_cols = [f for f in sales.columns[features] if (f not in cat_cols) & (f not in [target])]

# target clip
sales.loc[:,target] = sales.loc[:,target].clip(0,20)

## First stage models

In [9]:
# base enr pipe
enr_model = Pipeline(steps=[("ct",ColumnTransformer(transformers=[("se", StandardScaler(), num_cols),
    ("te", TargetEncoder(cols=cat_cols), cat_cols)])),
    ("qt", QuantileTransformer()),
    ("pca", IncrementalPCA()),
    ("enr", ElasticNet())])
# try randomized search
params = {"ct__te__min_samples_leaf":randint(7.5*10**3,7.5*10**4),
    "ct__te__smoothing":[1.0,2.0,5.0,50.0,100.0,200.0,500.0],
    "qt__n_quantiles":randint(10**2,10**3),"pca__n_components":randint(5,20),
    "enr__alpha":uniform(0.0001,.9999), "enr__l1_ratio":uniform(0.1,.9)}
enr_hyperopt = RandomizedSearchCV(enr_model, param_distributions=params,
    scoring="neg_mean_squared_error", n_iter=10, n_jobs=4, cv=4, random_state=1)
enr_hyperopt.fit(sales.loc[a_ind,features], sales.loc[a_ind,target])
# pushout preds
enr_b_feature = enr_hyperopt.predict(sales.loc[b_ind,features])
#enr_c_feature = enr_hyperopt.predict(sales.loc[c_ind,features])
enr_test_feature = enr_hyperopt.predict(sales.loc[test_ind,features])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [10]:
# base mlp pipe
mlp_model = Pipeline(steps=[("ct",ColumnTransformer(transformers=[("se", StandardScaler(), num_cols),
    ("te", TargetEncoder(cols=cat_cols), cat_cols)])),
    ("qt", QuantileTransformer()),
    #("pca", IncrementalPCA(n_components=25)),
    ("mlp", MLPRegressor(early_stopping=True))])

# try randomized search
params = {"ct__te__min_samples_leaf":randint(7.5*10**3,7.5*10**5),
    "ct__te__smoothing":[1.0,2.0,5.0,50.0,100.0,200.0,500.0],
    "qt__n_quantiles":randint(10**3,10**5),
    "mlp__hidden_layer_sizes":[(32),(64,),(128,), (32,8),(64,16),(128,32)],
    "mlp__alpha":10**np.linspace(-3,3,10), "mlp__max_iter":[100,250,500]}
mlp_hyperopt = RandomizedSearchCV(mlp_model, param_distributions=params,
    scoring="neg_mean_squared_error", n_iter=10, n_jobs=4, cv=4, random_state=1)
mlp_hyperopt.fit(sales.loc[a_ind,features], sales.loc[a_ind,target])
# pushout preds
mlp_b_feature = mlp_hyperopt.predict(sales.loc[b_ind,features])
#mlp_c_feature = mlp_hyperopt.predict(sales.loc[c_ind,features])
mlp_test_feature = mlp_hyperopt.predict(sales.loc[test_ind,features])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [14]:
# lightgbm
params = {"learning_rate":[0.0001, 0.001, 0.01, 0.1],"max_depth":randint(1,50),
    "n_estimators":[50,100,150,200,500,750,1000], "subsample":uniform(0.1,.8), "num_leaves":randint(10,100)}
lgbm_hyperopt = RandomizedSearchCV(LGBMRegressor(), param_distributions=params,
    scoring="neg_mean_squared_error", n_iter=10, n_jobs=1, cv=4, random_state=1)
lgbm_hyperopt.fit(sales.loc[a_ind,features], sales.loc[a_ind,target])
# pushout preds
lgbm_b_feature = lgbm_hyperopt.predict(sales.loc[b_ind,features])
#lgbm_c_feature = lgbm_hyperopt.predict(sales.loc[c_ind,features])
lgbm_test_feature = lgbm_hyperopt.predict(sales.loc[test_ind,features])

In [67]:
lgbm_hyperopt.best_params_

{'learning_rate': 0.2,
 'max_depth': 8,
 'n_estimators': 100,
 'num_leaves': 16,
 'subsample': 0.8484312566448254}

In [15]:
# xgboost
params = {"eta":[0.0001, 0.001, 0.01, 0.1],"max_depth":randint(1,50),
    "n_estimators":[50,100,150,200,500,750,1000], "subsample":uniform(0.1,.8)}
xgb_hyperopt = RandomizedSearchCV(XGBRegressor(), param_distributions=params,
    scoring="neg_mean_squared_error", n_iter=10, n_jobs=1, cv=4, random_state=1)
xgb_hyperopt.fit(sales.loc[a_ind,features], sales.loc[a_ind,target])
# pushout preds
xgb_b_feature = xgb_hyperopt.predict(sales.loc[b_ind,features])
#xgb_c_feature = xgb_hyperopt.predict(sales.loc[c_ind,features])
xgb_test_feature = xgb_hyperopt.predict(sales.loc[test_ind,features])

# Second stage models

In [17]:
b_features = np.stack([enr_b_feature, mlp_b_feature, lgbm_b_feature], axis=1)
#c_features = np.stack([enr_c_feature, mlp_c_feature, lgbm_c_feature], axis=1)
test_features = np.stack([enr_test_feature, mlp_test_feature, lgbm_test_feature], axis=1)

lr_model = LinearRegression()
lr_model.fit(b_features, sales.loc[b_ind,"target"])

train_rsme = np.mean((lr_model.predict(b_features).clip(0,20)-sales.loc[b_ind,"target"])**2)**(1/2)
val_rsme = 0 #val_rsme = np.mean((lr_model.predict(c_features).clip(0,20)-sales.loc[c_ind,"target"])**2)**(1/2)
print("Stacked pipeline> RMSE on train set: {}; on val set: {}".format(train_rsme, val_rsme))

Stacked pipeline> RMSE on train set: 3.094548671883897; on val set: 2.10791487359515


In [24]:
pd.DataFrame(test_features).mean(axis=1).describe()

count    42906.000000
mean         0.092307
std          0.279840
min          0.006881
25%          0.017073
50%          0.033405
75%          0.063045
max         12.038160
dtype: float64

# Submission

In [69]:
# train using one model
LGBMRegressor(n_estimators=1000, learning_rate=0.01, subsample=.5, num_leaves=20).\
    fit(sales.loc[sales.date_block_num<33, features], sales.loc[sales.date_block_num<33, target])

LGBMRegressor(learning_rate=0.01, n_estimators=1000, num_leaves=20,
              objective='regression', subsample=0.5)

In [75]:
test_data

Unnamed: 0,item_id,shop_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,...,cat_demand,item_demand,lag1_total_sold,lag1_diff,lag2_total_sold,lag2_diff,lag3_total_sold,lag3_diff,lag6_total_sold,lag6_diff
0,5037,5,,,,,,,,,...,,,,,,,,,,
1,5320,5,,,,,,,,,...,,,,,,,,,,
2,5233,5,,,,,,,,,...,,,,,,,,,,
3,5232,5,,,,,,,,,...,,,,,,,,,,
4,5268,5,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,18454,45,,,,,,,,,...,,,,,,,,,,
214196,16188,45,,,,,,,,,...,,,,,,,,,,
214197,15757,45,,,,,,,,,...,,,,,,,,,,
214198,19648,45,,,,,,,,,...,,,,,,,,,,


In [80]:
test_mapping = pd.read_csv(DATA_DIR+"test.csv")
test_data = test_mapping[["item_id","shop_id"]].merge(sales.loc[sales.date_block_num==33, features],
    on=["item_id", "shop_id"], how="left")
test_data = pd.concat([sales.loc[sales.date_block_num<33, features], test_data])
meh = test_data.groupby(["shop_id","item_id"]).ffill()

In [73]:
test_data

Unnamed: 0,shop_id,item_id,date_block_num,total_sold,sum_revenue,n_transactions,item_category_id,item_sim_id,item_name_len,parent_cat,...,cat_demand,item_demand,lag1_total_sold,lag1_diff,lag2_total_sold,lag2_diff,lag3_total_sold,lag3_diff,lag6_total_sold,lag6_diff
10364384,50,19391,25.0,0.0,0.0,0.0,31.0,12944.0,45.0,8.0,...,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7290734,53,16479,4.0,0.0,0.0,0.0,55.0,10974.0,38.0,13.0,...,3.131685e-02,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13507203,15,18309,18.0,0.0,0.0,0.0,55.0,12188.0,42.0,13.0,...,2.524355e-29,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
764661,2,8717,7.0,0.0,0.0,0.0,40.0,5758.0,11.0,11.0,...,3.469447e-18,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3521453,37,1031,13.0,0.0,0.0,0.0,67.0,680.0,28.0,14.0,...,0.000000e+00,0.253968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,45,18454,,,,,,,,,...,,,,,,,,,,
214196,45,16188,,,,,,,,,...,,,,,,,,,,
214197,45,15757,,,,,,,,,...,,,,,,,,,,
214198,45,19648,,,,,,,,,...,,,,,,,,,,


In [18]:
test_mapping = pd.read_csv(DATA_DIR+"test.csv")
test_predictions = sales.loc[test_ind,["item_id", "shop_id"]]
test_predictions["item_cnt_month"] = lr_model.predict(test_features).clip(0,20)
test_predictions = test_mapping.merge(test_predictions, on=["item_id", "shop_id"],how="left").fillna(0)
test_predictions[["ID", "item_cnt_month"]].to_csv("../data/submissions/submission.csv", index=False)
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f ../data/submissions/submission.csv -m "Stacked solution - 1st try"

401 - Unauthorized


In [66]:
test_mapping.shape

(214200, 3)

In [64]:
(sales.date_block_num==33).sum()

42906

In [55]:
test_predictions.isnull().sum()

ID                     0
shop_id                0
item_id                0
item_cnt_month    204324
dtype: int64