In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [2]:
df_train = pd.read_csv(
    'inputs/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    "inputs/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    "inputs/items.csv",
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))

In [3]:
df_stores = pd.read_csv('inputs/stores.csv')
### categorical features
store_nbr_list = []
for i in df_2017.index.values[:].tolist():
    store_nbr_list.append(i[0])

store_nbr_df = pd.DataFrame(store_nbr_list, columns = ['store_nbr'])
cat_vars = store_nbr_df.merge(df_stores, how='left', left_on='store_nbr', right_on = 'store_nbr'
                            )[['store_nbr', 'type', 'city', 'state', 'cluster']] 

cleanup_type = {"type": {"A": 1, "B": 2, "C": 3, "D": 4, "E":5},
               "city": {"Quito": 1, "Guayaquil": 2, "Cuenca": 3, "Santo Domingo": 4, "Manta": 5, "Ambato": 6,
                       "Machala": 7, "Latacunga": 8, "Daule": 9, "Loja": 10, "Salinas": 11, "Esmeraldas": 12, 
                       "Cayambe": 13, "Libertad": 14, "Babahoyo": 15, "Puyo": 16, "Ibarra": 17, "Quevedo": 18,
                       "Guaranda": 19, "Playas": 20, "Riobamba": 21, "El Carmen": 22},
               "state": {"Pichincha": 1, "Guayas": 2, "Azuay": 3, "Manabi": 4, "Santo Domingo de los Tsachilas": 5,
                        "Tungurahua": 6, "El Oro": 7, "Los Rios": 8, "Cotopaxi": 9, "Loja": 10, "Santa Elena": 11,
                        "Esmeraldas": 12, "Pastaza": 13, "Imbabura": 14, "Bolivar": 15, "Chimborazo": 16}}
cat_vars.replace(cleanup_type, inplace=True)



In [4]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    ### original feature list 
    X = pd.DataFrame({
        "mean_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "std_7_2017": get_timespan(df_2017, t2017, 7, 7).std(axis=1).values,
        "std_14_2017": get_timespan(df_2017, t2017, 14, 14).std(axis=1).values,
        "std_30_2017": get_timespan(df_2017, t2017, 30, 30).std(axis=1).values,
        "std_60_2017": get_timespan(df_2017, t2017, 60, 60).std(axis=1).values,
        "std_140_2017": get_timespan(df_2017, t2017, 140, 140).std(axis=1).values,    
        #"median_7_2017": get_timespan(df_2017, t2017, 7, 7).median(axis=1).values,
        #"median_14_2017": get_timespan(df_2017, t2017, 14, 14).median(axis=1).values,
        #"median_30_2017": get_timespan(df_2017, t2017, 30, 30).median(axis=1).values,
        #"median_60_2017": get_timespan(df_2017, t2017, 60, 60).median(axis=1).values,
        #"median_140_2017": get_timespan(df_2017, t2017, 140, 140).median(axis=1).values,   
        "sem_7_2017": get_timespan(df_2017, t2017, 7, 7).sem(axis=1).values,
        "sem_14_2017": get_timespan(df_2017, t2017, 14, 14).sem(axis=1).values,
        "sem_30_2017": get_timespan(df_2017, t2017, 30, 30).sem(axis=1).values,
        "sem_60_2017": get_timespan(df_2017, t2017, 60, 60).sem(axis=1).values,
        "sem_140_2017": get_timespan(df_2017, t2017, 140, 140).sem(axis=1).values,   
        "mad_7_2017": get_timespan(df_2017, t2017, 7, 7).mad(axis=1).values,
        "mad_14_2017": get_timespan(df_2017, t2017, 14, 14).mad(axis=1).values,
        "mad_30_2017": get_timespan(df_2017, t2017, 30, 30).mad(axis=1).values,
        "mad_60_2017": get_timespan(df_2017, t2017, 60, 60).mad(axis=1).values,
        "mad_140_2017": get_timespan(df_2017, t2017, 140, 140).mad(axis=1).values,   
        "skew_7_2017": get_timespan(df_2017, t2017, 7, 7).skew(axis=1).values,
        "skew_14_2017": get_timespan(df_2017, t2017, 14, 14).skew(axis=1).values,
        "skew_30_2017": get_timespan(df_2017, t2017, 30, 30).skew(axis=1).values,
        "skew_60_2017": get_timespan(df_2017, t2017, 60, 60).skew(axis=1).values,
        "skew_140_2017": get_timespan(df_2017, t2017, 140, 140).skew(axis=1).values,  
        "kurt_7_2017": get_timespan(df_2017, t2017, 7, 7).kurt(axis=1).values,
        "kurt_14_2017": get_timespan(df_2017, t2017, 14, 14).kurt(axis=1).values,
        "kurt_30_2017": get_timespan(df_2017, t2017, 30, 30).kurt(axis=1).values,
        "kurt_60_2017": get_timespan(df_2017, t2017, 60, 60).kurt(axis=1).values,
        "kurt_140_2017": get_timespan(df_2017, t2017, 140, 140).kurt(axis=1).values,     
        "promo_14_2017": np.log1p(get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values),
        "promo_60_2017": np.log1p(get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values),
        "promo_140_2017": np.log1p(get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values),
        "unpromo_16aftsum_2017":np.log1p((1-get_timespan(promo_2017, t2017+timedelta(16), 16, 16))
                                         .iloc[:,1:].sum(axis=1).values),
    })
    
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
        X['std_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').std(axis=1).values
        X['std_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').std(axis=1).values
        #X['median_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').median(axis=1).values
        #X['median_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').median(axis=1).values
        X['sem_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').sem(axis=1).values
        X['sem_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').sem(axis=1).values
        X['mad_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mad(axis=1).values
        X['mad_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mad(axis=1).values
        X['skew_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').skew(axis=1).values
        X['skew_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').skew(axis=1).values
        X['kurt_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').kurt(axis=1).values
        X['kurt_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').kurt(axis=1).values
    
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
        for j in [14,60,140]:
            X["aft_promo_{}{}".format(i,j)] = (promo_2017[
                t2017 + timedelta(days=i)]-1).values.astype(np.uint8)
            X["aft_promo_{}{}".format(i,j)] = np.log1p(X["aft_promo_{}{}".format(i,j)]*X['promo_{}_2017'.format(j)])
        if i == 15:
            X["bf_unpromo_{}".format(i)]=0
        else:
            X["bf_unpromo_{}".format(i)] = (1-get_timespan(
                    promo_2017, t2017+timedelta(16), 16-i, 16-i)
                                           ).iloc[:,1:].sum(axis=1).values / (15-i) * X['promo_{}'.format(i)]
    
    ## calculate row metrics vars 
    X = pd.concat([X, np.log1p((X == 0).sum(axis=1))], axis=1)  
    X.rename(columns={0: 'num_zeros'}, inplace=True)
    X_stat_vars = (X.transpose()).describe(percentiles = [.01, .05, .25, .5, .75, .95, .99]).transpose()
    X = pd.concat([X, X_stat_vars.iloc[0:,1:]], axis=1)
        
    ## attach categorical vars
    X = pd.concat([X, cat_vars], axis=1)
    
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


In [7]:
t2017 = date(2017, 5, 31)
delta = timedelta(days=7 * 8)
t2017 + delta

datetime.date(2017, 7, 26)

In [5]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(5):
    delta = timedelta(days=7 * i * 2)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [6]:
#list(X_train)
X_train.shape

(837575, 217)

In [6]:
print("Training and predicting models...")


#MAX_ROUNDS = 300
val_pred = []
test_pred = []
cate_vars = []

for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    model = CatBoostRegressor()
        
    model.fit(X_train, y_train[:, i])
        #cat_features=cate_vars)
    
    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
Step 11
Step 12
Step 13
Step 14
Step 15
Step 16
Validation mse: 0.35276250964


In [6]:
print("Training and predicting models...")


#MAX_ROUNDS = 1000
val_pred = []
test_pred = []
cate_vars = []

for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    model = CatBoostRegressor(iterations=1000, depth=10)
        
    model.fit(X_train, y_train[:, i])
        #cat_features=cate_vars)
    
    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

Training and predicting models...
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
Step 11
Step 12
Step 13
Step 14
Step 15
Step 16
Validation mse: 0.337119836866


In [7]:
print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('catboost_deep_pars.csv', float_format='%.6f', index=None)

Making submission...
