In LGB approach:

Scaling: Not needed, since tree-based models split on each feature separately and only the ordering of the values matters (splitting a feature that ranges over 1-1000 is equivalent to splitting one that ranges over 1-10).

Outliers (in the target variable): Being a boosted model, it is sensitive to outliers. Boosting focuses on the errors of the previous trees, and since outliers generate large errors the model will focus heavily on them. -> Log-transform the target variable, and in the input variables replace outliers with, for example, the average of the same day in past years or in the past week.

Null values -> Manually inferring a value is the best approach (e.g. convert them to "unknown" so that missing values are identified as a category of their own).

Zeros-> Ok

Categorical features -> Tree-based models can use their own built-in encoding (label or one-hot). It is best to encode manually; the decision should be based on the cardinality of the categories (e.g. > 15). Try target encoding.


Try target encoder
Try removing irrelevant features
Add TQDM to see progress

In [1]:
import os
import gc
import warnings

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV



# Notebook display and logging setup. Everything below mutates global state
# (IPython, warnings, pandas options, the logging module), so order matters.
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): this silences *all* warnings, including pandas deprecation
# warnings that point at real problems further down the notebook.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)
# register_matplotlib_converters()
# sns.set()

import logging
# Log line layout: timestamp, level, message.
formatter = '%(asctime)s  - %(levelname)s - %(message)s'
logging.basicConfig(format=formatter)
# Shared logger used by every cell and helper function in this notebook.
logger = logging.getLogger('logger')
logger.setLevel(logging.DEBUG)
logger.info(f'Setup complete')




2020-09-16 16:03:50,187  - INFO - Setup complete


In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the smallest fitting dtype.

    Integer columns are passed through ``pd.to_numeric(..., downcast="integer")``
    and float columns through ``pd.to_numeric(..., downcast="float")``.
    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, log the final memory footprint (in Mb) and the
            relative reduction.

    Returns:
        The same ``df`` object, so the function can be used with ``.pipe``.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Resolve both column groups up front, before any dtype is changed.
    downcast_plan = (
        ("integer", df.select_dtypes(include=["int"]).columns),
        ("float", df.select_dtypes(include=["float"]).columns),
    )
    for target_kind, column_names in downcast_plan:
        for name in column_names:
            df[name] = pd.to_numeric(df[name], downcast=target_kind)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if not verbose:
        return df

    reduction_pct = 100 * (mem_before - mem_after) / mem_before
    logger.info(
        "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
            mem_after, reduction_pct
        )
    )
    return df

In [9]:
def read_data():
    """Load the four competition CSV files from ``src/data/``.

    Each file is read with ``pd.read_csv`` and immediately passed through
    ``reduce_mem_usage``. The shape of every frame is logged.

    Returns:
        Tuple ``(sales, prices, calendar, submission)`` of DataFrames.
    """
    INPUT_DIR = "src/data/"

    def _load(file_name):
        # Read one CSV and shrink its numeric dtypes right away.
        return pd.read_csv(f"{INPUT_DIR}{file_name}").pipe(reduce_mem_usage)

    logger.info("Reading files...")

    calendar = _load("calendar.csv")
    prices = _load("sell_prices.csv")
    sales = _load("sales_train_evaluation.csv")
    submission = _load("sample_submission.csv")

    shape_messages = (
        f"sales shape:{sales.shape}",
        f"prices shape: {prices.shape}",
        f"calendar shape: {calendar.shape}",
        f"submission shape: {submission.shape}",
    )
    for message in shape_messages:
        logger.info(message)

    return sales, prices, calendar, submission

In [10]:
# Load the raw tables and derive the two problem-size constants used later on.
sales, prices, calendar, submission = read_data()

NUM_ITEMS = len(sales)  # 30490 -- one row per time series
DAYS_PRED = len(submission.columns) - 1  # 28 -- every submission column except "id"

for message in (
    f'There are {NUM_ITEMS} time series in the dataset.',
    f'We want to predict the next {DAYS_PRED} days.',
    'Complete',
):
    logger.info(message)

2020-09-16 16:08:20,469  - INFO - Reading files...
2020-09-16 16:08:20,489  - INFO - Mem. usage decreased to  0.12 Mb (41.9% reduction)
2020-09-16 16:08:22,419  - INFO - Mem. usage decreased to 143.53 Mb (31.2% reduction)
2020-09-16 16:10:20,648  - INFO - Mem. usage decreased to 95.61 Mb (78.9% reduction)
2020-09-16 16:10:20,810  - INFO - Mem. usage decreased to  2.09 Mb (84.5% reduction)
2020-09-16 16:10:20,810  - INFO - sales shape:(30490, 1947)
2020-09-16 16:10:20,811  - INFO - prices shape: (6841121, 4)
2020-09-16 16:10:20,811  - INFO - calendar shape: (1969, 14)
2020-09-16 16:10:20,812  - INFO - submission shape: (60980, 29)
2020-09-16 16:10:20,813  - INFO - There are 30490 time series in the dataset.
2020-09-16 16:10:20,814  - INFO - We want to predict the next 28 days.
2020-09-16 16:10:20,814  - INFO - Complete


In [11]:
# These are categorical id-like variables, so a label encoder is used.
def encode_categorical(df, cols):
    """Label-encode the given columns of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted per column on its non-null values only.
    Rows that were null receive no code and therefore stay NaN.

    Args:
        df: DataFrame holding the columns to encode. It is modified in place.
        cols: Iterable of column names to encode.

    Returns:
        The same ``df`` object.
    """
    for name in cols:
        present = df[name].notnull()
        observed = df[name][present]
        codes = LabelEncoder().fit_transform(observed)
        # Re-attach the codes by index label so that null rows remain NaN.
        df[name] = pd.Series(codes, index=observed.index)

    return df


# Event columns are empty on ordinary days. Give those rows an explicit
# 'unknown' label so the encoder treats "no event" as a category of its own.
nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# Assign the filled columns back to the frame: calling
# `calendar[col].fillna(..., inplace=True)` works on a temporary Series and is
# not guaranteed to update `calendar` (it does not under pandas copy-on-write).
calendar[nan_features] = calendar[nan_features].fillna('unknown')

calendar = encode_categorical(calendar, nan_features).pipe(reduce_mem_usage)

sales = encode_categorical(
    sales, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(reduce_mem_usage)

# NOTE(review): `sales` and `prices` are encoded with independently fitted
# encoders; the codes only line up if both tables contain exactly the same set
# of item_id / store_id values -- confirm, since they are later used as merge keys.
prices = encode_categorical(
    prices, ["item_id", "store_id"]
).pipe(reduce_mem_usage)

logger.info("Features encoded succecefully...")

2020-09-16 16:10:20,823  - INFO - Features encoded succecefully...


In [12]:
def reshape_sales(sales, submission, start_day=0, verbose=True):
    """Reshape the wide sales table into one long frame of train + evaluation rows.

    The wide ``sales`` table (one ``d_<n>`` column per day) is melted to one row
    per (series, day). The "evaluation" rows of ``submission`` are relabelled as
    the days following d_1941, enriched with the id columns of ``sales`` and
    melted the same way. Both parts are concatenated so that every later
    transformation / feature-engineering step runs on a single frame.

    Args:
        sales: Wide sales table with the id columns and ``d_<n>`` day columns.
        submission: Sample submission; its ``id`` column plus one column per
            forecast day.
        start_day: Rows with day number below this value are discarded.
        verbose: When true, log progress (also forwarded to ``reduce_mem_usage``).

    Returns:
        Long DataFrame with the id columns, integer day ``d``, ``demand`` and
        ``part`` ("train" or "evaluation").
    """
    # Keep id columns and day (feature) columns apart.
    id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

    # Forecast horizon = number of submission columns besides "id". Deriving it
    # here avoids depending on a module-level constant.
    horizon = submission.shape[1] - 1
    evals_columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + horizon)]

    # In this second phase of the challenge only the evaluation rows are predicted.
    # Copy so that relabelling the columns below never touches `submission`.
    evals = submission[submission["id"].str.endswith("evaluation")].copy()

    # Auxiliary id table, then melt (wide -> long, the inverse of pivot()).
    product = sales[id_columns]
    sales = sales.melt(id_vars=id_columns, var_name="d", value_name="demand").pipe(
        reduce_mem_usage, verbose=verbose
    )

    # Build the frame that will receive the predictions.
    evals.columns = evals_columns
    evals = evals.merge(product, how="left", on="id")
    evals = evals.melt(id_vars=id_columns, var_name="d", value_name="demand")

    # Tag training rows and rows to predict.
    sales["part"] = "train"
    evals["part"] = "evaluation"

    # Stack both parts so all transformations are applied to them consistently.
    data = pd.concat([sales, evals], axis=0)

    if verbose:
        logger.info('Dataframe created')
    del sales, evals

    # Turn "d_<n>" into the integer n so the frame can be filtered and split by day.
    data["d"] = data["d"].str.slice(2,).astype(np.int16)
    data = data[data["d"] >= start_day]

    gc.collect()

    return data


def merge_calendar(data, calendar):
    """Left-join the calendar onto ``data`` by day number ``d``.

    The ``weekday``, ``wday``, ``month`` and ``year`` calendar columns are left
    out of the join. ``calendar`` itself is not modified.
    """
    unused_columns = ["weekday", "wday", "month", "year"]
    slim_calendar = calendar.drop(columns=unused_columns)
    return data.merge(slim_calendar, on="d", how="left")

def merge_prices(data, prices):
    """Left-join weekly sell prices onto ``data`` by store, item and week id."""
    price_keys = ["store_id", "item_id", "wm_yr_wk"]
    return data.merge(prices, on=price_keys, how="left")

In [13]:
# Only the last two years are kept; using the whole history makes the frame too large.
LAST_TRAIN_DAY = 1941
HISTORY_DAYS = 2 * 52 * 7 + 1
data = reshape_sales(sales, submission, start_day=LAST_TRAIN_DAY - HISTORY_DAYS)
del sales

# Calendar days come as "d_<n>"; keep only the integer so they match data["d"].
calendar["d"] = calendar["d"].str[2:].astype(np.int16)
data = merge_calendar(data, calendar)
logger.info('Calendar merged.')
del calendar

data = merge_prices(data, prices)
logger.info('Prices merged.')
del prices
gc.collect()

data = data.pipe(reduce_mem_usage)
# data.head()
# data.tail()

2020-09-16 16:10:31,493  - INFO - Mem. usage decreased to 3273.49 Mb (0.0% reduction)
2020-09-16 16:10:49,697  - INFO - Dataframe created
2020-09-16 16:11:34,390  - INFO - Calendar merged.
2020-09-16 16:11:43,610  - INFO - Prices merged.


14

2020-09-16 16:11:47,698  - INFO - Mem. usage decreased to 2578.77 Mb (0.0% reduction)


In [19]:
# Checkpoint the merged long-format frame to disk (the row index is written as the first column).
data.to_csv('M5.csv')

In [None]:
#feature engineering 

def add_demand_features(df):
    """Add lagged and rolling-window demand features, computed per ``id``.

    Rows of each ``id`` must already be in chronological order, because lags and
    windows are taken over row positions inside each group.

    Columns added:
        * ``shift_t{7,14,28,56,364}``: demand of the same series that many rows
          earlier.
        * For each window ``w`` in 7, 14, 28, computed on the demand shifted by
          28 rows: ``shift_t28_rolling_std_t{w}``, ``shift_t28_rolling_mean_t{w}``,
          ``rolling_min_t{w}``, ``rolling_max_t{w}``, ``rolling_sum_t{w}``,
          ``rolling_skew_t{w}`` and ``rolling_kurt_t{w}``.

    Args:
        df: Long-format frame with at least ``id`` and ``demand`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object.
    """
    demand_by_id = df.groupby(["id"])["demand"]

    # Demand of the same weekday in past weeks. The built-in grouped shift gives
    # the same values as transform(lambda x: x.shift(n)) without calling a
    # Python function once per series.
    for lag in (7, 14, 28, 56, 364):
        df[f"shift_t{lag}"] = demand_by_id.shift(lag)
    logger.info("lag features done")
    gc.collect()

    # Basic statistics over 'weekly', 'biweekly' and 'monthly' windows, all taken
    # on the demand shifted by `diff` rows.
    diff = 28
    # (rolling statistic, column-name template) in the order the columns are created.
    rolling_stats = (
        ("std", "shift_t{diff}_rolling_std_t{window}"),
        ("mean", "shift_t{diff}_rolling_mean_t{window}"),
        ("min", "rolling_min_t{window}"),
        ("max", "rolling_max_t{window}"),
        ("sum", "rolling_sum_t{window}"),
        ("skew", "rolling_skew_t{window}"),
        ("kurt", "rolling_kurt_t{window}"),
    )
    for window in (7, 14, 28):
        for stat, template in rolling_stats:
            column = template.format(diff=diff, window=window)
            df[column] = demand_by_id.transform(
                lambda x, stat=stat, window=window: getattr(
                    x.shift(diff).rolling(window), stat
                )()
            )

    logger.info("rolling windows features done")

    # No expanding-window features are computed here, so nothing is logged for them.
    return df


def add_price_features(df):
    """Add sell-price derived features to ``df`` in place.

    Rows of each series must already be in chronological order; the shifted and
    rolling features work on row positions inside each group. The ``month`` and
    ``year`` columns must exist (they are used as grouping keys).

    Args:
        df: Frame with ``id``, ``store_id``, ``item_id``, ``sell_price``,
            ``month`` and ``year`` columns. It is modified in place.

    Returns:
        The same ``df`` object.
    """

    def per_id(fn):
        # Transform sell_price within each series id.
        return df.groupby(["id"])["sell_price"].transform(fn)

    def per_store_item(fn, *extra_keys):
        # Transform sell_price within each (store, item[, extra keys]) group.
        keys = ['store_id', 'item_id', *extra_keys]
        return df.groupby(keys)['sell_price'].transform(fn)

    # Relative change against the previous row's price.
    df["shift_price_t1"] = per_id(lambda s: s.shift(1))
    df["price_change_t1"] = (df["shift_price_t1"] - df["sell_price"]) / (
        df["shift_price_t1"]
    )

    # Relative change against the maximum price of the previous 365 rows.
    df["rolling_price_max_t365"] = per_id(lambda s: s.shift(1).rolling(365).max())
    df["price_change_t365"] = (df["rolling_price_max_t365"] - df["sell_price"]) / (
        df["rolling_price_max_t365"]
    )

    # Short- and medium-term price volatility.
    for span in (7, 30):
        df[f"rolling_price_std_t{span}"] = per_id(
            lambda s, span=span: s.rolling(span).std()
        )

    # Whole-period price statistics per store/item.
    for stat in ("max", "min", "std", "mean"):
        df[f"price_{stat}"] = per_store_item(stat)

    # NOTE(review): the next three features group by item_id only, so rows of the
    # same item from different stores share one group -- confirm this is intended.
    item_prices = df.groupby(["item_id"])["sell_price"]

    # Percentage change between the current and the prior element.
    df["sell_price_rel_diff"] = item_prices.pct_change()

    # Rolling std of prices.
    df["sell_price_roll_sd7"] = item_prices.transform(lambda s: s.rolling(7).std())

    # Position of the current price inside the running [min, max] range.
    running_min = item_prices.cummin()
    running_max = item_prices.cummax()
    df["sell_price_cumrel"] = (item_prices.shift(0) - running_min) / (
        1 + running_max - running_min
    )

    # Some items can be inflation dependent and some items are very "stable".
    df['price_nunique'] = per_store_item('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

    # Price relative to the previous row, the monthly mean and the yearly mean.
    df['price_momentum'] = df['sell_price'] / per_store_item(lambda s: s.shift(1))
    df['price_momentum_m'] = df['sell_price'] / per_store_item('mean', 'month')
    df['price_momentum_y'] = df['sell_price'] / per_store_item('mean', 'year')

    logger.info('Prices done')
    return df


def add_time_features(df, dt_col, no_event_values=("None", "unknown")):
    """Add calendar features derived from the date column ``dt_col`` (in place).

    Columns added: ``year``, ``quarter``, ``month``, ``week``, ``day``,
    ``dayofweek``, the ``is_*_start`` / ``is_*_end`` flags, ``dayofyear``,
    ``weekofyear``, ``is_weekend``, ``has_event``, the weekday embedding
    ``wd1``..``wd4`` and cyclic encodings ``cos_*`` / ``sin_*`` of month,
    day of year and week of year.

    Args:
        df: Frame with the date column and an ``event_name_1`` column. It is
            modified in place; ``dt_col`` is converted to datetime.
        dt_col: Name of the date column to derive the features from.
        no_event_values: Values of ``event_name_1`` that mean "no event"
            (missing values always count as "no event").

    Returns:
        The same ``df`` object.
    """
    df[dt_col] = pd.to_datetime(df[dt_col])
    dates = df[dt_col].dt
    # `.dt.week` / `.dt.weekofyear` are gone from current pandas; the ISO
    # calendar week is their replacement.
    iso_week = dates.isocalendar().week

    attrs = [
        "year", "quarter", "month", "week", "day", "dayofweek",
        "is_year_end", "is_year_start", "is_quarter_end", "is_quarter_start",
        "is_month_end", "is_month_start", "dayofyear", "weekofyear",
    ]
    # year and dayofyear (up to 366) do not fit in int8, whose maximum is 127.
    wide_attrs = {"year", "dayofyear"}

    for attr in attrs:
        values = iso_week if attr in ("week", "weekofyear") else getattr(dates, attr)
        df[attr] = values.astype(np.int16 if attr in wide_attrs else np.int8)

    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)

    # NOTE(review): if event_name_1 was label-encoded before this call, the
    # placeholder strings can no longer match and every non-null row is flagged;
    # pass the encoded placeholder code in `no_event_values` in that case.
    event_names = df["event_name_1"]
    df["has_event"] = (
        event_names.notna() & ~event_names.isin(list(no_event_values))
    ).astype(np.int8)

    # Embedded weekday vectors, keyed by pandas' dayofweek (Monday=0 ... Sunday=6).
    # dayofweek holds integers, so it cannot be matched against day names.
    weekday_embedding = {
        0: (0.2, 0.2, 0.5, -0.3),   # Monday
        1: (0.1, -1.0, 1.3, 0.9),   # Tuesday
        2: (-0.6, 0.5, 1.2, 0.7),   # Wednesday
        3: (0.9, 0.2, -0.1, 0.6),   # Thursday
        4: (0.4, 1.1, 0.3, -1.5),   # Friday
        5: (0.3, -0.2, 0.6, 0.0),   # Saturday
        6: (0.4, -0.3, 0.6, 0.1),   # Sunday
    }
    for position, column in enumerate(("wd1", "wd2", "wd3", "wd4")):
        component = {day: vector[position] for day, vector in weekday_embedding.items()}
        df[column] = df["dayofweek"].map(component)

    def circle_encode(df, col):
        # Map a periodic integer column onto the unit circle; the period is taken
        # to be the largest value present in the column.
        maxval = float(df[col].max())

        cosval = np.cos(2 * np.pi * df[col] / maxval)
        sinval = np.sin(2 * np.pi * df[col] / maxval)

        return cosval, sinval

    df['cos_month'], df['sin_month'] = circle_encode(df, 'month')
    df['cos_doy'], df['sin_doy'] = circle_encode(df, 'dayofyear')
    df['cos_woy'], df['sin_woy'] = circle_encode(df, 'weekofyear')

    logger.info('Time done')

    return df



## weather conditions, special events, "feature discovery"
def add_original_features(df):
    """Add transforms of the 28- and 56-row demand lags to ``df`` in place.

    For each of ``shift_t28`` and ``shift_t56`` this adds a ``log(x + 1)``
    column and a square-root column, and then the difference of the lag against
    its own value 7 rows earlier within the same ``id``.

    Args:
        df: Frame with ``id``, ``shift_t28`` and ``shift_t56`` columns.

    Returns:
        The same ``df`` object.
    """
    lag_columns = ("shift_t28", "shift_t56")

    for lag_col in lag_columns:
        lagged = df[lag_col]
        df[f"{lag_col}_log"] = np.log(lagged + 1)
        df[f"{lag_col}_sqrt"] = np.sqrt(lagged)

    for lag_col in lag_columns:
        df[f"{lag_col}_diff_t7"] = df.groupby('id')[lag_col].diff(7)

    logger.info('Others done')

    return df
    

In [None]:
# Run the feature-engineering steps in order, shrinking dtypes after each one.
dt_col = "date"
data = reduce_mem_usage(add_demand_features(data))
data = reduce_mem_usage(add_time_features(data, dt_col))
# Price features rely on chronological row order within each series.
data = data.sort_values("date")
data = reduce_mem_usage(add_price_features(data))
data = reduce_mem_usage(add_original_features(data))

In [None]:
drop_cols = ['id', 'date', 'part']
features = [col for col in data.columns if col not in drop_cols]

# 'd' only drives the split and 'demand' is the target, so neither may reach the
# model. Selecting the model columns directly avoids building frames with a
# duplicated 'd' column and dropping it again afterwards.
non_model_cols = ['d', 'demand']
model_features = [col for col in features if col not in non_model_cols]

is_train = (data["d"] < 1914)
is_valid = (data["d"] >= 1914) & (data["d"] < 1942)

is_private = (data["d"] >= 1942)
# Everything from day 1914 on that is not private: the same rows as is_valid.
is_public = ~(data["d"] < 1914) & ~(is_private)

day_col = ['d']

X_train = data.loc[is_train, model_features].reset_index(drop=True)
y_train = data.loc[is_train, "demand"].reset_index(drop=True)
X_valid = data.loc[is_valid, model_features].reset_index(drop=True)
y_valid = data.loc[is_valid, "demand"].reset_index(drop=True)

# del data
# gc.collect()

# Create template to insert predictions
id_date_pub = data.loc[is_public, ["id", "date"]].reset_index(drop=True)
id_date_pri = data.loc[is_private, ["id", "date"]].reset_index(drop=True)
X_test_pub = data.loc[is_public, model_features].reset_index(drop=True)
X_test_pri = data.loc[is_private, model_features].reset_index(drop=True)

# print("X_train shape:", X_train.shape)
# print("X_test_pub shape:", X_test_pub.shape)
# print("X_test_pri shape:", X_test_pri.shape)
# print("id_date_pub shape:", id_date_pub.shape)
# print("id_date_pri shape:", id_date_pri.shape)

In [None]:
kkk  # NOTE(review): undefined name -- presumably a deliberate NameError to stop "Run All" at this cell; confirm

Modelling

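- Since LightGBM uses histogram-based split finding and can use GOSS (Gradient-based One-Side Sampling, which samples the training rows used to evaluate splits), it is much faster than XGBoost. If there are no outliers it can reach almost the same results in about 1/5 of the time.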

- Exclusive Feature Bundling - We generally work with high-dimensional data. Such data have many features that are mutually exclusive, i.e. they never take nonzero values simultaneously. LightGBM safely identifies such features and bundles them into a single feature, reducing the complexity to O(#data * #bundle).

LGB

In [None]:
# https://neptune.ai/blog/lightgbm-parameters-guide

logger.info('Creating LGB Regressor...')

lgb_regressor = lgb.LGBMRegressor(silent=False)


# gbdt: 500 iter: 2.12, 2.08
# goss: 500 iter: 2.16, 2.09
# dart: 500 iter: 2.24, 2.14

# standard model early stops at 1000 iterations: 2.07, 2.08
# using target variable logged didn't improve


# 3 * 3 * 3 = 27 candidates, each fitted on 3 folds.
lgb_params = {"max_depth": [25, 50, 75],
              "learning_rate": [0.01, 0.05, 0.1],
              "num_leaves": [30, 40, 60],
              "n_estimators": [200],
              'objective': ['tweedie']
             }


logger.info('Generating grid for GSCV...')

# NOTE(review): cv=3 splits rows into folds without regard to time, so folds can
# train on days later than the ones they validate on -- consider TimeSeriesSplit.
lgb_grid = GridSearchCV(lgb_regressor,
                    lgb_params,
                    n_jobs=8,
                    scoring="neg_root_mean_squared_error",
                    verbose=5,
                    cv=3)

logger.info('Fitting the grid...')

lgb_grid.fit(X_train, y_train)

# GridSearchCV has no `grid_scores_` attribute; the per-candidate results are in
# `cv_results_`. Scores are negated RMSE, so values closer to zero are better.
print("grid scores")
print(pd.DataFrame(lgb_grid.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
])
print('best params')
print(lgb_grid.best_params_)
print('best score')
print(lgb_grid.best_score_)


logger.info('HP Tuning complete!')

# del X_train, y_train
# gc.collect()

In [None]:
kkk  # NOTE(review): undefined name -- presumably a deliberate NameError to stop "Run All" at this cell; confirm

In [None]:
def _item_dataset(frame, target):
    """Wrap a feature frame and its target in an lgb.Dataset with item_id as categorical."""
    return lgb.Dataset(
        frame,
        label=target,
        categorical_feature=["item_id"],
    )


train_set = _item_dataset(X_train, y_train)

# Held-out days, needed for early stopping.
valid_set = _item_dataset(X_valid, y_valid)

# Not set yet (LightGBM defaults apply): lambda_l1, lambda_l2, feature_fraction,
# bagging_fraction, bagging_freq, min_child_samples, learning_rate, min_data_in_leaf.
lgb_best_params = dict(
    num_leaves=20,
    boosting_type='gbdt',  # alternatives: 'dart', 'goss', 'rf'
    metric='rmse',
    objective='poisson',
    n_jobs=-1,
    seed=42,
)

# Filled by lgb.train with the metric history of every evaluation set.
evals_result = {}

logger.info('Training best LGBM model...')

# NOTE(review): `evals_result`, `early_stopping_rounds` and `verbose_eval` are
# keyword arguments of older LightGBM releases; newer releases expect callbacks
# instead -- verify against the installed LightGBM version.
lgb_best_model = lgb.train(
    params=lgb_best_params,
    train_set=train_set,
    valid_sets=[valid_set, train_set],
    valid_names=['eval', 'train'],
    evals_result=evals_result,
    num_boost_round=2000,
    early_stopping_rounds=50,
    verbose_eval=100,
)
logger.info('Best LGBM model train is complete!')


In [None]:
# Plot the 30 most important features of the trained LightGBM booster.
# (`model` is never defined in this notebook; the booster is `lgb_best_model`.)
importances = lgb_best_model.feature_importance()
names = lgb_best_model.feature_name()
# One unnamed column (label 0) of importances, indexed by feature name.
fi = pd.DataFrame(importances, names)
fi.sort_values(by=0, ascending=False).head(30).plot.barh(figsize=(5,10))

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# imp_type = "gain"
preds_pub = np.zeros(X_test_pub.shape[0])
preds_pri = np.zeros(X_test_pri.shape[0])

preds_pub = model.predict(X_test_pub)
preds_pri = model.predict(X_test_pri)


val_score = rmse(preds_pub, y_valid)
val_score

XGBoost

In [None]:
# Native XGBoost matrices. The training matrix must carry the training labels
# (y_train), not the validation ones.
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_valid, label=y_valid)

xgb_regressor = xgb.XGBRegressor()
# NOTE(review): this grid has 6 * 8 * 4 * 5 * 4 = 3840 candidates, each fitted
# on 3 folds -- consider a smaller grid or a randomized search.
xgb_params = {
     "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

# "rmse" is not a scikit-learn scorer name; the RMSE scorer is the negated
# "neg_root_mean_squared_error" (same one used for the LightGBM grid).
xgb_grid = GridSearchCV(xgb_regressor,
                    xgb_params,
                    n_jobs=-1,
                    scoring="neg_root_mean_squared_error",
                    cv=3)

xgb_grid.fit(X_train, y_train)

# GridSearchCV has no `grid_scores_`; per-candidate results are in `cv_results_`.
print(pd.DataFrame(xgb_grid.cv_results_)[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
])
print('best params')
print(xgb_grid.best_params_)
print('best score')
print(xgb_grid.best_score_)

In [None]:
# Reference parameter set for the native API. It is not passed to the estimator
# below. 'num_class' was removed: it only applies to multi-class objectives.
best_params =  {
    'booster': 'gbtree',
#     'gamma': ,
#     'lambda': ,
#     'alpha': ,
#     'max_leaves': ,
    'learning_rate': 0.3,
    'max_depth': 3,
#     'objective': ,
    'eval_metric': 'rmse',
    'seed': 42
}

logger.info("Creating and fitting model...")

# `num_round` and `verbose_eval` belong to the native xgb.train API and are not
# estimator parameters: the number of rounds is `n_estimators` and the logging
# period is the `verbose` argument of fit().
# NOTE(review): older XGBoost releases take `early_stopping_rounds` in fit()
# rather than in the constructor -- verify against the installed version.
xgb_best_model = xgb.XGBRegressor(
                 verbosity=2,
                 colsample_bytree=0.4,
                 gamma=0,
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42,
                 early_stopping_rounds=50)

# Early stopping needs a validation set to monitor.
xgb_best_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=100,
)
logger.info("Complete!")


In [None]:
from collections import OrderedDict

# Feature scores of the fitted XGBoost model, highest first.
# (`model_xgb` is never defined in this notebook; the estimator is `xgb_best_model`.)
OrderedDict(sorted(xgb_best_model.get_booster().get_fscore().items(), key=lambda t: t[1], reverse=True))

CatBooster

In [None]:
# Grid of 3 * 3 * 3 = 27 CatBoost candidates, each fitted on 3 folds.
params_cb = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],
         'iterations': [300]}
regressor_cb = cb.CatBoostRegressor()
# "rmse" is not a scikit-learn scorer name; the RMSE scorer is the negated
# "neg_root_mean_squared_error" (same one used for the LightGBM grid).
grid_cb = GridSearchCV(regressor_cb, params_cb, scoring="neg_root_mean_squared_error", cv = 3)
grid_cb.fit(X_train, y_train)

In [None]:
def _pivot_forecasts(preds):
    """Pivot long (id, date, demand) rows to one row per id with columns F1..Fn in date order."""
    wide = preds.pivot(index="id", columns="date", values="demand").reset_index()
    horizon = wide.shape[1] - 1  # every column except "id"
    wide.columns = ["id"] + [f"F{step}" for step in range(1, horizon + 1)]
    return wide


def make_submission(test_pub, test_pri, submission, validation_end="2016-05-23"):
    """Assemble the submission table from public and private predictions.

    The public predictions are relabelled from "..._evaluation" to
    "..._validation" ids, both prediction sets are pivoted to one row per id
    with one ``F<k>`` column per forecast date (28 for the competition
    windows), and the rows are put in the id order of ``submission``.

    Args:
        test_pub: Long frame with ``id``, ``date`` and ``demand`` for the public
            (validation) window.
        test_pri: Long frame with the same columns for the private (evaluation)
            window.
        submission: Sample submission; only its ``id`` column is used.
        validation_end: Public rows dated on or after this value are ignored.

    Returns:
        DataFrame with the validation rows followed by the evaluation rows.
    """
    # Copy so that rewriting the ids never writes into a view of the caller's frame.
    preds_pub = test_pub[["id", "date", "demand"]].copy()
    preds_pub["id"] = preds_pub["id"].str.replace("_evaluation", "_validation", regex=False)
    preds_pri = test_pri[["id", "date", "demand"]]

    # 01-28: validation
    # 29-56: evaluation
    preds_val = _pivot_forecasts(preds_pub[preds_pub["date"] < validation_end])
    preds_eval = _pivot_forecasts(preds_pri)

    logger.info(f'Predictions validation: {preds_val.shape}')
    logger.info(f'Predictions evaluation: {preds_eval.shape}')

    preds_val = preds_val[preds_val['id'].str.endswith("validation")]
    preds_eval = preds_eval[preds_eval['id'].str.endswith("evaluation")]

    # The inner merge keeps only ids that have predictions, in submission order.
    vals = submission[["id"]].merge(preds_val, how="inner", on="id")
    evals = submission[["id"]].merge(preds_eval, how="inner", on="id")

    final = pd.concat([vals, evals])

    logger.info("Predictions complete.")
    return final

In [None]:
kkk  # NOTE(review): undefined name -- presumably a deliberate NameError to stop "Run All" at this cell; confirm

In [None]:
# Attach the predictions to their (id, date) templates and assemble the submission.
public_predictions = id_date_pub.assign(demand=preds_pub)
private_predictions = id_date_pri.assign(demand=preds_pri)
output = make_submission(public_predictions, private_predictions, submission)

In [None]:
# Eyeball the first and last rows of the submission.
output.head()
output.tail()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
output.to_csv("submission_private3.csv", index=False)