In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
from pathlib import Path

In [3]:
from fastai.vision import *

In [4]:
pd.options.display.max_columns = 50

## Set up the path

In [5]:
path = Path('/kaggle/m5_forecasting/')
assert(path.exists())

In [6]:
def ls(self):
    "List the entries of this directory as a list of `Path` objects."
    return [entry for entry in self.iterdir()]
# Attach the helper to every `Path` so `path.ls()` works anywhere in the notebook.
Path.ls = ls

In [7]:
path.ls()

[PosixPath('/kaggle/m5_forecasting/sales_train_validation.csv'),
 PosixPath('/kaggle/m5_forecasting/walmartTrends0.csv'),
 PosixPath('/kaggle/m5_forecasting/m5_model.lgb'),
 PosixPath('/kaggle/m5_forecasting/calendar.csv'),
 PosixPath('/kaggle/m5_forecasting/sample_submission.csv'),
 PosixPath('/kaggle/m5_forecasting/sell_prices.csv')]

## Read Data

In [8]:
# Column dtypes used when reading sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}
# Column dtypes used when reading calendar.csv.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    'wm_yr_wk': 'int16',
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    'snap_TX': 'float32',
    'snap_WI': 'float32',
}

In [9]:
sales_train_validation = pd.read_csv(path/"sales_train_validation.csv", nrows=10)

In [10]:
def read_data():
    """Load the auxiliary tables that live under `path`.

    Returns a `(prices, cal, walmart_trends)` tuple of DataFrames read from
    sell_prices.csv (with PRICE_DTYPES), calendar.csv (with CAL_DTYPES) and
    walmartTrends0.csv (dtypes inferred by pandas).
    """
    sources = (
        ("sell_prices.csv", PRICE_DTYPES),
        ("calendar.csv", CAL_DTYPES),
        ("walmartTrends0.csv", None),  # dtype=None is the read_csv default
    )
    prices, cal, walmart_trends = (
        pd.read_csv(path/file_name, dtype=dtypes) for file_name, dtypes in sources
    )
    return prices, cal, walmart_trends

In [11]:
prices, cal, walmart_trends = read_data()

In [12]:
cal.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0


#### Pre-process Walmart Google Trends

In [13]:
walmart_trends.reset_index(inplace=True)
del walmart_trends['index']
walmart_trends.columns

Index(['Week', 'Walmart: (United States)'], dtype='object')

In [14]:
walmart_trends["date"] = pd.to_datetime(walmart_trends["Week"], infer_datetime_format=True)

In [15]:
del walmart_trends['Week']
walmart_trends.columns

Index(['Walmart: (United States)', 'date'], dtype='object')

In [16]:
walmart_trends['date']

0     2010-12-05
1     2010-12-12
2     2010-12-19
3     2010-12-26
4     2011-01-02
         ...    
313   2016-12-04
314   2016-12-11
315   2016-12-18
316   2016-12-25
317   2017-01-01
Name: date, Length: 318, dtype: datetime64[ns]

In [17]:
# The trends table has one row per week; repeat each weekly value for the seven
# consecutive days starting at that row's `date` so it can be joined to the daily calendar.
walmart_trends_expanded = [
    {'Walmart: (United States)': score, 'date': week_start + timedelta(days=offset)}
    for score, week_start in zip(walmart_trends['Walmart: (United States)'], walmart_trends['date'])
    for offset in range(7)
]
walmart_trends_df = pd.DataFrame(walmart_trends_expanded)

#### Pre-process calendar

In [18]:
cal["date"] = pd.to_datetime(cal["date"], infer_datetime_format=True)

In [19]:
cal['date'].min(), cal['date'].max(), walmart_trends['date'].min(), walmart_trends['date'].max()

(Timestamp('2011-01-29 00:00:00'),
 Timestamp('2016-06-19 00:00:00'),
 Timestamp('2010-12-05 00:00:00'),
 Timestamp('2017-01-01 00:00:00'))

In [20]:
cal['walmart_google_trends'] = pd.merge(cal, walmart_trends_df, how='left', left_on='date', right_on='date')['Walmart: (United States)']

In [21]:
cal.head(40)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,walmart_google_trends
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0,19
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0,20
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0,20
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0,20
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0,20
5,2011-02-03,11101,Thursday,6,2,2011,d_6,,,,,1.0,1.0,1.0,20
6,2011-02-04,11101,Friday,7,2,2011,d_7,,,,,1.0,0.0,0.0,20
7,2011-02-05,11102,Saturday,1,2,2011,d_8,,,,,1.0,1.0,1.0,20
8,2011-02-06,11102,Sunday,2,2,2011,d_9,SuperBowl,Sporting,,,1.0,1.0,1.0,20
9,2011-02-07,11102,Monday,3,2,2011,d_10,,,,,1.0,1.0,0.0,20


In [22]:
def numericalize(df, type_map):
    """Replace the categorical columns of `df` with zero-based int16 codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Only columns listed as "category" in `type_map` are touched.
    type_map : dict
        Mapping of column name -> dtype string (e.g. PRICE_DTYPES / CAL_DTYPES).

    Returns
    -------
    None. `df` is mutated.

    Notes
    -----
    Columns that are no longer categorical are skipped, so calling this twice on the
    same frame (e.g. re-running the notebook cell) is a no-op instead of raising
    ``AttributeError`` from the ``.cat`` accessor.
    """
    for col, col_dtype in type_map.items():
        if col_dtype != "category":
            continue
        # Already converted on a previous call: `.cat` would raise on an integer column.
        if str(df[col].dtype) != "category":
            continue
        codes = df[col].cat.codes.astype('int16')
        # Missing values have code -1; subtracting the minimum shifts them to 0 so that
        # every code is non-negative (columns without NaN are left unchanged).
        df[col] = codes - codes.min()

In [23]:
numericalize(prices, PRICE_DTYPES)
numericalize(cal, CAL_DTYPES)

In [24]:
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [25]:
# Number of days to forecast.
pred_days = 28
# Days of history kept in front of each forecast day when rebuilding the lag and
# rolling features at prediction time (2 * pred_days + 1 = 57).
max_lags = pred_days * 2 + 1
print('max_lags', max_lags)
# Names of the daily sales columns ("d_1", "d_2", ...), taken from the CSV header
# by reading only two rows.
num_cols = [c for c in pd.read_csv(path/"sales_train_validation.csv", nrows=2).columns if c.find('d_') == 0]
# Count of daily sales columns; used by read_dt as the number of the last training day.
tr_last = len(num_cols)
# Identifier columns of the sales file; all but 'id' are label-encoded in read_dt.
catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
# For more training data use a lower value
FIRST_DAY=1

max_lags 57


In [26]:
def read_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales table joined with calendar and price data.

    Parameters
    ----------
    is_train : bool
        True  -> read days from `first_day` up to `tr_last`.
        False -> read only the last `max_lags` days (or from `first_day`, whichever is
                 later) and append `pred_days` empty (NaN) future days to be predicted.
    nrows : int or None
        Forwarded to `pd.read_csv` to limit the number of series read.
    first_day : int
        Earliest day number to load.

    Returns
    -------
    pandas.DataFrame with one row per (id, day): the id columns, 'd', 'sales' and the
    columns of `cal` and `prices`. Both merges use the pandas default (inner) join, so
    rows without a matching calendar or price entry are dropped.
    """
    earliest = 1 if is_train else tr_last - max_lags
    start_day = max(earliest, first_day)
    print('start_day', start_day)

    day_cols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    encoded_cols = [name for name in catcols if name != 'id']
    dtype = dict.fromkeys(num_cols, 'float32')
    dtype.update(dict.fromkeys(encoded_cols, 'category'))
    dt = pd.read_csv(path/"sales_train_validation.csv", nrows=nrows,
                     usecols=catcols + day_cols, dtype=dtype)

    # Label-encode every id column except the string 'id' itself.
    for name in encoded_cols:
        codes = dt[name].cat.codes.astype('int16')
        dt[name] = codes - codes.min()

    # Placeholder columns for the days that will be forecast.
    if not is_train:
        for offset in range(pred_days):
            dt[f'd_{tr_last + 1 + offset}'] = np.nan

    sales_cols = [name for name in dt.columns if name.startswith("d_")]
    long_dt = dt.melt(id_vars=catcols, value_vars=sales_cols, var_name='d', value_name='sales')
    with_calendar = long_dt.merge(cal, on='d', copy=False)
    return with_calendar.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], copy=False)

In [27]:
%%time

dt = read_dt(first_day=FIRST_DAY)

start_day 1
CPU times: user 15.2 s, sys: 2.66 s, total: 17.9 s
Wall time: 17.9 s


In [28]:
dt.shape

(46027957, 23)

In [29]:
dt

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,walmart_google_trends,sell_price
0,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1,12.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,19,0.46
1,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_2,15.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,20,0.46
2,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_3,0.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,20,0.46
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_4,0.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,20,0.46
4,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_5,0.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,20,0.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46027952,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,25,3.98
46027953,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.28
46027954,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.28
46027955,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.00


In [30]:
dt.date.min(), dt.date.max()

(Timestamp('2011-01-29 00:00:00'), Timestamp('2016-04-24 00:00:00'))

## Create features

In [31]:
def create_features(dt, lags=(7, 28), wins=None):
    """Add per-series lag and rolling-mean features to `dt`, in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain 'id' and 'sales'. Shifts are positional within each id, so the
        rows of every id are expected to already be in chronological order.
    lags : sequence of int
        Lags (in rows) for which a `sales_lag_{lag}` column is created.
        Defaults to (7, 28), the values used when the model was trained.
    wins : sequence of int or None
        Rolling-window lengths. None (default) reuses `lags`, matching the
        original behaviour.

    Adds
    ----
    sales_lag_{lag}     : sales shifted by `lag` rows within each id.
    rmean_{lag}_{win}   : mean of `sales_lag_{lag}` over the last `win` rows of the id.

    Returns None; `dt` is mutated. Leading rows of each id get NaN features.
    """
    lags = list(lags)
    wins = lags if wins is None else list(wins)
    lag_cols = [f'sales_lag_{lag}' for lag in lags]

    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[['id', 'sales']].groupby('id')['sales'].shift(lag)

    # Window is the outer loop so the column order stays rmean_{lag}_{win} grouped by window.
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            grouped = dt[['id', lag_col]].groupby('id')[lag_col]
            dt[f'rmean_{lag}_{win}'] = grouped.transform(lambda x : x.rolling(win).mean())

In [32]:
%%time
create_features(dt)

CPU times: user 1min 36s, sys: 6.62 s, total: 1min 43s
Wall time: 1min 43s


In [33]:
# Optional extra datetime attributes; the full list below is currently disabled.
# attr = ['Dayofyear',
#             'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
attr = []
# Maps output column name -> name of the `Series.dt` attribute that prepare_date_cols
# reads when the column is not already present in the frame.
date_features = {
    "wday": "weekday",
    "week": "weekofyear",
    "month": "month",
    "quarter": "quarter",
    "year": "year",
    "mday": "day"
}

# Every entry of `attr` is registered under its lower-cased name for both key and value.
for f in attr:
    date_features[f.lower()] = f.lower()
    
date_features

{'wday': 'weekday',
 'week': 'weekofyear',
 'month': 'month',
 'quarter': 'quarter',
 'year': 'year',
 'mday': 'day'}

In [34]:
def prepare_date_cols(dt, features=None):
    """Ensure the date-derived feature columns exist in `dt` as int16, in place.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain a datetime column named 'date'.
    features : dict or None
        Mapping of output column name -> `Series.dt` attribute name.
        None (default) uses the notebook-level `date_features` mapping.

    For each entry: if the column already exists it is only cast to int16 (its values
    are kept as they are); otherwise it is computed from `dt['date']`.

    Returns None; `dt` is mutated.
    """
    if features is None:
        features = date_features
    for date_feature_name, date_feature_func in features.items():
        if date_feature_name in dt.columns:
            dt[date_feature_name] = dt[date_feature_name].astype('int16')
            continue
        accessor = dt['date'].dt
        try:
            values = getattr(accessor, date_feature_func)
        except AttributeError:
            # `.dt.week` / `.dt.weekofyear` were removed in pandas 2.0; the ISO week
            # from `isocalendar()` is the documented replacement.
            if date_feature_func not in ('week', 'weekofyear'):
                raise
            values = accessor.isocalendar().week
        dt[date_feature_name] = values.astype('int16')

In [35]:
%%time
prepare_date_cols(dt)

CPU times: user 7.87 s, sys: 199 ms, total: 8.07 s
Wall time: 8.07 s


In [36]:
dt.dropna(inplace = True)

In [37]:
dt

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,walmart_google_trends,sell_price,sales_lag_7,sales_lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
617364,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,19,0.42,0.0,0.0,2.000000,4.000000,1.642857,4.535714,12,1,25
617371,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,19,1.77,0.0,0.0,2.000000,2.000000,2.142857,2.178571,12,1,25
617378,HOBBIES_1_010_CA_1_validation,9,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,19,3.17,0.0,0.0,0.000000,0.142857,0.035714,0.214286,12,1,25
617385,HOBBIES_1_012_CA_1_validation,11,0,0,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,19,6.27,1.0,1.0,0.428571,0.857143,0.535714,0.678571,12,1,25
617392,HOBBIES_1_015_CA_1_validation,14,0,0,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,19,0.72,2.0,1.0,5.428571,8.000000,6.142857,4.321429,12,1,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46027952,FOODS_3_825_WI_3_validation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,25,3.98,0.0,1.0,1.000000,0.714286,0.928571,1.250000,16,2,24
46027953,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.28,0.0,2.0,0.857143,1.142857,1.035714,1.107143,16,2,23
46027954,FOODS_3_826_WI_3_validation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.28,1.0,4.0,0.714286,1.571429,1.035714,1.250000,16,2,24
46027955,FOODS_3_827_WI_3_validation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,25,1.00,0.0,0.0,0.000000,2.285714,1.821429,1.785714,16,2,23


In [38]:
dt[dt.date == datetime(2014, 1, 1)].head(30)

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,walmart_google_trends,sell_price,sales_lag_7,sales_lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
21099166,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,d_1069,1.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,8.26,0.0,1.0,0.714286,1.0,0.857143,0.714286,1,1,1
21099173,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,3.97,0.0,0.0,0.285714,0.0,0.357143,0.071429,1,1,1
21099180,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_1069,2.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,4.64,0.0,1.0,1.857143,3.142857,2.464286,2.107143,1,1,1
21099187,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_1069,1.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,3.08,0.0,1.0,1.428571,1.0,1.071429,0.75,1,1,1
21099194,HOBBIES_1_006_CA_1_validation,5,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,1.0,0.0,1.0,0.285714,0.714286,0.607143,1.035714,1,1,1
21099201,HOBBIES_1_007_CA_1_validation,6,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,7.88,0.0,0.0,0.714286,0.142857,0.392857,0.392857,1,1,1
21099208,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1069,3.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,0.46,0.0,11.0,5.428571,5.714286,6.214286,8.25,1,1,1
21099215,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,1.77,0.0,1.0,1.571429,1.285714,1.428571,1.464286,1,1,1
21099222,HOBBIES_1_010_CA_1_validation,9,0,0,0,0,d_1069,3.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,2.97,0.0,2.0,0.571429,1.142857,0.714286,0.785714,1,1,1
21099229,HOBBIES_1_011_CA_1_validation,10,0,0,0,0,d_1069,0.0,2014-01-01,11349,6,5,1,2014,19,2,0,0,1.0,1.0,0.0,32,3.48,0.0,0.0,0.0,0.0,0.107143,0.107143,1,1,1


## Training Preparation

In [39]:
dt.columns

Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'walmart_google_trends', 'sell_price',
       'sales_lag_7', 'sales_lag_28', 'rmean_7_7', 'rmean_28_7', 'rmean_7_28',
       'rmean_28_28', 'week', 'quarter', 'mday'],
      dtype='object')

In [40]:
cat_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'event_name_1', 
             'event_type_1', 'event_name_2', 'event_type_2']
ignore_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
train_cols = [c for c in dt.columns if c not in ignore_cols]
X = dt[train_cols]
Y = dt['sales']

In [41]:
# Hold out 10% of the rows for validation.
valid_size = int(X.shape[0] * 0.1)
# Fixed seed so the same rows are selected on every run.
np.random.seed(777)

# Validation rows are drawn uniformly at random (without replacement) from the index labels;
# everything else is used for training.
# NOTE(review): this is a row-level random split, so validation rows are interleaved in
# time with training rows - confirm that is the intended validation scheme.
valid_idx = np.random.choice(X.index.values, valid_size, replace=False)
train_idx = np.setdiff1d(X.index.values, valid_idx)
# Sanity check: the two index sets together account for every row of X.
assert valid_idx.size + train_idx.size == X.shape[0]

In [42]:
train_data = lgb.Dataset(X.loc[train_idx], Y.loc[train_idx], categorical_feature=cat_feats, free_raw_data=False)
valid_data = lgb.Dataset(X.loc[valid_idx], Y.loc[valid_idx], categorical_feature=cat_feats, free_raw_data=False)

In [43]:
del dt, X, Y, valid_idx, train_idx
gc.collect()

120

In [44]:
del walmart_trends
gc.collect()

20

## Training

In [45]:
# LightGBM training parameters passed to lgb.train.
lgb_params = {
        # Poisson regression objective for the sales target.
        "objective" : "poisson",
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        # Row subsampling: per the LightGBM parameter docs "sub_row" is an alias of
        # bagging_fraction; bagging_freq = 1 re-samples at every iteration.
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
#         "nthread" : 4
        # Metric reported on the validation set during training.
        "metric": ["rmse"],
        'verbosity': 1,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
}

In [46]:
%%time
m_lgb = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=20, categorical_feature=cat_feats, num_boost_round=1200)

[20]	valid_0's rmse: 3.19667
[40]	valid_0's rmse: 2.64326
[60]	valid_0's rmse: 2.48965
[80]	valid_0's rmse: 2.44402
[100]	valid_0's rmse: 2.43129
[120]	valid_0's rmse: 2.42241
[140]	valid_0's rmse: 2.41537
[160]	valid_0's rmse: 2.40767
[180]	valid_0's rmse: 2.40033
[200]	valid_0's rmse: 2.39333
[220]	valid_0's rmse: 2.38787
[240]	valid_0's rmse: 2.38269
[260]	valid_0's rmse: 2.37807
[280]	valid_0's rmse: 2.37291
[300]	valid_0's rmse: 2.36928
[320]	valid_0's rmse: 2.36509
[340]	valid_0's rmse: 2.36182
[360]	valid_0's rmse: 2.35768
[380]	valid_0's rmse: 2.35381
[400]	valid_0's rmse: 2.35122
[420]	valid_0's rmse: 2.34799
[440]	valid_0's rmse: 2.34483
[460]	valid_0's rmse: 2.3428
[480]	valid_0's rmse: 2.34039
[500]	valid_0's rmse: 2.33791
[520]	valid_0's rmse: 2.33636
[540]	valid_0's rmse: 2.33455
[560]	valid_0's rmse: 2.33232
[580]	valid_0's rmse: 2.33013
[600]	valid_0's rmse: 2.32865
[620]	valid_0's rmse: 2.32654
[640]	valid_0's rmse: 2.32444
[660]	valid_0's rmse: 2.32291
[680]	valid_0's

In [47]:
str(path/'m5_model.lgb')

'/kaggle/m5_forecasting/m5_model.lgb'

In [48]:
m_lgb.save_model(str(path/'m5_model.lgb'))

<lightgbm.basic.Booster at 0x7fb1f7769810>

In [49]:
m_lgb = lgb.Booster(model_file=str(path/'m5_model.lgb'))

## Prediction

In [50]:
from tqdm.notebook import tqdm

In [51]:
# Multipliers applied to the raw model predictions; one forecast pass is run per
# alpha and the passes are averaged with `weights`.
alphas = [1.028, 1.023, 1.018]
weights = [1 / len(alphas)] * len(alphas)
# Compare with a tolerance: an exact `== 1.0` check on summed floats breaks for many
# ensemble sizes (e.g. ten weights of 0.1 sum to 0.9999999999999999).
assert np.isclose(sum(weights), 1.0)
# First day to forecast.
fday = datetime(2016, 4, 25)
# Cross-check against the calendar: d_1 is 2011-01-29, so d_1914 must be `fday`.
assert datetime(2011, 1, 29) + timedelta(days=1914 - 1) == fday

In [52]:
# Recursive 28-day forecast: one pass per alpha, then a weighted blend of the passes.
cols = [f'F{i}' for i in range(1, pred_days + 1)]
sub = pd.DataFrame()
# Last `max_lags` days of history plus `pred_days` empty future days.
te = read_dt(False)

for icount, (alpha, weight) in tqdm(enumerate(zip(alphas, weights)), total=len(alphas)):
    for tdelta in tqdm(range(0, pred_days), total=pred_days):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Only the trailing window is needed to rebuild the lag / rolling features for `day`.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_features(tst)
        prepare_date_cols(tst)
        tst = tst.loc[tst.date == day, train_cols]
        # Write the prediction back so later days can use it as a lagged value.
        te.loc[te.date == day, 'sales'] = alpha * m_lgb.predict(tst) # magic multiplier by kyakovlev

    # Pivot the forecast rows into the wide F1..F28 submission layout.
    te_sub = te.loc[te.date >= fday, ['id', 'sales']].copy()
    te_sub['F'] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()['sales'][cols]
    te_sub.fillna(0., inplace=True)
    te_sub.sort_values(["id"], inplace=True)
    te_sub.reset_index(drop=False, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)

    # Accumulate the weighted blend; rows line up because every pass is sorted by id
    # and re-indexed identically.
    if icount == 0:
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols] * weight

# The evaluation rows reuse the validation forecasts under renamed ids.
sub2 = sub.copy()
# "validation$" is a regular expression (anchored at the end of the id). pandas 2.x
# defaults to regex=False, which would leave every id unchanged, so request it explicitly.
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission.csv",index=False)

start_day 1856


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00



HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00



HBox(children=(FloatProgress(value=0.0, max=28.0), HTML(value='')))

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00




In [53]:
!wc -l submission.csv

60981 submission.csv


In [54]:
pd.read_csv('submission.csv')

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.941728,0.882507,0.880217,0.831535,1.137847,1.199630,1.199561,0.998097,0.970100,0.975499,1.002671,1.130980,1.314221,1.123562,0.957760,0.887815,0.916123,0.926501,1.079663,1.332937,1.235079,0.945463,0.849102,0.836711,0.851794,1.077530,1.271474,1.228209
1,FOODS_1_001_CA_2_validation,1.116155,1.121406,1.005362,1.243528,1.342176,1.536274,1.441502,1.002968,1.016579,0.995612,1.037407,1.257823,1.593927,1.308953,1.145986,1.113787,1.108406,1.104808,1.338410,1.855680,1.551243,1.132974,1.083434,1.084167,1.100001,1.327303,1.821015,1.589146
2,FOODS_1_001_CA_3_validation,1.124378,1.098403,1.033801,0.979253,1.052553,1.114425,1.229024,1.101787,1.100687,1.011511,1.040616,1.157005,1.384097,1.179016,1.091324,1.078977,1.046443,1.058235,1.149064,1.577763,1.620408,1.090194,1.030493,0.971517,0.976809,1.094838,1.336345,1.241150
3,FOODS_1_001_CA_4_validation,0.417476,0.384001,0.379374,0.375521,0.462624,0.456527,0.498086,0.426118,0.442529,0.432075,0.430104,0.423184,0.443226,0.405695,0.377932,0.383694,0.404324,0.434475,0.477263,0.516618,0.517843,0.384703,0.372949,0.373473,0.400301,0.442286,0.496618,0.491440
4,FOODS_1_001_TX_1_validation,0.177985,0.176080,0.171513,0.175862,0.165915,0.180612,0.239928,0.549013,0.492527,0.502944,0.548088,0.575170,0.584624,0.476078,0.407418,0.585339,0.367276,0.387255,0.392921,0.401339,0.444036,0.362910,0.329429,0.333908,0.295633,0.336040,0.384940,0.390658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,HOUSEHOLD_2_516_TX_2_evaluation,0.259264,0.239845,0.260041,0.258373,0.317446,0.402302,0.378582,0.244213,0.247966,0.241316,0.263087,0.315754,0.353075,0.313281,0.249224,0.243222,0.245425,0.253862,0.289157,0.373888,0.367515,0.251593,0.238225,0.248835,0.243609,0.287467,0.358562,0.334102
60976,HOUSEHOLD_2_516_TX_3_evaluation,0.172544,0.158776,0.162784,0.158033,0.180024,0.216313,0.172027,0.125376,0.117394,0.107958,0.122272,0.131778,0.168704,0.144569,0.120562,0.122175,0.125228,0.129032,0.148662,0.179969,0.171849,0.143287,0.136366,0.134294,0.136923,0.155786,0.183672,0.175266
60977,HOUSEHOLD_2_516_WI_1_evaluation,0.095257,0.093487,0.093791,0.094461,0.098556,0.114941,0.102419,0.095982,0.096307,0.092880,0.095864,0.117852,0.148881,0.117244,0.097578,0.093855,0.096105,0.098321,0.121193,0.141919,0.134117,0.096795,0.093447,0.094031,0.096203,0.123225,0.146554,0.136965
60978,HOUSEHOLD_2_516_WI_2_evaluation,0.044743,0.043204,0.042494,0.093667,0.109824,0.115904,0.101831,0.090910,0.090641,0.086600,0.114964,0.131340,0.134598,0.115511,0.109735,0.109685,0.107787,0.112179,0.138464,0.141316,0.123347,0.114520,0.107285,0.107206,0.086273,0.102099,0.110596,0.100532
