In [2]:
import numpy as np
import pandas as pd

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# Loading Data

In [4]:
data_dir = "./data/"
calendar = pd.read_csv(data_dir+'calendar.csv')
calendar = reduce_mem_usage(calendar)
print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))

sell_prices = pd.read_csv(data_dir+'sell_prices.csv')
sell_prices = reduce_mem_usage(sell_prices)
print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))

sales = pd.read_csv(data_dir+'sales_train_validation.csv')
print('Sales train validation has {} rows and {} columns'.format(sales.shape[0], sales.shape[1]))

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Sales train validation has 30490 rows and 1919 columns


In [5]:
idCols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
product = sales[idCols].drop_duplicates()

In [6]:
import warnings
warnings.filterwarnings('ignore')

submission = pd.read_csv(data_dir+'sample_submission.csv')
validate_submission = submission[submission.id.str.endswith('validation')]
eval_submission = submission[submission.id.str.endswith('evaluation')]

# change column name
newcolumns = ["id"] + ["d_{}".format(i) for i in range(1914, 1914+28)]
validate_submission.columns = newcolumns
validate_submission = validate_submission.merge(product, how = 'left', on = 'id')

newcolumns = ["id"] + ["d_{}".format(i) for i in range(1942, 1942+28)]
eval_submission.columns = newcolumns
eval_submission['id'] = eval_submission['id'].str.replace('_evaluation','_validation')
eval_submission = eval_submission.merge(product, how = 'left', on = 'id')

# Select Subset of Sales

In [7]:
idCols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Use only one year of data
DAYS = 365; LAST_DAY=1913
dayCols = ["d_{}".format(i) for i in range(LAST_DAY-DAYS+1, LAST_DAY+1)]
sales = sales[idCols+dayCols]
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1549,d_1550,d_1551,d_1552,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,1,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,3,0,2,1,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,2,1,2,...,2,1,1,0,1,1,2,2,2,4


# Melted & Merge

In [8]:


def melted(df, name=""):
    df = pd.melt(df, id_vars = idCols, var_name = 'day', value_name = 'demand')
    print('{}: {} rows and {} columns'.format(name, df.shape[0], df.shape[1]))
    df = reduce_mem_usage(df)
    # df.to_csv(name+".csv")
    return df

melted_sales = melted(sales)
melted_sales["part"] = "train"
melted_validate = melted(validate_submission)
melted_validate["part"] = "validate"
melted_eval = melted(eval_submission)
melted_eval["part"] = "evaluate"

data = pd.concat([melted_sales, melted_validate, melted_eval], axis = 0)
data.head()

: 11128850 rows and 8 columns
Mem. usage decreased to 615.57 Mb (9.4% reduction)
: 853720 rows and 8 columns
Mem. usage decreased to 46.41 Mb (10.9% reduction)
: 853720 rows and 8 columns
Mem. usage decreased to 46.41 Mb (10.9% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,3,train
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1549,0,train


In [9]:
# merge with calendar, sell_prices

calendar.drop(['weekday', 'wday', 'month', 'year'], inplace = True, axis = 1)
data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
data.drop(['d', 'day'], inplace = True, axis = 1)

# get the sell price data (this feature should be very important)
data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

Our final dataset to train has 12836290 rows and 18 columns


In [10]:
# data.to_csv("final_dataset.csv")
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,,,,,0,0,0,8.257812
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,,,,,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,,,,,0,0,0,2.970703
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,3,train,2015-04-26,11513,,,,,0,0,0,4.640625
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,,,,,0,0,0,2.880859


In [11]:
import gc
gc.collect()

560

# Feature Engineering

In [12]:
# Encode Events Columns
import fasttext

# Use Fasttext to embed event's name and use PCA to reduce word vectors's dimention

# model = fasttext.load_model("models/requirement_text.bin")
# eventNames = data["event_name_1"].unique()
# eventNames = eventNames[1:]
# eventNames

In [13]:
_temp = data.drop(["event_name_1", "event_name_2", "event_type_1", "event_type_2"], axis=1)
_temp.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,0,0,0,8.257812
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,0,0,0,2.970703
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,3,train,2015-04-26,11513,0,0,0,4.640625
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,train,2015-04-26,11513,0,0,0,2.880859


In [14]:
from sklearn import preprocessing
def encode_categorical(df, cols):
    for col in cols:
        # Leave NaN as it is.
        le = preprocessing.LabelEncoder()
        not_null = df[col][df[col].notnull()]
        df[col] = pd.Series(le.fit_transform(not_null), index=not_null.index)

    return df


_temp = encode_categorical(_temp, ["id", "cat_id", "dept_id", "item_id", "state_id", "store_id"]).pipe(reduce_mem_usage)

Mem. usage decreased to 501.91 Mb (28.1% reduction)


In [15]:
def datetime_features(df):
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    attrs = [
        "year", 
        "quarter", 
        "month", 
        "week", 
        "day", 
        "dayofweek", 
        "is_year_end", 
        "is_year_start", 
        "is_quarter_end", 
        "is_quarter_start", 
        "is_month_end",
        "is_month_start",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        df[attr] = getattr(df['date'].dt, attr).astype(dtype)
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    
    return df

_temp = datetime_features(_temp)
_temp.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,...,week,day,dayofweek,is_year_end,is_year_start,is_quarter_end,is_quarter_start,is_month_end,is_month_start,is_weekend
0,14370,1437,3,1,0,0,0,train,2015-04-26,11513,...,17,26,6,0,0,0,0,0,0,1
1,14380,1438,3,1,0,0,0,train,2015-04-26,11513,...,17,26,6,0,0,0,0,0,0,1
2,14390,1439,3,1,0,0,0,train,2015-04-26,11513,...,17,26,6,0,0,0,0,0,0,1
3,14400,1440,3,1,0,0,3,train,2015-04-26,11513,...,17,26,6,0,0,0,0,0,0,1
4,14410,1441,3,1,0,0,0,train,2015-04-26,11513,...,17,26,6,0,0,0,0,0,0,1


In [16]:
_temp.sort_values(by=['id', "date"], inplace=True)
X_train = _temp[_temp["part"]=="train"]
X_val = _temp[_temp["part"]=="validate"]
X_eval = _temp[_temp["part"]=="evaluate"]

print(len(X_train), len(X_val), len(X_eval))

11128850 853720 853720


In [17]:
import math

def numerical_feature(df):
    for i in range(1, 8):
        df[f"shifted_t{i}"] = df.groupby('id')["demand"].shift(i)

    for i in range(1, 3):
        df[f"shifted_price_t{i}"] = df.groupby('id')["sell_price"].shift(i)

    df["price_changes"] = df["shifted_price_t2"] - df["shifted_price_t1"]


    for i in [3, 7, 15, 30]:
        df[f"rolling_mean_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i, min_periods=1).mean()
        df[f"rolling_std_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i, min_periods=1).std()
        df[f"rolling_skew_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i, min_periods=1).skew()
        df[f"rolling_kurt_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i).kurt()
        df[f"rolling_max_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i, min_periods=1).max()
        df[f"rolling_min_i{i}"] = df.groupby('id')["demand"].shift(1).rolling(i, min_periods=1).min()

        df[f"price_max_t{i}"] = df.groupby('id')["sell_price"].shift(1).rolling(i, min_periods=1).max()

    df["rolling_decay_mean"] = df["shifted_t1"].copy()
    for i in range(2, 8):
        df["rolling_decay_mean"] += math.pow(0.9, i-1) * df[f"shifted_t{i}"]
    df["rolling_decay_mean"] = df["rolling_decay_mean"]/7.0

    weekly_sale = df.groupby(['id', "wm_yr_wk"])["demand"].sum().reset_index()
    weekly_sale["prev_weekly_sale"] = weekly_sale.groupby('id')["demand"].shift(1)
    weekly_sale.drop(["demand"], axis=1, inplace=True)
    df = pd.merge(df, weekly_sale, on=["id", "wm_yr_wk"], how="left")

    df["acc_sale_by_week"] = df.groupby(['id', "wm_yr_wk"])["demand"].cumsum()
    
    return df



## Avoid Memory crash
I built features on a tiny chunk then aggregate together in my local machine

In [17]:
# It's too big to run
# X = numerical_feature(X_train)
# !mkdir feat

In [18]:
# import time

# ids = X_train["id"].unique()


# X = []
# start_time = time.time()
# for idx, rowId in enumerate(ids):
#     if idx%100==1:
#         elapsed_time = time.time() - start_time
#         start_time = time.time()
#         print(idx, elapsed_time)
        
#     tinyX = X_train[X_train["id"]==rowId]
#     _x = numerical_feature(tinyX)
    
#     _x.to_csv(f"feat/numeric_feat{idx}.csv")
#     X.append(_x)
    

# X = pd.concat(X) 
# Y = X["demand"]
# X = X.drop(["demand", "part", "date", "wm_yr_wk"], axis=1)

In [19]:
# !tar cvzf feat.tar.gz ./feat

In [20]:
# # Download what you have done!!
# from IPython.display import FileLink
# FileLink(r'feat.tar.gz')

### Load dataframe from disk

In [3]:
from os import listdir
from os.path import isfile, join

def get_feats(path):
    data = []
    cc = 0
    for f in listdir(path):
        p = join(path, f)
        if isfile(p) and f.endswith('.csv'):
            cc +=1
            d = pd.read_csv(p)
            
            d.drop(["rolling_kurt_i3", "rolling_skew_i3", "rolling_skew_i7", "rolling_kurt_i7", "rolling_skew_i15", "rolling_kurt_i15"], axis=1, inplace=True)
            d.dropna(subset=["sell_price"], inplace=True)
            d.fillna(d.mean(), inplace=True)
            
            data.append(d)

        if cc%1000==0:
            d = pd.concat(data)
            d = reduce_mem_usage(d)
            data = [d]
            print(cc)
            
#             if cc > 5000:
#                 break
#         break
            
    return data

data = get_feats("./feat/")

Mem. usage decreased to 44.16 Mb (75.2% reduction)
1000
Mem. usage decreased to 88.32 Mb (75.2% reduction)
2000
Mem. usage decreased to 132.38 Mb (75.2% reduction)
3000
Mem. usage decreased to 176.56 Mb (75.2% reduction)
4000
Mem. usage decreased to 220.73 Mb (75.2% reduction)
5000
Mem. usage decreased to 264.93 Mb (75.2% reduction)
6000
Mem. usage decreased to 309.10 Mb (75.2% reduction)
7000
Mem. usage decreased to 353.30 Mb (75.2% reduction)
8000
Mem. usage decreased to 397.51 Mb (75.2% reduction)
9000
Mem. usage decreased to 441.71 Mb (75.2% reduction)
10000
Mem. usage decreased to 485.86 Mb (75.2% reduction)
11000
Mem. usage decreased to 530.01 Mb (75.2% reduction)
12000
Mem. usage decreased to 574.15 Mb (75.2% reduction)
13000
Mem. usage decreased to 618.27 Mb (75.2% reduction)
14000
Mem. usage decreased to 662.35 Mb (75.2% reduction)
15000
Mem. usage decreased to 706.20 Mb (75.2% reduction)
16000
Mem. usage decreased to 750.30 Mb (75.2% reduction)
17000
Mem. usage decreased to 7

In [4]:
data = pd.concat(data)
data = reduce_mem_usage(data)

Mem. usage decreased to 1344.01 Mb (75.2% reduction)


In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

nacol = data.isna().sum()*100/len(data)
nacol[nacol>1]
# data.fillna(0, inplace=True)

Series([], dtype: float64)

In [6]:
Y = data["demand"]
X = data.drop(["demand", "part", "date", "wm_yr_wk"], axis=1)
X.head()

del data

# Train Model

In [7]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb


def train(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
    
    params = {
        'boosting_type': 'gbdt',
#         "device_type": "gpu",
        'metric': 'rmse',
        'objective': 'poisson',
        'n_jobs': -1,
        'seed': 20,
        'learning_rate': 0.1,
        'alpha': 0.1,
        'lambda': 0.1,
        'bagging_fraction': 0.66,
        'bagging_freq': 2, 
        'colsample_bytree': 0.77}

    train_set = lgb.Dataset(x_train, y_train)
    test_set = lgb.Dataset(x_test, y_test)

    model = lgb.train(params, train_set, num_boost_round = 3000, early_stopping_rounds = 500, valid_sets = [train_set, test_set], verbose_eval = 100)
    # joblib.dump(model, 'lgbm_0.sav')

    val_pred = model.predict(x_test, num_iteration=model.best_iteration)
    val_score = np.sqrt(metrics.mean_squared_error(val_pred, y_test))
    return model, val_score

model, rmse = train(X, Y)

Training until validation scores don't improve for 500 rounds.
[100]	training's rmse: 1.77911	valid_1's rmse: 1.81286
[200]	training's rmse: 1.64378	valid_1's rmse: 1.74028
[300]	training's rmse: 1.46168	valid_1's rmse: 1.76294
[400]	training's rmse: 1.29264	valid_1's rmse: 1.7513
[500]	training's rmse: 1.17227	valid_1's rmse: 1.82154
[600]	training's rmse: 1.07968	valid_1's rmse: 1.87629
[700]	training's rmse: 0.980199	valid_1's rmse: 1.71716
[800]	training's rmse: 0.922672	valid_1's rmse: 1.71981
[900]	training's rmse: 0.87044	valid_1's rmse: 1.69372
[1000]	training's rmse: 0.829687	valid_1's rmse: 1.66126
[1100]	training's rmse: 0.789644	valid_1's rmse: 1.63849
[1200]	training's rmse: 0.757094	valid_1's rmse: 1.66
[1300]	training's rmse: 0.725263	valid_1's rmse: 1.67549
[1400]	training's rmse: 0.696191	valid_1's rmse: 1.68802
[1500]	training's rmse: 0.672171	valid_1's rmse: 1.68239
Early stopping, best iteration is:
[1092]	training's rmse: 0.793082	valid_1's rmse: 1.63651


In [20]:
# import pickle 
# dbfile = open('models/LightGBM3.pickle', 'wb') 
# pickle.dump(model, dbfile)
# dbfile.close() #Dont forget this 

In [18]:
import pickle
with open("models/LightGBM3.pickle", 'rb') as fin:
        model = pickle.load(fin)

In [20]:

# Predict function v1; it runs one itemId at a time => so slow
# import time
       
# def predict(model, X_train, X_test):
#     ids = X_train["id"].unique()
#     X_train.sort_values(by=['id', "date"], inplace=True)
    
#     o = []
#     col = ["id"] + ["F{}".format(i) for i in range(1, 29)]
    
#     for i in ids:

#         if i < 15000:
#           continue

#         start_time = time.time()
#         X1 = X_train[X_train["id"]==i].tail(30) #use only the lastest month
        
#         X2 = X_test[X_test["id"]==i]
#         X = X1
        
#         pred = []
#         for index, row in X2.iterrows():
#             X = X.append(row)
#             Xfeat = numerical_feature(X)
#             Xfeat = Xfeat.drop(["demand", "part", "date", "wm_yr_wk"], axis=1)
            
#             x = Xfeat.tail(1).copy()
#             val_pred = model.predict(x, num_iteration=model.best_iteration)
#             pred.append(val_pred[0])
        
#         d = [i]+pred
#         o.append(pd.DataFrame([d], columns=col))
        
#         if i%100==1:
#             elapsed_time = time.time() - start_time
#             start_time = time.time()
#             print(i, elapsed_time)
            
#             pd.concat(o).to_csv(dir+f"predicts/predict_{i}.csv")
#     o = pd.concat(o)
#     return o
    

# pred = predict(model, X_train, X_val)

In [56]:
import time

import datetime
import dateutil.relativedelta

def predict(model, X_train, X_test):
    X_train.sort_values(by=['id', "date"], inplace=True)
    X_test.sort_values(by=['id', "date"], inplace=True)
    
    lastrowdate = X_train.tail(1)["date"]
    lastmonth = lastrowdate - pd.DateOffset(months=1)
    lastmonthstr = str(lastmonth.values[0])
    
    
    X1 = X_train[X_train['date'] >= lastmonthstr] #last month
    ids = X_train["id"].unique()
    
    DATES = X_test["date"].unique()
    NDATE = len(DATES)
    
    col = ["id"] + ["F{}".format(i) for i in range(1, NDATE+1)]
    oarr = np.zeros((len(ids), NDATE+1))
    o = pd.DataFrame(oarr, columns=col)
    
    X = X1
    for idx, date in enumerate(DATES):
        print(idx, date)
        X2 = X_test[X_test["date"]==date]
        
        X = X.append(X2)
        
        print(X.shape)
        X.sort_values(by=['id', "date"], inplace=True)
        
        print("num feats START")
        Xfeat = numerical_feature(X)
        Xfeat.drop(["rolling_kurt_i3", "rolling_skew_i3", "rolling_skew_i7", "rolling_kurt_i7", "rolling_skew_i15", "rolling_kurt_i15"], axis=1, inplace=True)
#         for i in ids:
#             print(i)
#             subx = Xfeat[Xfeat["id"]==i]
#             subx.fillna(subx.mean(), inplace=True)
#             Xfeat[Xfeat["id"]==i] = subx
        
            
        print("num feats DONE")
        
        x = Xfeat[Xfeat["date"]==date]
        x = x.drop(["demand", "part", "date", "wm_yr_wk"], axis=1)

        val_pred = model.predict(x, num_iteration=model.best_iteration)
        
        print(idx, sum(val_pred))
        o[f"F{idx+1}"] = val_pred
        o["id"] = x["id"].values

        lastmonth = lastmonth + pd.DateOffset(days=1)
        X = X[X['date'] >= str(lastmonth.values[0])]
        
    return o
    

pred = predict(model, X_train, X_val)

0 2016-04-25T00:00:00.000000000
(1006170, 27)
num feats START
num feats DONE
0 0.15720656883761752
1 2016-04-26T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
1 0.09211935780883498
2 2016-04-27T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
2 0.1416950305479053
3 2016-04-28T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
3 0.2998779349846289
4 2016-04-29T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
4 0.2907232857226404
5 2016-04-30T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
5 0.20804799267701007
6 2016-05-01T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
6 0.20548802573853917
7 2016-05-02T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
7 0.21566393235259237
8 2016-05-03T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
8 0.2149132742359337
9 2016-05-04T00:00:00.000000000
(1006170, 66)
num feats START
num feats DONE
9 0.21379038599852487
10 2016-05-05T

In [25]:
pred.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,0,8.828821e-07,1.765517e-06,8.665143e-06,7e-06,7e-06,6e-06,9e-06,8e-06,8e-06,...,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
1,1,2.185887e-05,1.634261e-07,2.562603e-07,1.4e-05,1.3e-05,5e-06,5e-06,8e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.8e-05,1.8e-05
2,2,1.607828e-06,1.358829e-06,7.624862e-06,7e-06,8e-06,7e-06,9e-06,9e-06,8e-06,...,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
3,3,4.172301e-06,1.199648e-06,2.052618e-06,1e-05,1e-05,5e-06,5e-06,7e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
4,4,1.505217e-05,1.505217e-05,1.506388e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,...,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05


In [26]:
le = preprocessing.LabelEncoder()
le.fit_transform(data["id"])

pred["id"] = le.inverse_transform(pred["id"])

In [28]:
pred.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,8.828821e-07,1.765517e-06,8.665143e-06,7e-06,7e-06,6e-06,9e-06,8e-06,8e-06,...,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
1,FOODS_1_001_CA_2_validation,2.185887e-05,1.634261e-07,2.562603e-07,1.4e-05,1.3e-05,5e-06,5e-06,8e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.8e-05,1.8e-05
2,FOODS_1_001_CA_3_validation,1.607828e-06,1.358829e-06,7.624862e-06,7e-06,8e-06,7e-06,9e-06,9e-06,8e-06,...,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
3,FOODS_1_001_CA_4_validation,4.172301e-06,1.199648e-06,2.052618e-06,1e-05,1e-05,5e-06,5e-06,7e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
4,FOODS_1_001_TX_1_validation,1.505217e-05,1.505217e-05,1.506388e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,...,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05


In [29]:
dfeval = submission[submission.id.str.endswith('evaluation')]
df = pd.concat([pred, dfeval]) 
df.to_csv("sample_submission.csv")

In [30]:
df.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,8.828821e-07,1.765517e-06,8.665143e-06,7e-06,7e-06,6e-06,9e-06,8e-06,8e-06,...,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
1,FOODS_1_001_CA_2_validation,2.185887e-05,1.634261e-07,2.562603e-07,1.4e-05,1.3e-05,5e-06,5e-06,8e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.8e-05,1.8e-05
2,FOODS_1_001_CA_3_validation,1.607828e-06,1.358829e-06,7.624862e-06,7e-06,8e-06,7e-06,9e-06,9e-06,8e-06,...,1.7e-05,1.7e-05,1.8e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
3,FOODS_1_001_CA_4_validation,4.172301e-06,1.199648e-06,2.052618e-06,1e-05,1e-05,5e-06,5e-06,7e-06,7e-06,...,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.7e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05
4,FOODS_1_001_TX_1_validation,1.505217e-05,1.505217e-05,1.506388e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,...,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05


In [31]:
assert df.shape==submission.shape

# Internal Evaluation

My team decided to use the last 10 days of the training data as the internal test

In [32]:

intX_train = X_train[X_train['date'] < '2016-04-15']
intX_val = X_train[X_train['date'] >= '2016-04-15']
pred = predict(model, intX_train, intX_val)
pred["id"] = le.inverse_transform(pred["id"])

pred.to_csv("internal_submission.csv")

0 2016-04-15T00:00:00.000000000
1 2016-04-16T00:00:00.000000000
2 2016-04-17T00:00:00.000000000
3 2016-04-18T00:00:00.000000000
4 2016-04-19T00:00:00.000000000
5 2016-04-20T00:00:00.000000000
6 2016-04-21T00:00:00.000000000
7 2016-04-22T00:00:00.000000000
8 2016-04-23T00:00:00.000000000
9 2016-04-24T00:00:00.000000000
