In [1]:
import numpy as np
import pandas as pd

# Load the raw input tables from ../data. `sell_prices` and `calendar` are read
# as globals by submission(); `sales` supplies the list of item ids.
calendar = pd.read_csv("../data/calendar.csv")
sales = pd.read_csv("../data/sales_train_validation.csv")
# The sample submission is loaded in a later cell, right before the final merge.
# sample_submission = pd.read_csv("../data/sample_submission.csv")
sell_prices = pd.read_csv("../data/sell_prices.csv")

In [2]:
# Distinct item ids found in the sales table.
items = sales["item_id"].unique()

In [99]:
# Base names (no directory, no extension) of the per-item CSVs read by get_dataframe().
filenames = ["expand_df_"+s for s in items]

def get_dataframe(filename):
    """Load one per-item table from ``dataframes/<filename>.csv``.

    Rows are ordered by ``store_id`` and then ``date``. The pre-sort row labels
    are kept as a regular column named ``index`` and the frame gets a fresh
    0..n-1 index.
    """
    csv_path = "dataframes/{}.csv".format(filename)
    ordered = pd.read_csv(csv_path).sort_values(by=['store_id', "date"])
    # reset_index() without drop=True moves the old labels into an "index" column.
    return ordered.reset_index()

In [100]:
import math

def get_label(df):
    """Return the training target: ``sale`` shifted back one row within each store.

    Row *t* receives the ``sale`` value of row *t+1* of the same ``store_id``;
    the last row of every store has no successor and is NaN.
    """
    return df.groupby('store_id')["sale"].shift(-1)


def get_features(df):
    """Build the model features and the ``pred`` target for one item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``store_id``, ``sale``, ``wm_yr_wk``, ``wday``, ``month``,
        ``year`` and ``event_name_1``. Rows are expected to be in chronological
        order within each store, because every lag is positional.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with a fresh 0..n-1 index and
        these extra columns:

        * ``shift_t1`` .. ``shift_t5`` - sales 1..5 rows earlier in the same store
        * ``rolling_mean`` - mean of up to 3 previous sales of the same store
        * ``rolling_decay_mean`` - (t1 + 0.9*t2 + 0.81*t3) / 3
        * ``prev_weekly_sale`` - the store's total sales in the previous
          ``wm_yr_wk`` present in ``df``
        * ``acc_sale_by_week`` - running total of sales inside the current week
        * ``holiday`` - True where ``event_name_1`` is not null
        * ``pred`` - next-row sale of the same store (see ``get_label``)

        ``wday``, ``month`` and ``year`` are converted to ``category`` dtype.
    """
    # Work on a copy: otherwise the lag columns below would be written into the
    # caller's frame while the columns added after the merge would not.
    df = df.copy()

    for i in range(1, 6):
        df[f"shift_t{i}"] = df.groupby('store_id')["sale"].shift(i)

    # Per-store rolling mean of the previous three sales. It must be grouped like
    # the other lags, or the first rows of a store would average the tail of the
    # store sorted just before it.
    df["rolling_mean"] = df.groupby('store_id')["sale"].transform(
        lambda s: s.shift(1).rolling(3, min_periods=1).mean()
    )

    df["rolling_decay_mean"] = df["shift_t1"].copy()
    for i in range(2, 4):
        df["rolling_decay_mean"] += math.pow(0.9, i-1) * df[f"shift_t{i}"]
    df["rolling_decay_mean"] = df["rolling_decay_mean"]/3.0

    # Weekly totals per store, lagged by one week, then broadcast back to days.
    weekly_sale = df.groupby(['store_id', "wm_yr_wk"])["sale"].sum().reset_index()
    weekly_sale["prev_weekly_sale"] = weekly_sale.groupby('store_id')["sale"].shift(1)
    weekly_sale = weekly_sale.drop(["sale"], axis=1)
    # A left merge keeps the row order of `df` but resets its index.
    df = pd.merge(df, weekly_sale, on=["store_id", "wm_yr_wk"], how="left")

    df["acc_sale_by_week"] = df.groupby(['store_id', "wm_yr_wk"])["sale"].cumsum()

    df["wday"] = df["wday"].astype('category')
    df["month"] = df["month"].astype('category')
    df["year"] = df["year"].astype('category')

    df["holiday"] = pd.notna(df["event_name_1"])

    # Computed after the merge so the label lines up with the new index.
    df["pred"] = get_label(df)
    return df


def categorial_encode(X, encoders=None):
    """Replace the ``store_id`` and ``state_id`` columns of ``X`` with encoded labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Modified in place; the two columns are overwritten.
    encoders : dict or None
        Maps column name -> fitted encoder. A column already present is
        transformed with its stored encoder; a missing column gets a new
        ``LabelEncoder`` that is fitted on ``X`` and stored in the dict.

    Returns
    -------
    tuple
        ``(X, encoders)`` - the same frame object and the (possibly extended)
        encoder dict.
    """
    encoders = {} if encoders is None else encoders

    for column in ('store_id', 'state_id'):
        if column in encoders:
            # Reuse the mapping learnt earlier so codes stay consistent.
            X[column] = encoders[column].transform(X[column])
            continue

        fitted = preprocessing.LabelEncoder()
        X[column] = fitted.fit_transform(X[column])
        encoders[column] = fitted

    return X, encoders

from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb

def clean_columns(df):
    """Return a copy of ``df`` without the identifier, calendar-key and event columns.

    Every listed column must be present; a missing one raises ``KeyError``.
    The input frame itself is not modified.
    """
    column_groups = (
        ["Unnamed: 0", "id", "item_id", "dept_id", "cat_id"],                   # identifiers
        ["index", "d", "date", "wm_yr_wk", "weekday"],                          # row / calendar keys
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2"],      # raw event text
    )
    trimmed = df
    for group in column_groups:
        trimmed = trimmed.drop(columns=group)
    return trimmed


In [101]:
def train(x_train, x_test, y_train, y_test):
    """Fit a LightGBM model with a Poisson objective and score it on the held-out split.

    Training runs for at most 2000 boosting rounds, stops early after 200 rounds
    without improvement and logs the RMSE every 100 rounds. Both the training
    set and the held-out set are passed as validation sets.

    Returns
    -------
    tuple
        ``(model, val_score)`` where ``val_score`` is the RMSE of the
        best-iteration predictions on ``x_test`` against ``y_test``.
    """
    lgb_params = {
        'metric': 'rmse',
        'objective': 'poisson',
        'n_jobs': -1,
        'seed': 20,
        'learning_rate': 0.1,
        # NOTE(review): 'alpha'/'lambda' look intended as L1/L2 regularisation;
        # confirm against the LightGBM parameter aliases that 'alpha' maps to that.
        'alpha': 0.1,
        'lambda': 0.1,
        'bagging_fraction': 0.66,
        'bagging_freq': 2,
        'colsample_bytree': 0.77,
    }

    fit_data = lgb.Dataset(x_train, y_train)
    holdout_data = lgb.Dataset(x_test, y_test)

    booster = lgb.train(
        lgb_params,
        fit_data,
        num_boost_round=2000,
        early_stopping_rounds=200,
        valid_sets=[fit_data, holdout_data],
        verbose_eval=100,
    )

    holdout_pred = booster.predict(x_test, num_iteration=booster.best_iteration)
    holdout_mse = metrics.mean_squared_error(holdout_pred, y_test)
    return booster, np.sqrt(holdout_mse)

In [102]:
# Build the training matrix for the first item only (a single-item dry run of
# the pipeline that the "run all" cell repeats).
df = get_dataframe(filenames[0])

X = get_features(df)

X = clean_columns(X)
# Rows without a price or without a next-day label cannot be used for training.
X = X.dropna(subset=['sell_price', "pred"])
Y = X["pred"]
X = X.drop(["pred"], axis=1)

# No encoders passed: fresh encoders are fitted here and reused by submission().
X, encoders = categorial_encode(X)

# NOTE(review): no random_state is given, so the split (and the reported RMSE)
# differs between runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
x_train.head()

Unnamed: 0,store_id,state_id,sale,wday,month,year,snap_CA,snap_TX,snap_WI,sell_price,shift_t1,shift_t2,shift_t3,shift_t4,shift_t5,rolling_mean,rolling_decay_mean,prev_weekly_sale,acc_sale_by_week,holiday
7178,3,0,0,5,1,2015,1,1,0,8.26,3.0,0.0,0.0,0.0,1.0,1.0,1.0,5.0,3,True
4855,2,0,2,1,11,2013,0,0,0,8.26,0.0,1.0,0.0,1.0,2.0,0.333333,0.3,6.0,2,False
10588,5,1,0,2,11,2013,0,0,0,8.26,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0,False
16767,8,2,2,1,1,2015,0,0,0,8.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2,False
12589,6,1,0,6,2,2014,0,1,0,8.38,1.0,3.0,0.0,1.0,0.0,1.333333,1.233333,3.0,5,False


In [103]:
# Fit on the split above; `rmse` is the score on the held-out 20%.
model, rmse = train(x_train, x_test, y_train, y_test)



Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 0.624462	valid_1's rmse: 0.75093
[200]	training's rmse: 0.569661	valid_1's rmse: 0.75834
Early stopping, best iteration is:
[42]	training's rmse: 0.664861	valid_1's rmse: 0.74692


In [143]:
# Show up to 500 columns when a frame is displayed.
pd.set_option('display.max_columns', 500)
def submission(df, model, encoders, horizon=28, first_day=1914, window=14):
    """Forecast ``horizon`` days ahead for every series (``id``) in ``df``, one day at a time.

    Each step rebuilds the features from the most recent ``window`` rows,
    predicts the next day's sale, then appends a row for that day. The new row
    carries that day's calendar fields, its sell price and the forecast as its
    ``sale``, so the next step's lag features see it.

    Reads the module-level ``sell_prices`` and ``calendar`` tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-item history as returned by ``get_dataframe`` (single ``item_id``).
    model
        Trained model exposing ``predict`` and ``best_iteration``.
    encoders : dict
        Fitted encoders as returned by ``categorial_encode`` during training.
    horizon : int, default 28
        Number of days to forecast.
    first_day : int, default 1914
        Number of the first forecast day; looked up in ``calendar`` as ``d_<n>``.
    window : int, default 14
        Trailing rows used to rebuild the features. 14 rows always contain the
        whole previous 7-day week, which ``prev_weekly_sale`` sums over.

    Returns
    -------
    dict
        Maps each ``id`` to a list of ``horizon`` forecasts.
    """
    ids = df["id"].unique()
    itemId = df["item_id"].values[0]
    print(itemId)
    item_prices = sell_prices[sell_prices["item_id"] == itemId]

    pred = {}
    for i in ids:
        print(i)
        # Only the trailing window is ever used, so keep just that much history.
        history = df[df["id"] == i].tail(window).copy()

        pred[i] = []
        storeId = history["store_id"].values[0]
        store_prices = item_prices[item_prices["store_id"] == storeId]

        for j in range(horizon):
            # Pass a copy so feature building never alters the running history.
            feats = get_features(history.copy())
            x = clean_columns(feats.tail(1).copy())
            # get_features adds the training label; it is not a model input.
            x = x.drop(columns=["pred"], errors="ignore")

            x, _ = categorial_encode(x, encoders)

            val_pred = model.predict(x, num_iteration=model.best_iteration)
            pred[i].append(val_pred[0])

            # Assemble the row for the day that was just forecast.
            day = calendar[calendar["d"] == "d_{}".format(first_day + j)]
            week_prices = store_prices[store_prices["wm_yr_wk"] == day["wm_yr_wk"].values[0]]
            if len(week_prices) > 0:
                sell_price = week_prices["sell_price"].values[0]
            else:
                # No listed price for that week: keep the original fallback of 0.
                sell_price = 0

            newrow = history.tail(1).copy()
            newrow["sell_price"] = sell_price
            for c in day.columns:
                newrow[c] = day[c].values[0]
            # Feed the forecast back; without this every later step would see
            # the last observed sale repeated for all forecast days.
            newrow["sale"] = val_pred[0]

            # DataFrame.append no longer exists in pandas 2.x; concat is equivalent.
            history = pd.concat([history, newrow], ignore_index=True).tail(window)
    return pred
    

# Forecast every store series of the item loaded above; prints the item id and each series id.
pred = submission(df, model, encoders)


HOBBIES_1_001
HOBBIES_1_001_CA_1_validation
HOBBIES_1_001_CA_2_validation
HOBBIES_1_001_CA_3_validation
HOBBIES_1_001_CA_4_validation
HOBBIES_1_001_TX_1_validation
HOBBIES_1_001_TX_2_validation
HOBBIES_1_001_TX_3_validation
HOBBIES_1_001_WI_1_validation
HOBBIES_1_001_WI_2_validation
HOBBIES_1_001_WI_3_validation


In [151]:
def save_submission(filename, pred):
    """Write forecasts to ``filename`` as CSV with exactly the columns ``id, F1 .. F28``.

    Parameters
    ----------
    filename : str
        Destination path; its directory must already exist.
    pred : dict
        Maps a series id to a sequence of at least 28 forecasts. Only the first
        28 values are written.

    Raises
    ------
    IndexError
        If a series has fewer than 28 forecasts.
    """
    rows = [
        {"id": series_id, **{"F" + str(step): forecasts[step - 1] for step in range(1, 29)}}
        for series_id, forecasts in pred.items()
    ]
    # index=False: the positional index is not data and would otherwise show up
    # as an extra unnamed column when the file is read back and merged.
    pd.DataFrame(rows).to_csv(filename, index=False)
# Persist the forecasts of the first item; the "submission/" directory must already exist.
save_submission("submission/submission_"+items[0]+".csv", pred)
# filenames[0]


# Run all files

In [160]:
# Full pipeline per item: load -> features -> train -> forecast -> save one CSV per item.
items = sales["item_id"].unique()
filenames = ["expand_df_"+s for s in items]


# NOTE(review): the loop iterates over a single hard-coded item while the progress
# line prints len(items); presumably left over from re-running one item - confirm
# whether this should iterate over `items`.
for idx, item in enumerate(["FOODS_3_278"]):
    
    print(idx+1,"/",len(items),":", item)
    filename = "expand_df_"+item
    df = get_dataframe(filename)
    X = get_features(df)

    X = clean_columns(X)
    # Rows without a price or without a next-day label cannot be used for training.
    X = X.dropna(subset=['sell_price', "pred"])
    Y = X["pred"]
    X = X.drop(["pred"], axis=1)

    # Fresh encoders per item; the same ones are handed to submission() below.
    X, encoders = categorial_encode(X)

    # NOTE(review): no random_state is given, so the split differs between runs.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    
    model, rmse = train(x_train, x_test, y_train, y_test)
    pred = submission(df, model, encoders)
    save_submission("submission/submission_"+item+".csv", pred)
    
print("DONE")

1 / 3049 : FOODS_3_278




Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.51187	valid_1's rmse: 1.97442
[200]	training's rmse: 1.23588	valid_1's rmse: 1.9882
Early stopping, best iteration is:
[81]	training's rmse: 1.58478	valid_1's rmse: 1.97199
FOODS_3_278
FOODS_3_278_CA_1_validation
FOODS_3_278_CA_2_validation
FOODS_3_278_CA_3_validation
FOODS_3_278_CA_4_validation
FOODS_3_278_TX_1_validation
FOODS_3_278_TX_2_validation
FOODS_3_278_TX_3_validation
FOODS_3_278_WI_1_validation
FOODS_3_278_WI_2_validation
FOODS_3_278_WI_3_validation
DONE


In [None]:
# Unexecuted leftover cell; it only prints a completion marker.
print("DONE")

In [3]:
# Collect the per-item forecast files into one frame.
items = sales["item_id"].unique()
data = []
for idx, item in enumerate(items):
    try:
        d = pd.read_csv("submission/submission_{}.csv".format(item))
    except Exception as e:
        # Report the unreadable file and skip it. Appending here would re-add
        # the previous item's frame (or fail on the very first item).
        print(e, item)
    else:
        data.append(d)
    if idx%100==0:
        print(idx)

df = pd.concat(data)
# df.to_csv("submission/submission.csv")
print("DONE")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
DONE


In [4]:
# Reference file; used below for its "evaluation" rows and to check the final id set.
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [8]:
# sample_submission[""]
# Keep the sample rows whose id ends with "evaluation"; they are appended unchanged below.
dfeval = sample_submission[sample_submission.id.str.endswith('evaluation')]

In [9]:
# Append the evaluation rows to the merged forecasts.
# NOTE(review): rebinding `df` to itself makes this cell non-idempotent - running
# it twice appends the evaluation rows twice.
df = pd.concat([df, dfeval]) 

In [11]:
# Sanity check: the merged frame should have as many rows as the sample submission.
len(df), len(sample_submission)

(60980, 60980)

In [17]:
# Sanity check: the merged frame covers exactly the ids of the sample submission
# (set comparison, so ordering and duplicates are not checked here).
a = (df["id"].values)
b = (sample_submission["id"].values)

set(a)==set(b)

True

In [18]:
# Write only the columns the sample submission defines, in its order, and without
# the positional index: otherwise the index (and any "Unnamed: 0" column picked
# up from the per-item files) ends up as extra columns in the submission.
df[sample_submission.columns].to_csv("submission/submission.csv", index=False)