In [1]:
import numpy as np
import pandas as pd

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
pd.options.display.max_columns = 100

In [None]:
import pickle
with open("models/LightGBM_KAGLE.pickle", 'rb') as fin:
        model = pickle.load(fin)

In [5]:
model

<lightgbm.basic.Booster at 0x16b520cd9c8>

In [6]:
print("YEAH")

YEAH


In [7]:
def load_validation(path):

    d = pd.read_csv(path+"validation.csv")
    d.drop([
        "Unnamed: 0"
    ], axis=1, inplace=True)
    
    return reduce_mem_usage(d)



X_val = load_validation("./feat2/")
X_val.head()

Mem. usage decreased to 42.34 Mb (75.9% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,year,quarter,month,week,day,dayofweek,weekday,weekofyear,is_weekend
0,FOODS_1_001_CA_1_validation,0,0,0,0,0,0,validate,2016-04-25,11613,30,4,1,1,0,0,0,2.240234,2016,2,4,17,25,0,0,17,0
1,FOODS_1_001_CA_1_validation,0,0,0,0,0,0,validate,2016-04-26,11613,30,4,1,1,0,0,0,2.240234,2016,2,4,17,26,1,1,17,0
2,FOODS_1_001_CA_1_validation,0,0,0,0,0,0,validate,2016-04-27,11613,30,4,1,1,0,0,0,2.240234,2016,2,4,17,27,2,2,17,0
3,FOODS_1_001_CA_1_validation,0,0,0,0,0,0,validate,2016-04-28,11613,30,4,1,1,0,0,0,2.240234,2016,2,4,17,28,3,3,17,0
4,FOODS_1_001_CA_1_validation,0,0,0,0,0,0,validate,2016-04-29,11613,30,4,1,1,0,0,0,2.240234,2016,2,4,17,29,4,4,17,0


In [8]:
# get only the last 28 days

from os import listdir
from os.path import isfile, join

def get_feats(path):
    data = []
    cc = 0
    imb_cnt = 0
    for f in listdir(path):
        p = join(path, f)
        if isfile(p) and f.endswith('.csv'):
            if f.startswith("validation"):
                continue
                
            cc +=1
            d = pd.read_csv(p)
            
            lastrowdate = pd.to_datetime(d.tail(1)["date"])
            lastmonth = lastrowdate - pd.DateOffset(days=86)
            lastmonthstr = str(lastmonth.values[0])


            d = d[d['date'] >= lastmonthstr]
#             d["logdemand"] = np.log(d["demand"]+1e-5)
                
            
            d.drop(["Unnamed: 0"], axis=1, inplace=True)
            
            d.fillna(d.mean(), inplace=True)
            data.append(d)
        
        if cc%1000==0:
            d = pd.concat(data)
            d = reduce_mem_usage(d)
            
            data = [d]
            print(cc)
#     break
    data = pd.concat(data)
    return reduce_mem_usage(data)

X_train = get_feats("./feat2/")

Mem. usage decreased to 59.87 Mb (73.2% reduction)
1000
Mem. usage decreased to 119.71 Mb (73.2% reduction)
2000
Mem. usage decreased to 179.58 Mb (73.2% reduction)
3000
Mem. usage decreased to 182.51 Mb (73.2% reduction)


In [11]:
# from ipynb.fs.full.FeatureEngineering import numerical_feature
import math

def numerical_feature(df):
    for i in [7, 28]:
        df[f"shifted_t{i}"] = df[["id","demand"]].groupby('id')["demand"].shift(i)

    for win, col in [(7, "shifted_t7"), (7, "shifted_t28"), (28, "shifted_t7"), (28, "shifted_t28")]:
        df[f"rolling_mean_{col}_w{win}"] = df[["id", col]].groupby('id')[col].shift(1).rolling(win, min_periods=1).mean()

    return df



In [12]:
orgX_val, orgX_train = X_val, X_train

In [13]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]

useless_cols = ["id", "date", "demand","d", "wm_yr_wk", "is_weekend", "part"]
train_cols = orgX_train.columns[~orgX_train.columns.isin(useless_cols)]

In [14]:
train_cols

Index(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1',
       'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX',
       'snap_WI', 'sell_price', 'year', 'quarter', 'month', 'week', 'day',
       'dayofweek', 'weekday', 'weekofyear', 'shifted_t7', 'shifted_t28',
       'rolling_mean_shifted_t7_w7', 'rolling_mean_shifted_t28_w7',
       'rolling_mean_shifted_t7_w28', 'rolling_mean_shifted_t28_w28'],
      dtype='object')

In [46]:
import time

import datetime
import dateutil.relativedelta

def predict(model, X_train, X_test, factor=1):
    DATES = X_test["date"].unique()
    NDATE = len(DATES)
    print("NDATE", NDATE)
    
    col = ["id"] + ["F{}".format(i) for i in range(1, NDATE+1)]
    itemId = X_train["dept_id"].unique()
    print("#CHUNK", len(itemId))
    
    acc_o = []
    itemId = sorted(itemId)
    for iid in itemId:
        test = X_test[X_test["dept_id"]==iid]
        
        ids = test["id"].unique()
        oarr = np.zeros((len(ids), NDATE+1))
        o = pd.DataFrame(oarr, columns=col)
        
        o["id"] = test[test["date"]==DATES[0]]["id"].values
        
        train = X_train[X_train["dept_id"]==iid]
        
        ## XX=test, X=train
        lastmonth = pd.to_datetime(train.head(1)["date"])
        for idx, date in enumerate(DATES):
            
            newrow = test[test["date"]==date]
            train = train.append(newrow)
            
            train.sort_values(by=['id', "date"], inplace=True)
# #             print("num feats START")
            feat = numerical_feature(train)
            
# #             print("num feats DONE")

#             print(f"============== {idx} ==========")
#             p = feat[feat["id"]=="FOODS_1_001_CA_1_validation"]
# #             print(p)
#             print(p.tail(15)[["date", "demand", "shifted_t7"]])
#             if idx==10:
#                 return None
            
            x = feat.loc[feat["date"] == date , train_cols]
            val_pred = model.predict(x)

            
            o[f"F{idx+1}"] = val_pred*factor
            
            
            train.loc[train["date"]==date, "demand"] = val_pred*factor
            
            
            lastmonth = lastmonth + pd.DateOffset(days=1)
            train = train[train['date'] >= str(lastmonth.values[0])]
        acc_o.append(o)
        acc_o = [pd.concat(acc_o)]
        print(iid)
#         break
    
    acc_o = pd.concat(acc_o)
    return acc_o

In [47]:
pp = []
weights = [1]
# weights = [1, 1.028, 1.023, 1.018]
for w in weights:
    print("======== w",w,"==========")
    pred = predict(model, X_train, X_val, factor=w)
    pp.append(pred)


NDATE 28
#CHUNK 7
0
1
2
3
4
5
6


In [52]:

# avgpred = pd.DataFrame([])
# avgpred["id"] = pp[0]["id"]
# for i in range(1, 29):
#     avgpred[f"F{i}"] = (pp[1][f"F{i}"]+pp[2][f"F{i}"]+pp[3][f"F{i}"])/3
    
#     print(sum(avgpred[f"F{i}"]), sum(pp[0][f"F{i}"]))
    


# avgpred.head()
avgpred = pp[0]

In [51]:
data_dir = "./data/"
submission = pd.read_csv(data_dir+'sample_submission.csv')
dfeval = submission[submission.id.str.endswith('evaluation')]

In [53]:

assert len(dfeval)==len(avgpred)

In [54]:

for i in range(len(pp)):
    df = pd.concat([pp[i], dfeval]) 

    df.sort_values("id", inplace = True)
    df.reset_index(drop=True, inplace = True)
    df.to_csv(f"submission_v5.{i}.csv")


# df = pd.concat([avgpred, dfeval]) 
# df.sort_values("id", inplace = True)
# df.reset_index(drop=True, inplace = True)
# df.to_csv("submission_v5.{}.csv".format(len(pp)))

In [57]:
# thebest = pd.read_csv("submission_v4.csv")

# rmse(df, thebest)

In [59]:
baseline = pd.read_csv("./submissions/submission_v4.csv")
for i in range(1, 29):
    print(sum(avgpred[f"F{i}"]), sum(baseline[f"F{i}"]))

0.06932133752613022 79809.57150258459
0.0701459347842684 73032.06106016925
0.0712587253728682 70087.56825037877
0.06018097670487221 69489.65689680453
0.06581197320737579 81169.4594341078
0.0642935445223066 90499.71050531889
0.08951572716239548 105721.70376331158
0.06803747079452284 92999.241833135
0.0693611211494169 89539.03343881461
0.07213619478194534 78148.96961979584
0.07157368974215815 80379.09066742251
0.08398764205141548 90184.45242546794
0.07813942093314054 97434.29331885652
0.0803224173591656 85947.24714883062
0.06711658709123001 92232.92965069183
0.06625451832861123 79978.23621193491
0.06828484948887935 83346.51222238268
0.06982512980786693 83096.70544005635
0.08205720848306483 89388.88314819924
0.0787774422286739 103181.92058601619
0.08127201590182043 109341.21683304248
0.07415458923251537 81800.49138562103
0.07147277493236398 73577.84056780583
0.07433885925833869 71358.02545632045
0.07560560276554985 70808.65834056292
0.08636684235861886 80823.42410935761
0.0841503975938885

# Check check check

In [27]:

intX_train = X_train[X_train['date'] < '2016-04-15']
intX_val = X_train[X_train['date'] >= '2016-04-15']
pred = predict(model, intX_train, intX_val)
pred["id"] = le.inverse_transform(pred["id"])

pred.to_csv("internal_submission.csv")

NDATE 10
7
0
1
2
3
4
5
6


In [28]:
groudtruth = sales[["id", "d_1904", "d_1905", "d_1906", "d_1907", "d_1908", "d_1909", "d_1910", "d_1911", "d_1912", "d_1913"]]

def transform(df):
    newdf = df.melt(id_vars=["id"], var_name="d", value_name="sale")
    newdf.sort_values(by=['id', "d"], inplace=True)
    newdf.reset_index(inplace=True)
    return newdf

from sklearn.metrics import mean_squared_error

def rmse(df, gt):
    df = transform(df)
    gt = transform(gt)
    return mean_squared_error(df["sale"], gt["sale"])


In [32]:
sample = pd.read_csv("sample.csv")
sample.head()

Unnamed: 0,id,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,0


In [33]:
my_sample = pd.read_csv("internal_submission.csv")
my_sample.drop(["Unnamed: 0"], axis=1, inplace=True)
my_sample.columns = sample.columns
# my_sample.head()
rmse(my_sample, groudtruth)

1158.7093813381214

In [34]:
m = pd.DataFrame([], columns=sample.columns)
for col in sample.columns:
    if col=="id":
        continue
    m[col] = my_sample[col]
    
rmse(m, groudtruth)

1162.0487741133481