In [1]:
import numpy as np
import pandas as pd

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
pd.options.display.max_columns = 100

### Load dataframe from disk

In [6]:
from os import listdir
from os.path import isfile, join

def get_feats(path):
    data = []
    cc = 0
    imb_cnt = 0
    for f in listdir(path):
        p = join(path, f)
        if isfile(p) and f.endswith('.csv'):
            if f.startswith("validation"):
                continue
                
            cc +=1
            d = pd.read_csv(p)
            
#             d["logdemand"] = np.log(d["demand"]+1e-5)
                
            
            d.drop(["Unnamed: 0"], axis=1, inplace=True)
            
            d.fillna(d.mean(), inplace=True)
            data.append(d)
        
        if cc%1000==0:
            d = pd.concat(data)
            d = reduce_mem_usage(d)
            
            data = [d]
            print(cc)
            
        if cc > 1:
            break
#     break
            
    return data

data = get_feats("./feat2/")
data = pd.concat(data)
data = reduce_mem_usage(data)
data.head()

Mem. usage decreased to  1.01 Mb (76.3% reduction)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,year,quarter,month,week,day,dayofweek,weekday,weekofyear,is_weekend,shifted_t1,shifted_t2,shifted_t3,shifted_t7,shifted_t28,rolling_mean_demand_w7,rolling_mean_demand_w28,rolling_mean_shifted_t7_w7,rolling_mean_shifted_t28_w7,rolling_mean_shifted_t7_w28,rolling_mean_shifted_t28_w28
0,0,0,0,0,0,0,1,train,2014-05-12,11415,30,4,1,1,0,1,1,2.240234,2014,2,5,20,12,0,0,20,0,0.53418,0.532227,0.532715,0.532715,0.532227,0.533691,0.534668,0.52832,0.535156,0.529297,0.537109
1,0,0,0,0,0,0,2,train,2014-05-13,11415,30,4,1,1,0,1,0,2.240234,2014,2,5,20,13,1,1,20,0,1.0,0.532227,0.532715,0.532715,0.532227,1.0,1.0,0.52832,0.535156,0.529297,0.537109
2,0,0,0,0,0,0,0,train,2014-05-14,11415,30,4,1,1,0,0,1,2.240234,2014,2,5,20,14,2,2,20,0,2.0,1.0,0.532715,0.532715,0.532227,1.5,1.5,0.52832,0.535156,0.529297,0.537109
3,0,0,0,0,0,0,0,train,2014-05-15,11415,30,4,1,1,0,1,1,2.240234,2014,2,5,20,15,3,3,20,0,0.0,2.0,1.0,0.532715,0.532227,1.0,1.0,0.52832,0.535156,0.529297,0.537109
4,0,0,0,0,0,0,0,train,2014-05-16,11415,30,4,1,1,0,0,0,2.240234,2014,2,5,20,16,4,4,20,0,0.0,0.0,2.0,0.532715,0.532227,0.75,0.75,0.52832,0.535156,0.529297,0.537109


In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

nacol = data.isna().sum()*100/len(data)
nacol[nacol>1]

Series([], dtype: float64)

In [6]:
# data.info()

In [8]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]

useless_cols = ["id", "date", "demand","d", "wm_yr_wk", "weekday", "is_weekend", "part", "day"]
train_cols = data.columns[~data.columns.isin(useless_cols)]
X_train = data[train_cols]
y_train = data["demand"]

# Train Model

In [9]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb


def train(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01)
    
#     params = {
#         'boosting_type': 'gbdt',
# #         "device_type": "gpu",
#         'metric': 'rmse',
#         'objective': 'poisson',
#         'n_jobs': -1,
#         'seed': 20,
#         'learning_rate': 0.1,
#         'alpha': 0.1,
#         'lambda': 0.1,
#         'bagging_fraction': 0.66,
#         'bagging_freq': 2, 
#         'colsample_bytree': 0.77}

    params = {
            "objective" : "poisson",
            "metric" :"rmse",
            "force_row_wise" : True,
            "learning_rate" : 0.075,
    #         "sub_feature" : 0.8,
            "sub_row" : 0.75,
            "bagging_freq" : 1,
            "lambda_l2" : 0.1,
    #         "nthread" : 4
            "metric": ["rmse"],
        'verbosity': 1,
        'num_iterations' : 1200,
        'num_leaves': 128,
        "min_data_in_leaf": 100,
    }

    train_set = lgb.Dataset(x_train, y_train)
    test_set = lgb.Dataset(x_test, y_test)

    model = lgb.train(params, train_set, early_stopping_rounds = 500, valid_sets = [test_set], verbose_eval = 100)
    # joblib.dump(model, 'lgbm_0.sav')

    val_pred = model.predict(x_test, num_iteration=model.best_iteration)
    val_score = np.sqrt(metrics.mean_squared_error(val_pred, y_test))
    return model, val_score

In [12]:
# del data
model, rmse = train(X_train, y_train)

In [10]:
import pickle 
dbfile = open('models/LightGBM_FF.pickle', 'wb') 
pickle.dump(model, dbfile)
dbfile.close() #Dont forget this 

In [11]:
import pickle
with open("models/LightGBM_FF.pickle", 'rb') as fin:
        model = pickle.load(fin)

In [12]:
model

<lightgbm.basic.Booster at 0x1f7990bbe48>

In [13]:
print("YEAH")

YEAH


In [14]:
def load_validation(path):

    d = pd.read_csv(path+"validation.csv")
    d.drop([
        "Unnamed: 0"
    ], axis=1, inplace=True)
    
    values = {'event_name_1': -1, 'event_type_1': -1, "event_name_2": -1, 'event_type_2': -1}
    d.fillna(value=values, inplace = True);
    for col in values.keys():
        d[col] = d[col]+1

    return reduce_mem_usage(d)



X_val = load_validation("./feat2/")
X_val.head()
X_val.drop(["demand", "part", "wm_yr_wk"], axis=1)

X_train = data
X_train['date'] = pd.to_datetime(X_train['date'])
X_val['date'] = pd.to_datetime(X_val['date'])

Mem. usage decreased to 40.71 Mb (76.9% reduction)


In [15]:
import math

def numerical_feature(df):
    for i in [1, 2, 3, 7, 28]:
        df[f"shifted_t{i}"] = df[["id","demand"]].groupby('id')["demand"].shift(i)

    for win, col in [(7, "demand"), (28, "demand"), (7, "shifted_t7"), (7, "shifted_t28"), (28, "shifted_t7"), (28, "shifted_t28")]:
#         df[f"rolling_mean_{col}_w{win}"] = df[["id", col]].groupby('id')[col].transform(lambda x : x.rolling(win).mean())
        df[f"rolling_mean_{col}_w{win}"] = df[["id", col]].groupby('id')[col].shift(1).rolling(win, min_periods=1).mean()

    return df



In [16]:
import time

import datetime
import dateutil.relativedelta

def predict(model, X_train, X_test, factor=1):
    lastrowdate = X_train.tail(1)["date"]
    lastmonth = lastrowdate - pd.DateOffset(months=2)
    lastmonthstr = str(lastmonth.values[0])
    
    
    X1 = X_train[X_train['date'] >= lastmonthstr] #last month
    
    
    DATES = X_test["date"].unique()
    NDATE = len(DATES)
    print("NDATE", NDATE)
    
    col = ["id"] + ["F{}".format(i) for i in range(1, NDATE+1)]

    
    
    
    itemId = X_train["dept_id"].unique()
    print(len(itemId))
    
    acc_o = []
    for iid in itemId:
        XX = X_test[X_test["dept_id"]==iid]
        
        ids = XX["id"].unique()
        oarr = np.zeros((len(ids), NDATE+1))
        o = pd.DataFrame(oarr, columns=col)
        
        o["id"] = XX[XX["date"]==DATES[0]]["id"].values
        
        X = X1[X1["dept_id"]==iid]
        
        
        for idx, date in enumerate(DATES):
            
            X2 = XX[XX["date"]==date]

            X = X.append(X2)
#             X.sort_values(by=['id', "date"], inplace=True)

#             print("num feats START")
#             print(len(X))
            Xfeat = numerical_feature(X)
    #         for i in ids:
    #             print(i)
    #             subx = Xfeat[Xfeat["id"]==i]
    #             subx.fillna(subx.mean(), inplace=True)
    #             Xfeat[Xfeat["id"]==i] = subx
            
#             print("num feats DONE")

#             print(f"============== {idx} ==========")
#             print(Xfeat[Xfeat["id"]==0][["demand", "shifted_t1", "shifted_t2"]].tail())
#             if idx==10:
#                 return None
            
            x = Xfeat[Xfeat["date"]==date]
            x = x.drop(["id", "demand", "part", "date", "wm_yr_wk"], axis=1)
            val_pred = model.predict(x, num_iteration=model.best_iteration)

#             print(idx, sum(val_pred))
            o[f"F{idx+1}"] = val_pred*factor
            X.loc[X["date"]==date, "demand"] = val_pred*factor
            
#             print((X[X["date"]==date]["demand"]).head())

            lastmonth = lastmonth + pd.DateOffset(days=1)
            X = X[X['date'] >= str(lastmonth.values[0])]
        acc_o.append(o)
        acc_o = [pd.concat(acc_o)]
        print(iid)
#         break
    
    acc_o = pd.concat(acc_o)
    return acc_o


In [17]:
# lastrowdate = X_train.tail(1)["date"]
# lastmonth = lastrowdate - pd.DateOffset(months=2)
# lastmonthstr = str(lastmonth.values[0])


# X1 = X_train[X_train['date'] >= lastmonthstr] #last month

# X1[X1["id"]==0][["demand", "shifted_t1", "shifted_t2"]].tail(5)

In [18]:
weights = [1, 1.028, 1.023, 1.018]

pp = []

X_train.sort_values(by=['id', "date"], inplace=True)
X_val.sort_values(by=['id', "date"], inplace=True)


for w in weights:
    print("======== w",w,"==========")
    pred = predict(model, X_train, X_val, factor=w)
    pp.append(pred)

NDATE 28
7
0
1
2
3
4
5
6
NDATE 28
7
0
1
2
3
4
5
6
NDATE 28
7
0
1
2
3
4
5
6
NDATE 28
7
0
1
2
3
4
5
6


In [21]:

avgpred = pd.DataFrame([])
avgpred["id"] = pp[0]["id"]
for i in range(1, 29):
    avgpred[f"F{i}"] = (pp[1][f"F{i}"]+pp[2][f"F{i}"]+pp[3][f"F{i}"])/3
    
    print(sum(avgpred[f"F{i}"]), sum(pp[0][f"F{i}"]))
    


avgpred.head()

57306.841596077335 56018.41798248039
59422.09379218819 56588.115363678444
102986.20005560313 95814.87379277141
160145.286673631 147220.9756617912
261888.5675908639 240323.34564011905
374697.6761839478 341419.7270649864
588109.7344405901 539878.564774468
749528.2526442701 684429.6747482939
839507.1394387898 767944.7753183242
778284.4764607965 708431.7786942757
731554.5607191272 659859.6077436751
858241.9455030877 766225.4317685403
1001813.6592890947 923735.897283136
1067560.4949683803 1006147.532100601
1027004.558691655 975798.1412344533
854439.5072514105 814035.3397512542
682398.1260181566 648719.2933545044
539109.3240026557 507191.30856385326
472234.0394470496 440609.94318159274
540600.8275006405 502935.5569399477
655512.0243950001 609396.3927301776
610868.1701067837 569213.3329188706
535724.4711824281 497640.2657489744
433193.1577116492 400190.6725109092
340978.57612316275 310963.1834773864
326409.388572011 301639.30957472575
411364.1497531371 382920.6898160761
551745.9627892597 5197

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,0,2.55511,4.694795,4.301314,12.239157,26.608289,25.505579,10.634571,1.692518,0.06335,0.087756,2.165215,3.986981,4.248863,6.310464,32.347146,98.767332,125.172375,64.676808,6.095535,0.517236,3.94564,78.660399,123.214106,48.129233,3.72592,0.1408,0.009885,0.006305
1,1,2.426454,4.551822,2.991314,8.292203,23.708189,34.643234,17.958775,3.420734,0.135335,0.061626,1.13954,3.789993,4.35019,4.03879,17.67916,55.465347,137.446891,87.385118,17.306038,1.351387,1.726818,44.939768,137.543007,109.310059,8.268706,0.635804,0.017624,0.010883
2,2,2.416024,5.893613,4.672186,6.988465,22.776355,47.699968,39.180341,7.515096,0.413509,0.075284,0.111053,2.507854,4.728799,4.062653,13.31571,56.216842,142.378828,95.840047,29.439441,3.391669,0.360062,17.003741,114.711617,141.642473,21.594106,2.470464,0.052932,0.007572
3,3,2.039567,5.769382,5.016031,5.130525,12.487033,23.093198,38.262714,16.501875,1.892313,0.221179,0.048501,1.513695,3.776965,3.781654,9.008519,31.235892,108.462939,126.135155,47.10826,4.394046,0.188538,4.428498,73.554758,77.001741,55.241614,5.336405,0.161176,0.011197
4,4,2.236148,5.405795,5.840262,4.823511,10.960357,23.96824,46.009392,29.716915,7.209576,0.49973,0.043428,0.277077,2.310177,3.951728,6.540332,22.204613,89.262024,127.470279,96.234945,12.186693,0.731755,0.914228,33.549405,89.87128,83.077999,10.106864,0.686644,0.014306


In [22]:
from sklearn import preprocessing

data_dir = "./data/"
sales = pd.read_csv(data_dir+'sales_train_validation.csv')


le = preprocessing.LabelEncoder()
le.fit_transform(sales["id"])

array([14370, 14380, 14390, ..., 14349, 14359, 14369])

In [23]:
for i in range(len(pp)):
    pp[i]["id"] = le.inverse_transform(pp[i]["id"])
    print(pp[i]["id"].head())
    
avgpred["id"] = le.inverse_transform(avgpred["id"])
avgpred.head()

0    FOODS_1_001_CA_1_validation
1    FOODS_1_001_CA_2_validation
2    FOODS_1_001_CA_3_validation
3    FOODS_1_001_CA_4_validation
4    FOODS_1_001_TX_1_validation
Name: id, dtype: object
0    FOODS_1_001_CA_1_validation
1    FOODS_1_001_CA_2_validation
2    FOODS_1_001_CA_3_validation
3    FOODS_1_001_CA_4_validation
4    FOODS_1_001_TX_1_validation
Name: id, dtype: object
0    FOODS_1_001_CA_1_validation
1    FOODS_1_001_CA_2_validation
2    FOODS_1_001_CA_3_validation
3    FOODS_1_001_CA_4_validation
4    FOODS_1_001_TX_1_validation
Name: id, dtype: object
0    FOODS_1_001_CA_1_validation
1    FOODS_1_001_CA_2_validation
2    FOODS_1_001_CA_3_validation
3    FOODS_1_001_CA_4_validation
4    FOODS_1_001_TX_1_validation
Name: id, dtype: object


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,2.55511,4.694795,4.301314,12.239157,26.608289,25.505579,10.634571,1.692518,0.06335,0.087756,2.165215,3.986981,4.248863,6.310464,32.347146,98.767332,125.172375,64.676808,6.095535,0.517236,3.94564,78.660399,123.214106,48.129233,3.72592,0.1408,0.009885,0.006305
1,FOODS_1_001_CA_2_validation,2.426454,4.551822,2.991314,8.292203,23.708189,34.643234,17.958775,3.420734,0.135335,0.061626,1.13954,3.789993,4.35019,4.03879,17.67916,55.465347,137.446891,87.385118,17.306038,1.351387,1.726818,44.939768,137.543007,109.310059,8.268706,0.635804,0.017624,0.010883
2,FOODS_1_001_CA_3_validation,2.416024,5.893613,4.672186,6.988465,22.776355,47.699968,39.180341,7.515096,0.413509,0.075284,0.111053,2.507854,4.728799,4.062653,13.31571,56.216842,142.378828,95.840047,29.439441,3.391669,0.360062,17.003741,114.711617,141.642473,21.594106,2.470464,0.052932,0.007572
3,FOODS_1_001_CA_4_validation,2.039567,5.769382,5.016031,5.130525,12.487033,23.093198,38.262714,16.501875,1.892313,0.221179,0.048501,1.513695,3.776965,3.781654,9.008519,31.235892,108.462939,126.135155,47.10826,4.394046,0.188538,4.428498,73.554758,77.001741,55.241614,5.336405,0.161176,0.011197
4,FOODS_1_001_TX_1_validation,2.236148,5.405795,5.840262,4.823511,10.960357,23.96824,46.009392,29.716915,7.209576,0.49973,0.043428,0.277077,2.310177,3.951728,6.540332,22.204613,89.262024,127.470279,96.234945,12.186693,0.731755,0.914228,33.549405,89.87128,83.077999,10.106864,0.686644,0.014306


In [24]:
submission = pd.read_csv(data_dir+'sample_submission.csv')
dfeval = submission[submission.id.str.endswith('evaluation')]

In [25]:
assert len(dfeval)==len(avgpred)

In [26]:

for i in range(len(pp)):
    df = pd.concat([pp[i], dfeval]) 

    df.sort_values("id", inplace = True)
    df.reset_index(drop=True, inplace = True)
    df.to_csv(f"submission_v5.{i}.csv")


df = pd.concat([avgpred, dfeval]) 
df.sort_values("id", inplace = True)
df.reset_index(drop=True, inplace = True)
df.to_csv("submission_v5.{}.csv".format(len(pp)))

In [35]:
thebest = pd.read_csv("submission_v4.csv")

rmse(df, thebest)

787.8267618566862

# Check check check

In [27]:

intX_train = X_train[X_train['date'] < '2016-04-15']
intX_val = X_train[X_train['date'] >= '2016-04-15']
pred = predict(model, intX_train, intX_val)
pred["id"] = le.inverse_transform(pred["id"])

pred.to_csv("internal_submission.csv")

NDATE 10
7
0
1
2
3
4
5
6


In [28]:
groudtruth = sales[["id", "d_1904", "d_1905", "d_1906", "d_1907", "d_1908", "d_1909", "d_1910", "d_1911", "d_1912", "d_1913"]]

def transform(df):
    newdf = df.melt(id_vars=["id"], var_name="d", value_name="sale")
    newdf.sort_values(by=['id', "d"], inplace=True)
    newdf.reset_index(inplace=True)
    return newdf

from sklearn.metrics import mean_squared_error

def rmse(df, gt):
    df = transform(df)
    gt = transform(gt)
    return mean_squared_error(df["sale"], gt["sale"])


In [32]:
sample = pd.read_csv("sample.csv")
sample.head()

Unnamed: 0,id,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,0


In [33]:
my_sample = pd.read_csv("internal_submission.csv")
my_sample.drop(["Unnamed: 0"], axis=1, inplace=True)
my_sample.columns = sample.columns
# my_sample.head()
rmse(my_sample, groudtruth)

1158.7093813381214

In [34]:
m = pd.DataFrame([], columns=sample.columns)
for col in sample.columns:
    if col=="id":
        continue
    m[col] = my_sample[col]
    
rmse(m, groudtruth)

1162.0487741133481