# Set up LightGBM on GPU

In [1]:
# !rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
# !git clone --recursive https://github.com/Microsoft/LightGBM


In [None]:
# %%bash
# cd LightGBM
# mkdir build ; cd build
# cmake ..
# make -j4

In [2]:
# !apt-get install -y -qq libboost-all-dev

In [3]:
# %%bash
# cd LightGBM
# rm -r build
# mkdir build
# cd build
# cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# make -j$(nproc)


In [4]:
# !cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# !mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# !rm -r LightGBM

# Running Model

In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are narrowed to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are narrowed to float16 or float32 when their range allows and are
    otherwise stored as float64. Columns whose dtype is not listed in
    ``numerics`` are left untouched.

    Note: the float narrowing is decided on range only, so values stored as
    float16 are rounded to that type's limited precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the resulting size and the percentage saved.

    Returns:
        The same ``df`` object, with its columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:  # handle one column at a time
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
pd.options.display.max_columns = 50

In [4]:
VERSION=15

# Load datasets

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
# Root folder of the M5 competition files on the mounted Google Drive.
data_dir = "/content/drive/My Drive/M5/Data/"

# Calendar and price tables are small enough to downcast immediately.
calendar = pd.read_csv(data_dir+'calendar.csv')
calendar = reduce_mem_usage(calendar)
print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))

sell_prices = pd.read_csv(data_dir+'sell_prices.csv')
sell_prices = reduce_mem_usage(sell_prices)
print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))

# Wide sales tables: one row per series, one column per day.
sales_validate = pd.read_csv(data_dir+'sales_train_validation.csv')
print('Sales train validation has {} rows and {} columns'.format(sales_validate.shape[0], sales_validate.shape[1]))

sales_eval = pd.read_csv(data_dir+'sales_train_evaluation.csv')
# Label this line as the evaluation file (it previously repeated "validation").
print('Sales train evaluation has {} rows and {} columns'.format(sales_eval.shape[0], sales_eval.shape[1]))

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Sales train validation has 30490 rows and 1919 columns
Sales train validation has 30490 rows and 1947 columns


In [71]:
sales_validate.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,3,1,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,0,1,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4


In [72]:
sales_eval.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [73]:
sales = sales_eval

In [74]:
idCols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
product = sales[idCols].drop_duplicates()

In [75]:
import warnings
warnings.filterwarnings('ignore')

submission = pd.read_csv(data_dir+'sample_submission.csv')
# Take explicit copies so the column/id rewrites below act on independent
# frames instead of on slices of `submission`.
validate_submission = submission[submission.id.str.endswith('validation')].copy()
eval_submission = submission[submission.id.str.endswith('evaluation')].copy()

# Rename F1..F28 to the day labels they stand for (d_1914..d_1941).
newcolumns = ["id"] + ["d_{}".format(i) for i in range(1914, 1914+28)]
validate_submission.columns = newcolumns
# `product` is keyed by the "_evaluation" ids, so align the ids before merging.
validate_submission['id'] = validate_submission['id'].str.replace('_validation', "_evaluation", regex=False)
validate_submission = validate_submission.merge(product, how = 'left', on = 'id')

# Same for the evaluation half: F1..F28 -> d_1942..d_1969.
newcolumns = ["id"] + ["d_{}".format(i) for i in range(1942, 1942+28)]
eval_submission.columns = newcolumns
eval_submission = eval_submission.merge(product, how = 'left', on = 'id')


In [76]:
# product.head()

In [77]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [78]:
idCols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Keep only the most recent DAYS daily columns, ending at LAST_DAY
# (365*4 = 1460 days, i.e. roughly the last four years).
DAYS = int(365*4)
LAST_DAY = 1941
dayCols = ["d_{}".format(day) for day in range(LAST_DAY - DAYS + 1, LAST_DAY + 1)]
print(len(dayCols), dayCols[0])

sales = sales[idCols + dayCols]
print(sales.shape)
sales.head()

1460 d_482
(30490, 1466)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_482,d_483,d_484,d_485,d_486,d_487,d_488,d_489,d_490,d_491,d_492,d_493,d_494,d_495,d_496,d_497,d_498,d_499,d_500,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,2,3,1,0,0,1,1,1,3,1,0,1,1,0,2,0,5,1,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,2,5,3,2,2,3,1,3,1,1,2,0,1,2,2,2,5,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [79]:
def melted(df, name=""):
    """Reshape a wide frame into long format and shrink its dtypes.

    Every column not listed in the module-level ``idCols`` becomes one row
    per original row, with the former column label in ``day`` and its value
    in ``demand``.

    Args:
        df: Wide DataFrame containing the ``idCols`` columns.
        name: Label used only in the printed size summary.

    Returns:
        The long-format DataFrame after ``reduce_mem_usage``.
    """
    long_df = df.melt(id_vars=idCols, var_name='day', value_name='demand')
    n_rows, n_cols = long_df.shape
    print('{}: {} rows and {} columns'.format(name, n_rows, n_cols))
    return reduce_mem_usage(long_df)

# Melt each wide table to long format and tag its rows with the part they
# belong to, so they can be separated again after being concatenated.
melted_sales = melted(sales)
melted_sales["part"] = "train"
# Placeholder rows for the two 28-day forecast horizons.
melted_validate = melted(validate_submission)
melted_validate["part"] = "validate"
melted_eval = melted(eval_submission)
melted_eval["part"] = "evaluate"

: 44515400 rows and 8 columns
Mem. usage decreased to 2462.29 Mb (9.4% reduction)
: 853720 rows and 8 columns
Mem. usage decreased to 46.41 Mb (10.9% reduction)
: 853720 rows and 8 columns
Mem. usage decreased to 46.41 Mb (10.9% reduction)


In [80]:
melted_eval.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,evaluate
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,evaluate
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,evaluate
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,evaluate
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,evaluate


In [81]:
# data = melted_sales
data = pd.concat([melted_sales, melted_validate, melted_eval], axis = 0)
# data = pd.concat([melted_sales, melted_validate], axis = 0)
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_482,0,train
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_482,0,train
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_482,0,train
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_482,1,train
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_482,0,train


In [82]:
import gc
del melted_sales, melted_validate
del submission, validate_submission, eval_submission, product
del sales,
gc.collect()

692

In [83]:
# Merge the long sales table with the calendar and the sell prices.

# NOTE: the in-place drop mutates `calendar`, so re-running this cell without
# reloading `calendar` raises a KeyError for the already-removed columns.
calendar.drop(['weekday', 'wday', 'month', 'year'], inplace = True, axis = 1)
# Attach the calendar row whose `d` label matches each sales row's `day`.
data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
# The two day-label columns are no longer needed after the join.
data.drop(['d', 'day'], inplace = True, axis = 1)

print('Our dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

# get the sell price data (this feature should be very important)
# version 12: left join sell_prices, so rows without a matching price are
# kept and get a NaN sell_price.
data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))


Our dataset to train has 46222840 rows and 17 columns
Our final dataset to train has 46222840 rows and 18 columns


In [84]:
del calendar, sell_prices
gc.collect()

0

In [85]:
na_sell_price = data["sell_price"].isnull().sum()/len(data) #= df[['a','b']].fillna(value=0)
print("Sell price NA: ", na_sell_price*100, "%")

data["sell_price"] = data["sell_price"].fillna(value=0)

Sell price NA:  11.570641267390753 %


# Feature Engineering

In [86]:
from sklearn import preprocessing
def encode_categorical(dt, cols):
    """Replace each listed column with zero-based int16 category codes.

    Each column is converted to pandas ``category`` codes and then shifted so
    that the smallest code is 0. pandas assigns code -1 to missing values, so
    a column containing NaN ends up with NaN encoded as 0 and every real
    category moved up by one.

    Args:
        dt: DataFrame to encode; it is modified in place.
        cols: Names of the columns to encode.

    Returns:
        The same ``dt`` object.
    """
    for name in cols:
        codes = dt[name].astype("category").cat.codes.astype("int16")
        dt[name] = codes - codes.min()
    return dt


# Encode the hierarchy identifiers as integer codes, then downcast.
data = encode_categorical(data, ["cat_id", "dept_id", "item_id", "state_id", "store_id"]).pipe(reduce_mem_usage)
# Days without an event get the explicit label "normal" so that they are
# encoded as a regular category instead of as a missing value.
values = {'event_name_1': "normal", 'event_type_1': "normal", "event_name_2": "normal", 'event_type_2': "normal"}
data.fillna(value=values, inplace = True);

data = encode_categorical(data, ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]).pipe(reduce_mem_usage)

Mem. usage decreased to 3482.44 Mb (4.8% reduction)
Mem. usage decreased to 2248.16 Mb (7.3% reduction)


In [87]:
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,0.0
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,3.970703
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,0.0
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1,train,2012-05-24,11217,30,4,3,2,0,0,0,4.339844
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,2.980469


In [88]:
def datetime_features(df):
    """Return a copy of ``df`` with calendar features derived from ``date``.

    The ``date`` column is parsed to datetime and the following integer
    columns are added: ``year`` (int16) and ``quarter``, ``month``, ``week``,
    ``day``, ``dayofweek``, ``weekday``, ``weekofyear``, ``is_weekend``
    (all int8). ``week`` and ``weekofyear`` both hold the ISO week number;
    ``is_weekend`` is 1 when ``dayofweek`` is 5 or 6.

    Args:
        df: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    # `Series.dt.week` / `.dt.weekofyear` were deprecated in pandas 1.1 and
    # removed in 2.0; `isocalendar().week` yields the same ISO week number.
    # Fall back to the old accessor on pandas versions that predate it.
    if hasattr(dates, "isocalendar"):
        iso_week = dates.isocalendar().week
    else:
        iso_week = dates.week

    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "weekday",
        "weekofyear",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        values = iso_week if attr in ("week", "weekofyear") else getattr(dates, attr)
        df[attr] = values.astype(dtype)
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)

    return df

data = datetime_features(data)
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,part,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,year,quarter,month,week,day,dayofweek,weekday,weekofyear,is_weekend
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,0.0,2012,2,5,21,24,3,3,21,0
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,3.970703,2012,2,5,21,24,3,3,21,0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,0.0,2012,2,5,21,24,3,3,21,0
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,1,train,2012-05-24,11217,30,4,3,2,0,0,0,4.339844,2012,2,5,21,24,3,3,21,0
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,0,train,2012-05-24,11217,30,4,3,2,0,0,0,2.980469,2012,2,5,21,24,3,3,21,0


In [89]:
# Order rows by series and then by date so that the per-id shifts used for
# the lag features operate on chronologically ordered rows.
data.sort_values(by=['id', "date"], inplace=True)

# Split the combined frame back into its three parts using the `part` tag.
X_train = data[data["part"]=="train"]
X_val = data[data["part"]=="validate"]
X_eval = data[data["part"]=="evaluate"]

print(len(X_train), len(X_val), len(X_eval))
# Release the combined frame; only the three parts are used from here on.
del data; gc.collect()

44515400 853720 853720


0

In [90]:
import math

def numerical_feature(df):
    """Add per-series lag and rolling-mean demand features to ``df`` in place.

    For every ``id`` the function adds:

    * ``shifted_t7`` / ``shifted_t28``: ``demand`` lagged by 7 and 28 rows.
    * ``rolling_mean_<lag col>_w<win>``: mean of the lag column over the
      previous ``win`` rows (the lag column is shifted by one more row first),
      for windows of 7 and 28, using whatever rows are available
      (``min_periods=1``).

    Both the shift and the rolling window are evaluated inside each ``id``
    group, so a window never includes rows of a different series. Rows are
    expected to be ordered by date within each ``id``.

    Args:
        df: DataFrame with ``id`` and ``demand`` columns; modified in place.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    for lag in [7, 28]:
        df[f"shifted_t{lag}"] = df[["id", "demand"]].groupby('id')["demand"].shift(lag)

    for win, col in [(7, "shifted_t7"), (7, "shifted_t28"), (28, "shifted_t7"), (28, "shifted_t28")]:
        # Do the rolling inside the group: calling .rolling() on the result of
        # a grouped shift would slide the window over the flat series and mix
        # the end of one id with the start of the next.
        df[f"rolling_mean_{col}_w{win}"] = (
            df[["id", col]]
            .groupby('id')[col]
            .transform(lambda s, w=win: s.shift(1).rolling(w, min_periods=1).mean())
        )
    return df



In [91]:
import pickle 
# Persist the three parts so later cells can reload them. The `with` blocks
# close each file even if `pickle.dump` raises part-way through.
with open(data_dir+'../modelrecur/X_train.pkl', 'wb') as dbfile:
    pickle.dump(X_train, dbfile)

with open(data_dir+'../modelrecur/X_val.pkl', 'wb') as dbfile:
    pickle.dump(X_val, dbfile)

with open(data_dir+'../modelrecur/X_eval.pkl', 'wb') as dbfile:
    pickle.dump(X_eval, dbfile)

In [28]:
%%time

# Add the lag / rolling-mean features to the training rows.
X_train = numerical_feature(X_train)
# Drop every row that has a NaN in any column; this includes the earliest
# rows of each series, where the shifted features are not yet defined.
X_train.dropna(inplace = True)
gc.collect()

CPU times: user 48.6 s, sys: 3.71 s, total: 52.3 s
Wall time: 52.3 s


In [42]:
X_train.shape, X_train.columns

((44515400, 27),
 Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'demand',
        'part', 'date', 'wm_yr_wk', 'event_name_1', 'event_type_1',
        'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
        'sell_price', 'year', 'quarter', 'month', 'week', 'day', 'dayofweek',
        'weekday', 'weekofyear', 'is_weekend'],
       dtype='object'))

# Train Model

In [43]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Identifier, target and bookkeeping columns that must not be model inputs.
useless_cols = ["id", "part", "date", "demand","d", "wm_yr_wk", "weekday"]
train_cols = X_train.columns[~X_train.columns.isin(useless_cols)]

# Separate the target from the feature matrix. The later cells use (and then
# delete) `y_train`, so it has to be defined here for a fresh-kernel run.
# The guard keeps the cell re-runnable once `demand` has been split off.
if "demand" in X_train.columns:
    y_train = X_train["demand"]
    X_train = X_train[train_cols]

train_cols


Index(['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1',
       'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX',
       'snap_WI', 'sell_price', 'year', 'quarter', 'month', 'week', 'day',
       'dayofweek', 'weekofyear', 'is_weekend'],
      dtype='object')

In [31]:
%%time
from sklearn.model_selection import train_test_split
# Seed NumPy's global generator before splitting; no random_state is passed
# to train_test_split below.
np.random.seed(777)

# Hold out 5% of the rows (chosen at random) as the evaluation set for training.
X, x_test, Y, y_test = train_test_split(X_train, y_train, test_size=0.05)

CPU times: user 21.7 s, sys: 58.4 ms, total: 21.8 s
Wall time: 21.8 s


In [33]:
# dir()

In [34]:
# version 8.0: use poisson
# params = {
#         "objective" : "poisson",
#         "metric" :"rmse",
#         "force_row_wise" : True,
#         "learning_rate" : 0.075,
# #         "sub_feature" : 0.8,
#         "sub_row" : 0.75,
#         "bagging_freq" : 1,
#         "lambda_l2" : 0.1,
# #         "nthread" : 4
#         "metric": ["rmse"],
#     'verbosity': 1,
#     'num_iterations' : 3000,
#     'num_leaves': 128,
#     "min_data_in_leaf": 100,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0
# }

# version 11.0: use tweedie
# "metric" used to appear twice in this literal; Python keeps only the last
# value for a repeated key, so the single entry below carries that value.
params = {
    "objective": "tweedie",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    "sub_feature": 0.8,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    'verbosity': 1,
    'num_iterations': 3000,
    'num_leaves': 128,
    "min_data_in_leaf": 104,
    # GPU training is disabled; uncomment to enable it.
    # 'device': 'gpu',
    # 'gpu_platform_id': 0,
    # 'gpu_device_id': 0
}


In [35]:
# Free the full frames before building the LightGBM datasets.
del X_train, y_train
del X_eval, X_val
gc.collect()

# Wrap the two splits for LightGBM, then drop the pandas copies as well.
train_set = lgb.Dataset(X, Y)
test_set = lgb.Dataset(x_test, y_test)
del X, Y, x_test, y_test; gc.collect()

0

In [36]:
# Persist the LightGBM datasets. The `with` blocks close each file even if
# `pickle.dump` raises.
with open(data_dir+'train_set.pkl', 'wb') as dbfile:
    pickle.dump(train_set, dbfile)

with open(data_dir+'test_set.pkl', 'wb') as dbfile:
    pickle.dump(test_set, dbfile)

In [None]:
# %%time

# # model = lgb.train(params, train_set, num_boost_round = 3000, early_stopping_rounds = 500, valid_sets = [train_set, test_set], verbose_eval = 100)
model = lgb.train(params, train_set, valid_sets = [test_set], verbose_eval = 100)

[100]	valid_0's rmse: 2.24776
[200]	valid_0's rmse: 2.21999
[300]	valid_0's rmse: 2.20049
[400]	valid_0's rmse: 2.18638
[500]	valid_0's rmse: 2.17653
[600]	valid_0's rmse: 2.17013
[700]	valid_0's rmse: 2.16301
[800]	valid_0's rmse: 2.15738
[900]	valid_0's rmse: 2.15197
[1000]	valid_0's rmse: 2.14704
[1100]	valid_0's rmse: 2.142
[1200]	valid_0's rmse: 2.13816
[1300]	valid_0's rmse: 2.13542
[1400]	valid_0's rmse: 2.13261
[1500]	valid_0's rmse: 2.12995
[1600]	valid_0's rmse: 2.1271
[1700]	valid_0's rmse: 2.12442
[1800]	valid_0's rmse: 2.12164
[1900]	valid_0's rmse: 2.11904
[2000]	valid_0's rmse: 2.11743
[2100]	valid_0's rmse: 2.1155
[2200]	valid_0's rmse: 2.11353
[2300]	valid_0's rmse: 2.11184
[2400]	valid_0's rmse: 2.1104
[2500]	valid_0's rmse: 2.10849
[2600]	valid_0's rmse: 2.10681
[2700]	valid_0's rmse: 2.10587
[2800]	valid_0's rmse: 2.10497
[2900]	valid_0's rmse: 2.10336
[3000]	valid_0's rmse: 2.10249


In [None]:
model.save_model(data_dir+f"modelv{VERSION}.lgb")

<lightgbm.basic.Booster at 0x7f359fac8eb8>

In [None]:
import pickle 
# Pickle the trained booster next to the data; the `with` block closes the
# file even if `pickle.dump` raises.
with open(data_dir+f'LightGBMv{VERSION}.pickle', 'wb') as dbfile:
    pickle.dump(model, dbfile)

In [None]:
# import pickle
# # with open("../input/pretrained-model-v2/LightGBM_FF.pickle", 'rb') as fin:
# #         model = pickle.load(fin)
# with open("../input/m5-pretrained-model/LightGBM_KAGLE.pickle", 'rb') as fin:
#         model = pickle.load(fin)

In [25]:
print("YEAH")

YEAH


# Prediction

In [40]:
import pickle 
# Reload the three parts from disk.
# NOTE(review): these files are read from `data_dir` itself, whereas the cell
# that pickles X_train / X_val / X_eval writes them under
# `data_dir + '../modelrecur/'` -- confirm which location is intended.
# pickle.load executes code embedded in the file; only load trusted files.
with open(data_dir+f'X_train.pkl', 'rb') as fin:
        X_train = pickle.load(fin)

with open(data_dir+f'X_val.pkl', 'rb') as fin:
        X_val = pickle.load(fin)

with open(data_dir+f'X_eval.pkl', 'rb') as fin:
        X_eval = pickle.load(fin)

In [92]:
with open(data_dir+f'../Submissions/LightGBMv11.pickle', 'rb') as fin:
        model = pickle.load(fin)

In [93]:
%%time
lastdate = X_train["date"].max() - pd.DateOffset(days=86)
X_train = X_train[X_train['date'] >= lastdate]

CPU times: user 738 ms, sys: 137 ms, total: 875 ms
Wall time: 976 ms


In [135]:
import time

import datetime
import dateutil.relativedelta

def predict(model, X_train, X_test, factor=1):
    """Recursively forecast every date in ``X_test``, one ``dept_id`` at a time.

    For each department the rows of ``X_train`` from the 40 days before the
    first test date are used as history. The test dates are then processed in
    chronological order: the rows of that date are appended to the history,
    ``numerical_feature`` recomputes the lag features, the model predicts the
    date, and the scaled prediction is written back as that date's ``demand``
    so that later dates can use it. The oldest history day is dropped after
    each step.

    Relies on the module-level ``train_cols`` (model input columns) and
    ``numerical_feature``.

    Args:
        model: Fitted model exposing ``predict`` and ``best_iteration``.
        X_train: Historical rows with at least ``id``, ``dept_id``, ``date``,
            ``demand`` and the feature columns.
        X_test: Rows to forecast, with the same columns. Rows are assumed to
            be ordered by ``id`` within each date, as the prediction rows are.
        factor: Multiplier applied to every prediction.

    Returns:
        DataFrame with an ``id`` column and one ``F<k>`` column per test date,
        departments stacked in sorted order.
    """
    # Sort explicitly: the recursion is only valid in chronological order.
    DATES = sorted(X_test["date"].unique())
    NDATE = len(DATES)
    print("NDATE", NDATE)

    col = ["id"] + ["F{}".format(i) for i in range(1, NDATE+1)]
    itemId = sorted(X_train["dept_id"].unique())
    print("#CHUNK", len(itemId))

    maxdate = X_test["date"].min()
    mindate = maxdate - pd.DateOffset(days=40)

    # Keep only the history window strictly between mindate and the first test date.
    XX_train = X_train[(X_train['date'] > mindate) & (X_train['date'] < maxdate)]

    acc_o = []
    for iid in itemId:
        test = X_test[X_test["dept_id"]==iid]

        ids = test["id"].unique()
        o = pd.DataFrame(np.zeros((len(ids), NDATE+1)), columns=col)
        o["id"] = test[test["date"]==DATES[0]]["id"].values

        train = XX_train[XX_train["dept_id"]==iid].copy()
        # Predictions are floats; make the column float up front instead of
        # relying on an implicit upcast when they are written back.
        train["demand"] = train["demand"].astype("float64")
        # Oldest date currently kept in the history window.
        lastmonth = pd.to_datetime(train.head(1)["date"])

        for idx, date in enumerate(DATES):
            newrow = test[test["date"]==date]
            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            train = pd.concat([train, newrow])
            train.sort_values(by=['id', "date"], inplace=True)
            # numerical_feature adds its columns to `train` in place.
            feat = numerical_feature(train)

            x = feat.loc[feat["date"] == date, train_cols]
            val_pred = model.predict(x, num_iteration=model.best_iteration)

            o[f"F{idx+1}"] = val_pred*factor

            # Feed the prediction back so it acts as history for later dates.
            train.loc[train["date"]==date, "demand"] = val_pred*factor

            # Slide the history window forward by one day.
            lastmonth = lastmonth + pd.DateOffset(days=1)
            train = train[train['date'] >= str(lastmonth.values[0])]

        acc_o.append(o)

    return pd.concat(acc_o)

In [136]:
# One set of forecasts per scaling factor, for both horizons.
ppval = []
ppeval = []
# weights = [1]
weights = [1, 1.028, 1.023, 1.018]
for w in weights:
    print("======== w",w,"==========")
    # Validation horizon: history is the training rows only.
    pval = predict(model, X_train, X_val, factor=w)
    # Evaluation horizon: the validation rows are appended to the history.
    peval = predict(model, pd.concat([X_train, X_val]), X_eval, factor=w)
    ppval.append(pval)
    ppeval.append(peval)

  # 30490

NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7
NDATE 28
#CHUNK 7


In [138]:
# ppval[0].shape

In [128]:
submission = pd.read_csv(data_dir+'sample_submission.csv')
submission.shape

(60980, 29)

In [139]:
sales_eval = pd.read_csv(data_dir+'sales_train_evaluation.csv')
# Sales columns for days d_1914..d_1941.
cols = ["id"] + [f"d_{i}" for i in range(1914, 1914+28)]
# Copy so that the renaming below acts on an independent frame rather than
# on a slice of `sales_eval`.
true_val = sales_eval[cols].copy()
# Rename the day columns to the submission layout F1..F28.
newcols = ["id"] + [f"F{i}" for i in range(1, 1+28)]
true_val.columns = newcols


In [142]:
def avg(arr):
    """Average the ``F1``..``F28`` columns of several prediction frames.

    Args:
        arr: Non-empty list of DataFrames that share the same index and each
            have an ``id`` column plus ``F1``..``F28``.

    Returns:
        DataFrame with the ``id`` column of ``arr[0]`` and, for every
        ``F<k>``, the element-wise mean over all frames in ``arr``.
    """
    avgpred = pd.DataFrame([])
    avgpred["id"] = arr[0]["id"]
    # Divide by the actual number of frames instead of a hard-coded 3.
    n_preds = len(arr)
    for i in range(1, 29):
        name = f"F{i}"
        total = 0
        for d in arr:
            total = total + d[name]
        avgpred[name] = total / n_preds

    return avgpred


def save_csv(name, d1, d2):
    """Stack two prediction frames and write them as one submission CSV.

    ``d1`` is written with "_validation" ids and ``d2`` with "_evaluation"
    ids. The combined frame is sorted by ``id`` and saved to
    ``data_dir + "../modelrecur/submission_<name>.v11.csv"``. The callers'
    frames are not modified.

    Args:
        name: Tag inserted into the output file name.
        d1: Frame for the validation half (``id`` plus forecast columns).
        d2: Frame for the evaluation half, with the same columns.
    """
    # assign() returns new frames, so the id rewrite does not leak into the
    # objects passed in by the caller.
    d1 = d1.assign(id=d1['id'].str.replace("_evaluation", "_validation", regex=False))
    d2 = d2.assign(id=d2['id'].str.replace("_validation", "_evaluation", regex=False))

    df = pd.concat([d1, d2])
    print(df.shape)
    df = df.sort_values("id").reset_index(drop=True)
    df.to_csv(data_dir+f"../modelrecur/submission_{name}.v11.csv", index=False)
    
# "pub": averaged forecasts for both halves; ppval[1:] / ppeval[1:] skip the
# first entry of each list (the run with the first weight).
save_csv("pub", avg(ppval[1:]), avg(ppeval[1:]))
# "prv": the validation half is filled from `true_val` instead of forecasts.
save_csv("prv", true_val, avg(ppeval[1:]))


(60980, 29)
(60980, 29)


In [131]:
submission.shape

(60980, 29)

In [134]:
true_val.shape

(30490, 29)